Fixed problem with escaped Unicode chars, from Spanish sample data

This commit is contained in:
2019-07-18 17:12:48 +02:00
parent 5693f91a20
commit 640710541e
7 changed files with 9486 additions and 12 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -87,14 +87,14 @@ class TranscriptConverter:
else:
if word.upper() == 'I':
return True
word_category = tagged_words[index][1]
word_category = tagged_words[index][1]
return word_category in helpers.PROPER_NOUN_TAGS
def get_word_object(
self,
word_object,
index,
tagged_words,
self,
word_object,
index,
tagged_words,
word_objects,
speaker_segments=None,
):
@@ -107,7 +107,7 @@ class TranscriptConverter:
self.check_if_always_capitalized(word, index, tagged_words),
self.get_next_word(word_objects, index),
self.get_speaker_id(word_object, speaker_segments),
)
)
def get_next_word(self, word_objects, index):
if index < len(word_objects) - 1:

View File

@@ -1,3 +1,4 @@
import codecs
import json
import re
@@ -38,6 +39,7 @@ class GoogleConverter(TranscriptConverter):
punc_after = helpers.get_punc_after(word_obj.word) or False
the_word = word_obj.word
if punc_before:
the_word = the_word[len(punc_before):]
if punc_after:
@@ -128,6 +130,7 @@ def make_json_friendly(json_string):
new_string += line
new_string = new_string.replace('\\', '')
if '\\' in new_string:
new_string = codecs.escape_decode(new_string)[0].decode('utf-8')
return new_string[:-2] + ']'

View File

@@ -24,19 +24,19 @@ def is_a_proper_noun(phrase):
# Collect the punctuation characters found at the start of `word`, scanning
# left to right, and return them as a list once a letter is reached.
# NOTE(review): indentation is not preserved in this view, so it is unclear
# whether the trailing `else` pairs with `if char in PUNCTUATION` or with the
# `for` loop -- confirm against the real file before relying on the result
# for empty or all-punctuation input.
def get_punc_before(word):
punc = []
for char in word:
# The first alphabetic character ends the leading run.
if char.isalpha():
return punc
# PUNCTUATION is defined outside this view.
if char in PUNCTUATION:
punc.append(char)
else:
return punc
# Collect the punctuation characters found at the end of `word`, scanning
# right to left, and return them as a list once a letter is reached.
# NOTE(review): indentation is not preserved in this view, so it is unclear
# whether the trailing `else` pairs with `if char in PUNCTUATION` or with the
# `for` loop -- confirm against the real file before relying on the result
# for empty or all-punctuation input.
def get_punc_after(word):
punc = []
for char in reversed(word):
# The first alphabetic character (from the right) ends the trailing run.
if char.isalpha():
return punc
# PUNCTUATION is defined outside this view.
if char in PUNCTUATION:
# Insert at the front so the collected characters keep the same
# left-to-right order they have in `word`.
punc.insert(0, char)
else:
return punc
def is_path(string):

View File

@@ -11,7 +11,7 @@ output_choices = [k for k, v in
if callable(v)]
@click.command()
@click.option('-p', '--print-output', is_flag=True, default=True,
@click.option('-p', '--print-output', is_flag=True, default=False,
help='pretty print the transcript, breaks pipeability')
@click.option('--language-code', default='en-US',
help='specify language, defaults to en-US.')

BIN
transcription_jobs.p Normal file

Binary file not shown.