fixed problem with escaped unicode chars, from Spanish sample data
This commit is contained in:
2722
B-050719-1400H-MARIA_R-PART_1.MP4.json
Normal file
2722
B-050719-1400H-MARIA_R-PART_1.MP4.json
Normal file
File diff suppressed because it is too large
Load Diff
6749
B-050719-1400H-MARIA_R-PART_1.MP4.txt
Normal file
6749
B-050719-1400H-MARIA_R-PART_1.MP4.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -87,14 +87,14 @@ class TranscriptConverter:
|
|||||||
else:
|
else:
|
||||||
if word.upper() == 'I':
|
if word.upper() == 'I':
|
||||||
return True
|
return True
|
||||||
word_category = tagged_words[index][1]
|
word_category = tagged_words[index][1]
|
||||||
return word_category in helpers.PROPER_NOUN_TAGS
|
return word_category in helpers.PROPER_NOUN_TAGS
|
||||||
|
|
||||||
def get_word_object(
|
def get_word_object(
|
||||||
self,
|
self,
|
||||||
word_object,
|
word_object,
|
||||||
index,
|
index,
|
||||||
tagged_words,
|
tagged_words,
|
||||||
word_objects,
|
word_objects,
|
||||||
speaker_segments=None,
|
speaker_segments=None,
|
||||||
):
|
):
|
||||||
@@ -107,7 +107,7 @@ class TranscriptConverter:
|
|||||||
self.check_if_always_capitalized(word, index, tagged_words),
|
self.check_if_always_capitalized(word, index, tagged_words),
|
||||||
self.get_next_word(word_objects, index),
|
self.get_next_word(word_objects, index),
|
||||||
self.get_speaker_id(word_object, speaker_segments),
|
self.get_speaker_id(word_object, speaker_segments),
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_next_word(self, word_objects, index):
|
def get_next_word(self, word_objects, index):
|
||||||
if index < len(word_objects) - 1:
|
if index < len(word_objects) - 1:
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import codecs
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -38,6 +39,7 @@ class GoogleConverter(TranscriptConverter):
|
|||||||
punc_after = helpers.get_punc_after(word_obj.word) or False
|
punc_after = helpers.get_punc_after(word_obj.word) or False
|
||||||
|
|
||||||
the_word = word_obj.word
|
the_word = word_obj.word
|
||||||
|
|
||||||
if punc_before:
|
if punc_before:
|
||||||
the_word = the_word[len(punc_before):]
|
the_word = the_word[len(punc_before):]
|
||||||
if punc_after:
|
if punc_after:
|
||||||
@@ -128,6 +130,7 @@ def make_json_friendly(json_string):
|
|||||||
|
|
||||||
new_string += line
|
new_string += line
|
||||||
|
|
||||||
new_string = new_string.replace('\\', '')
|
if '\\' in new_string:
|
||||||
|
new_string = codecs.escape_decode(new_string)[0].decode('utf-8')
|
||||||
|
|
||||||
return new_string[:-2] + ']'
|
return new_string[:-2] + ']'
|
||||||
|
|||||||
@@ -24,19 +24,19 @@ def is_a_proper_noun(phrase):
|
|||||||
def get_punc_before(word):
|
def get_punc_before(word):
|
||||||
punc = []
|
punc = []
|
||||||
for char in word:
|
for char in word:
|
||||||
if char.isalpha():
|
|
||||||
return punc
|
|
||||||
if char in PUNCTUATION:
|
if char in PUNCTUATION:
|
||||||
punc.append(char)
|
punc.append(char)
|
||||||
|
else:
|
||||||
|
return punc
|
||||||
|
|
||||||
|
|
||||||
def get_punc_after(word):
|
def get_punc_after(word):
|
||||||
punc = []
|
punc = []
|
||||||
for char in reversed(word):
|
for char in reversed(word):
|
||||||
if char.isalpha():
|
|
||||||
return punc
|
|
||||||
if char in PUNCTUATION:
|
if char in PUNCTUATION:
|
||||||
punc.insert(0, char)
|
punc.insert(0, char)
|
||||||
|
else:
|
||||||
|
return punc
|
||||||
|
|
||||||
|
|
||||||
def is_path(string):
|
def is_path(string):
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ output_choices = [k for k, v in
|
|||||||
if callable(v)]
|
if callable(v)]
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('-p', '--print-output', is_flag=True, default=True,
|
@click.option('-p', '--print-output', is_flag=True, default=False,
|
||||||
help='pretty print the transcript, breaks pipeability')
|
help='pretty print the transcript, breaks pipeability')
|
||||||
@click.option('--language-code', default='en-US',
|
@click.option('--language-code', default='en-US',
|
||||||
help='specify language, defaults to en-US.')
|
help='specify language, defaults to en-US.')
|
||||||
|
|||||||
BIN
transcription_jobs.p
Normal file
BIN
transcription_jobs.p
Normal file
Binary file not shown.
Reference in New Issue
Block a user