Made the Google converter work again after a change in the formatting of saved transcripts. Added speaker_id support to the base class and the Google converter.

This commit is contained in:
2019-03-07 22:55:37 -05:00
parent ed2ece3dfb
commit 3fc6dacfde
9 changed files with 75 additions and 62 deletions

View File

@@ -53,3 +53,4 @@ formats.
- Word (`.doc`, `.docx`) - Word (`.doc`, `.docx`)
- text files - text files
- SRT (subtitles) - SRT (subtitles)
- Draft.js JSON

View File

@@ -111,3 +111,4 @@ formats.
- Word (`.doc`, `.docx`) - Word (`.doc`, `.docx`)
- text files - text files
- SRT (subtitles) - SRT (subtitles)
- Draft.js JSON

View File

@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
setup( setup(
name="tpro", name="tpro",
version="0.11", version="0.12",
url='https://github.com/zevaverbach/tpro', url='https://github.com/zevaverbach/tpro',
install_requires=[ install_requires=[
'Click', 'Click',

View File

@@ -79,13 +79,14 @@ def test_google():
g = GoogleConverter(transcript_data) g = GoogleConverter(transcript_data)
g.convert() g.convert()
print(g.converted_words[0])
assert g.converted_words[0] == { assert g.converted_words[0] == {
'start': 4, 'start': 0.4,
'end': 5.5, 'end': 2.1,
'confidence': 0.88, 'confidence': 0.9128385782241821,
'word': 'Testing', 'word': 'Okay',
'always_capitalized': False, 'always_capitalized': False,
'punc_after': [','], 'punc_after': [','],
'punc_before': False, 'punc_before': False,
} 'speaker_id': None
}

View File

@@ -27,6 +27,7 @@ def test_pre_process(transcript):
g = GoogleConverter(transcript_data) g = GoogleConverter(transcript_data)
assert g.json_data assert g.json_data
print(g.json_data)

View File

@@ -7,7 +7,9 @@ from . import converters
Word = namedtuple('Word', 'start end confidence word always_capitalized next_word') Word = namedtuple(
'Word',
'start end confidence word always_capitalized next_word speaker_id')
class TranscriptConverter: class TranscriptConverter:
@@ -58,6 +60,11 @@ class TranscriptConverter:
def get_word_confidence(word_object): def get_word_confidence(word_object):
pass pass
@staticmethod
@abc.abstractmethod
def get_speaker_id(word_object):
pass
@staticmethod @staticmethod
@abc.abstractmethod @abc.abstractmethod
def get_word_word(word_object): def get_word_word(word_object):
@@ -78,7 +85,8 @@ class TranscriptConverter:
self.get_word_confidence(word_object), self.get_word_confidence(word_object),
word, word,
self.check_if_always_capitalized(word, index, tagged_words), self.check_if_always_capitalized(word, index, tagged_words),
self.get_next_word(word_objects, index) self.get_next_word(word_objects, index),
self.get_speaker_id(word_object),
) )
def get_next_word(self, word_objects, index): def get_next_word(self, word_objects, index):

View File

@@ -35,6 +35,10 @@ class AmazonConverter(TranscriptConverter):
word_word = 'I' word_word = 'I'
return word_word return word_word
@staticmethod
def get_speaker_id(word_object):
return None
def convert_words(self, word_objects, words, tagged_words=None): def convert_words(self, word_objects, words, tagged_words=None):
converted_words = [] converted_words = []
@@ -74,6 +78,7 @@ class AmazonConverter(TranscriptConverter):
tagged_words), tagged_words),
'punc_after': punc_after, 'punc_after': punc_after,
'punc_before': punc_before, 'punc_before': punc_before,
'speaker_id': word_obj.speaker_id,
}) })
punc_after = False punc_after = False

View File

@@ -14,7 +14,12 @@ class GoogleConverter(TranscriptConverter):
def pre_process(self, transcript_data): def pre_process(self, transcript_data):
friendly = make_json_friendly(transcript_data) friendly = make_json_friendly(transcript_data)
return json.loads(friendly) json_data = json.loads(friendly)
last_datum = json_data[-1]
if last_datum.get('speaker_tag'):
"""Get rid of duplicate content that doesn't have speaker_tags"""
json_data = [jd for jd in json_data if jd.get('speaker_tag')]
return json_data
def get_word_objects(self, json_data): def get_word_objects(self, json_data):
return json_data return json_data
@@ -47,6 +52,7 @@ class GoogleConverter(TranscriptConverter):
tagged_words), tagged_words),
'punc_after': punc_after, 'punc_after': punc_after,
'punc_before': punc_before, 'punc_before': punc_before,
'speaker_id': word_obj.speaker_id,
}) })
return converted_words return converted_words
@@ -74,13 +80,13 @@ class GoogleConverter(TranscriptConverter):
@staticmethod @staticmethod
def get_word_word(word_object): def get_word_word(word_object):
print(word_object)
return word_object['word'] return word_object['word']
def make_json_friendly(json_string): def make_json_friendly(json_string):
lines = [line.strip() for line in json_string.split('\\n')] lines = [line.strip() for line in json_string.split('\n')]
new_string = '['
fields = [ fields = [
'words {', 'words {',
@@ -92,54 +98,41 @@ def make_json_friendly(json_string):
'confidence: ' 'confidence: '
] ]
current_field_index = 0 start_field = 'words {'
new_string = ''
for line in lines: open_braces = 0
current_field = fields[current_field_index] for index, line in enumerate(lines):
if open_braces == 0:
if start_field in line:
open_braces = 1
new_string += '{'
continue
if current_field in line: if '{' in line:
if current_field_index == len(fields) - 1: open_braces += 1
current_field_index = 0 if '}' in line:
else: open_braces -= 1
current_field_index += 1
if current_field_index == 1:
new_string += '}, {'
# "words" was found, don't want to append that
continue
else: if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]:
if current_field_index == 0:
# haven't found the beginning of the next word object
continue
# add quotes around keys
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
'"\\1"',
line)
# add colons after keys
if line.endswith('{'):
line = line.replace('" ', '": ')
# use first two decimals of confidence
if 'confidence' in current_field:
line = ', ' + line
line = line[:20]
if current_field == '}':
line = line + ', ' line = line + ', '
if open_braces == 0:
new_string += '}, '
continue
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
'"\\1"',
line)
if 'start_time' in line:
line = line.replace('"start_time"', '"start_time":')
if 'end_time' in line:
line = line.replace('"end_time"', '"end_time":')
new_string += line new_string += line
# cleanup new_string = new_string.replace('\\', '')
if new_string.startswith('}, '):
new_string = new_string[3:]
if not new_string.startswith('['):
new_string = '[' + new_string
if not new_string.endswith('}]'):
new_string = new_string + '}]'
new_string = new_string.replace(', }', '}').replace('\\', '')
return new_string
return new_string[:-2] + ']'

View File

@@ -4,11 +4,14 @@ def universal_transcript(self, pretty=False):
return json.dumps(self.converted_words, indent=4 if pretty else None) return json.dumps(self.converted_words, indent=4 if pretty else None)
def viral_overlay(self, pretty=False): def viral_overlay(self, pretty=False):
return json.dumps([{ return json.dumps([
'start': word['start'], {'start': word['start'],
'stop': word['end'], 'stop': word['end'],
'text': word['word'].title() if word['always_capitalized'] else word['word']} 'text': word['word'].title()
if word['always_capitalized'] else word['word']
}
for word in self.converted_words], indent=4 if pretty else None for word in self.converted_words]
, indent=4 if pretty else None
) )