updated tests to current format of converters, updated static/not-static decorators on base class to match its children, added GoogleConverter, moved one or two methods to the base class because they all work the same.

This commit is contained in:
2019-03-07 02:30:48 -05:00
parent 6333f55c87
commit d30ecad583
16 changed files with 346 additions and 46 deletions

View File

@@ -0,0 +1,11 @@
from .amazon import AmazonConverter
from .speechmatics import SpeechmaticsConverter
from .gentle import GentleConverter
from .google import GoogleConverter
# Registry of the available converters, keyed by the service name used
# to select one.
services = {
    'amazon': AmazonConverter,
    'gentle': GentleConverter,
    'speechmatics': SpeechmaticsConverter,
    'google': GoogleConverter,
}

View File

@@ -0,0 +1,81 @@
import json
from ..converter import TranscriptConverter
from .. import helpers
class AmazonConverter(TranscriptConverter):
    """Converter for transcripts in the 'amazon' JSON layout.

    Items are read from ``json_data['results']['items']``; each item has a
    ``type`` (punctuation items are folded into the neighbouring word) and
    a list of ``alternatives`` carrying ``content`` and ``confidence``.
    """

    name = 'amazon'

    def __init__(self, json_data):
        super().__init__(json_data)

    def get_word_objects(self, json_data):
        """Return the raw list of items (words and punctuation)."""
        return json_data['results']['items']

    @staticmethod
    def get_word_start(word_object):
        """Return the item's start time as a float."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the item's end time as a float."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Return the confidence of the item's first alternative."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object) -> str:
        """Return the text of the item's first alternative."""
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        Punctuation items are not emitted on their own; a following '.'
        or ',' is recorded as ``punc_after`` on the preceding word.

        :param word_objects: raw items as returned by get_word_objects.
        :param words: unused here; kept for interface compatibility.
        :param tagged_words: passed through to get_word_object and
            check_if_always_capitalized.
        :return: list of dicts with keys start, end, confidence, word,
            always_capitalized, punc_after and punc_before.
        """
        converted_words = []

        punc_before = False
        punc_after = False

        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
                continue

            # NOTE(review): this is reset on every iteration, so the value
            # assigned at the bottom of the "you know" branch never reaches
            # the `elif` below. Presumably it was meant to carry a comma
            # over to the following word -- confirm before hoisting it out
            # of the loop, since that would change the output.
            next_word_punc_after = None
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ['.', ',']:
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None

                # The "you know" check lives inside the next-word guard so
                # it never reads an unbound or stale `next_word` when the
                # current item is the last one.
                if word_obj.word.lower() == 'you' and next_word == 'know':
                    # Only punctuate the preceding word when there is one:
                    # at i == 0 the index i - 1 would wrap around to the
                    # last item and converted_words would still be empty.
                    if i > 0 and word_objects[i - 1]['type'] != 'punctuation':
                        converted_words[-1]['punc_after'] = ','
                    if next_word_type != 'punctuation':
                        next_word_punc_after = ','

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            # punc_after applies to a single word only.
            punc_after = False

        return converted_words

View File

@@ -0,0 +1,55 @@
from ..converter import TranscriptConverter
class GentleConverter(TranscriptConverter):
    """Converter for transcripts in the 'gentle' JSON layout.

    Words are read from ``json_data['words']``; each word object is
    expected to provide ``start``, ``end`` and ``alignedWord``.
    """

    name = 'gentle'

    def __init__(self, json_data):
        super().__init__(json_data)

    def get_word_objects(self, json_data):
        """Return the raw list of word objects."""
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        """Return the word's start time as stored in the payload."""
        return word_object['start']

    @staticmethod
    def get_word_end(word_object):
        """Return the word's end time as stored in the payload."""
        return word_object['end']

    @staticmethod
    def get_word_confidence(word_object):
        """Return a constant confidence of 1; none is read from the payload."""
        return 1

    @staticmethod
    def get_word_word(word_object):
        """Return the aligned form of the word."""
        return word_object['alignedWord']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        No punctuation is derived for this format, so ``punc_after`` and
        ``punc_before`` are always False.

        :param word_objects: raw word objects as returned by
            get_word_objects.
        :param words: unused here; kept for interface compatibility.
        :param tagged_words: passed through to get_word_object and
            check_if_always_capitalized.
        :return: list of dicts with keys start, end, confidence, word,
            always_capitalized, punc_after and punc_before.
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': False,
                'punc_before': False,
            })

        return converted_words

View File

@@ -0,0 +1,145 @@
import json
import re
from ..converter import TranscriptConverter
from .. import helpers
class GoogleConverter(TranscriptConverter):
    """Converter for transcripts in the 'google' text layout.

    The raw transcript text is first rewritten into JSON by
    make_json_friendly and parsed; the resulting list of word objects is
    stored on ``self.json_data``.
    """

    # Matches the key used for this converter in the services registry and
    # the `name` attribute carried by the sibling converters.
    name = 'google'

    def __init__(self, transcript_data: str):
        super().__init__(transcript_data)
        self.json_data = self.pre_process(transcript_data)

    def pre_process(self, transcript_data):
        """Turn the raw transcript text into a parsed list of word objects."""
        friendly = make_json_friendly(transcript_data)
        return json.loads(friendly)

    def get_word_objects(self, json_data):
        """Return json_data unchanged; it already is the word-object list."""
        return json_data

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        Leading and trailing punctuation reported by the helpers module is
        split off the word text and stored as ``punc_before`` /
        ``punc_after`` (False when there is none).

        :param word_objects: raw word objects as returned by
            get_word_objects.
        :param words: unused here; kept for interface compatibility.
        :param tagged_words: passed through to get_word_object and
            check_if_always_capitalized.
        :return: list of dicts with keys start, end, confidence, word,
            always_capitalized, punc_after and punc_before.
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            punc_before = helpers.get_punc_before(word_obj.word) or False
            punc_after = helpers.get_punc_after(word_obj.word) or False

            # Strip the detected punctuation off either end of the word.
            the_word = word_obj.word
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
                the_word = the_word[:-len(punc_after)]

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': the_word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words

    @classmethod
    def get_word_start(cls, word_object):
        """Return the word's start time in seconds."""
        return cls.get_seconds(word_object['start_time'])

    @classmethod
    def get_word_end(cls, word_object):
        """Return the word's end time in seconds."""
        return cls.get_seconds(word_object['end_time'])

    @staticmethod
    def get_seconds(time: dict) -> float:
        """Combine the optional 'seconds' and 'nanos' entries into seconds.

        Either key may be absent; a missing key contributes nothing, so an
        empty dict yields 0.
        """
        seconds = 0
        if 'seconds' in time:
            seconds = time['seconds']
        if 'nanos' in time:
            seconds += time['nanos'] / 1_000_000_000
        return seconds

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word's confidence as stored in the payload."""
        return word_object['confidence']

    @staticmethod
    def get_word_word(word_object):
        """Return the word's text."""
        return word_object['word']
def make_json_friendly(json_string):
    """Rewrite a 'words { ... }' transcript dump as a JSON array string.

    The input is split on the literal two-character sequence backslash-n
    (not on real newlines). Each word object is expected to list its
    fields in this order: ``words {``, ``start_time {``, ``}``,
    ``end_time {``, ``}``, ``word: ``, ``confidence: ``. Lines outside a
    word object are dropped.

    :param json_string: the raw transcript text.
    :return: a string holding a JSON array with one object per word, or
        ``'[]'`` when the input contains no word objects.
    """
    fields = [
        'words {',
        'start_time {',
        '}',
        'end_time {',
        '}',
        'word: ',
        'confidence: ',
    ]
    last_field_index = len(fields) - 1

    current_field_index = 0
    pieces = []

    for raw_line in json_string.split('\\n'):
        line = raw_line.strip()
        current_field = fields[current_field_index]

        if current_field in line:
            if current_field_index == last_field_index:
                # confidence is the final field; start over for the next
                # word object.
                current_field_index = 0
            else:
                current_field_index += 1
                if current_field_index == 1:
                    pieces.append('}, {')
                    # "words" was found, don't want to append that
                    continue
        elif current_field_index == 0:
            # haven't found the beginning of the next word object
            continue

        # add quotes around keys
        line = re.sub(r'^(?!")([0-9a-zA-Z_]+)', r'"\1"', line)

        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')

        # use first two decimals of confidence
        if 'confidence' in current_field:
            line = (', ' + line)[:20]

        # While waiting for a closing brace, every emitted line (the
        # nested values and the brace itself) gets a trailing separator;
        # the surplus ', }' is removed in the cleanup below.
        if current_field == '}':
            line += ', '

        pieces.append(line)

    new_string = ''.join(pieces)
    if not new_string:
        # No word objects at all: emit an empty array rather than the
        # invalid '[}]' the cleanup below would otherwise produce.
        return '[]'

    # cleanup
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    new_string = new_string.replace(', }', '}').replace('\\', '')

    return new_string

View File

@@ -0,0 +1,133 @@
from collections import namedtuple
import json
from ..converter import TranscriptConverter
from .. import helpers
class SpeechmaticsConverter(TranscriptConverter):
    """Converter for transcripts in the 'speechmatics' JSON layout.

    Words are read from ``json_data['words']``; each word object provides
    ``time``, ``duration``, ``confidence`` and ``name``.
    """

    name = 'speechmatics'

    def __init__(self, path):
        super().__init__(path)

    def get_word_objects(self, json_data):
        """Return the raw list of word objects."""
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        """Return the word's start time as a float."""
        return float(word_object['time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the word's end time: its start time plus its duration."""
        return (SpeechmaticsConverter.get_word_start(word_object)
                + float(word_object['duration']))

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word's confidence as a float."""
        return float(word_object['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the word's text."""
        return word_object['name']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        '.' entries are not emitted on their own; a following '.' is
        recorded as ``punc_after`` on the preceding word. ``punc_before``
        is always False.

        :param word_objects: raw word objects as returned by
            get_word_objects.
        :param words: unused here; kept for interface compatibility.
        :param tagged_words: passed through to get_word_object and
            check_if_always_capitalized.
        :return: list of dicts with keys start, end, confidence, word,
            always_capitalized, punc_after and punc_before.
        """
        converted_words = []
        punc_after = False

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            if word_obj.word == '.':
                continue

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                if next_word == '.':
                    punc_after = '.'

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': False,
            })

            # punc_after applies to a single word only.
            punc_after = False

        return converted_words
def speechmatics_aligned_text_converter(data):
    """Convert an aligned-text transcript into a list of word dicts.

    Only the first line of ``data`` is parsed. The line is scanned for
    ``time=<number>>`` markers: each word is the text between the ``>``
    closing one marker and the next ``<``, and the marker that follows
    supplies the word's end time.

    :param data: an object with ``readlines()`` (e.g. an open text file).
    :return: list of dicts with keys start, end, confidence, word,
        always_capitalized, index, punc_before and punc_after; an empty
        list when ``data`` has no lines.
    """
    lines = data.readlines()
    if not lines:
        return []
    transcript_line = lines[0]

    class Exhausted(Exception):
        """Raised when no further ``time=`` marker can be found."""

    Word = namedtuple('Word', 'start end word')

    def get_time(transcript, index):
        """Return (time, index of the closing '>') for the next marker."""
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        # 5 == len('time=')
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        """Return (Word, index of the '>' closing the word's end marker)."""
        start, end_of_start_index = get_time(transcript, start_index)
        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        word = transcript[word_start_index: word_end_index]
        end, close_index = get_time(transcript, word_end_index)
        return Word(start, end, word), close_index

    words = []
    next_index = 0
    while True:
        try:
            word, next_index = find_next_word(transcript_line, next_index)
        except Exhausted:
            break
        else:
            words.append(word)

    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []

    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)

        the_word = word.word
        if punc_before or punc_after:
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')

        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # This is a plain function, so there is no `self` to call
            # check_if_always_capitalized on; use the proper-noun tag
            # computed above instead.
            # NOTE(review): the class-based converters go through
            # check_if_always_capitalized, which is defined elsewhere --
            # confirm whether it applies rules beyond the proper-noun tag.
            'always_capitalized': is_proper_noun,
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words