Updated tests to the current converter format, aligned the static/non-static decorators on the base class with its subclasses, added GoogleConverter, and moved one or two methods into the base class because every converter implemented them identically.

This commit is contained in:
2019-03-07 02:30:48 -05:00
parent 6333f55c87
commit d30ecad583
16 changed files with 346 additions and 46 deletions

View File

@@ -0,0 +1,133 @@
from collections import namedtuple
import json
from ..converter import TranscriptConverter
from .. import helpers
class SpeechmaticsConverter(TranscriptConverter):
    """Converter for Speechmatics JSON transcripts.

    The ``get_word_*`` accessors read individual fields out of one raw
    Speechmatics word object (a mapping with ``'time'``, ``'duration'``,
    ``'confidence'`` and ``'name'`` keys); ``convert_words`` turns a list of
    such objects into a list of plain dicts.
    """

    name = 'speechmatics'

    def __init__(self, path):
        super().__init__(path)

    def get_word_objects(self, json_data):
        """Return the value stored under ``'words'`` in *json_data*."""
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        """Return the word object's ``'time'`` field as a float."""
        return float(word_object['time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the word's start plus its ``'duration'`` field, as a float."""
        return (SpeechmaticsConverter.get_word_start(word_object)
                + float(word_object['duration']))

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word object's ``'confidence'`` field as a float."""
        return float(word_object['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the word object's ``'name'`` field (the token text)."""
        return word_object['name']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts from *word_objects*.

        A token whose text is ``'.'`` is not emitted as its own entry;
        instead the entry preceding it gets ``'punc_after': '.'``.

        Args:
            word_objects: iterable of raw Speechmatics word objects.
            words: not used by this implementation; kept so the signature
                stays compatible with existing callers.
            tagged_words: passed through unchanged to ``get_word_object``
                and ``check_if_always_capitalized``.

        Returns:
            A list of dicts with the keys ``start``, ``end``, ``confidence``,
            ``word``, ``always_capitalized``, ``punc_after`` and
            ``punc_before``.
        """
        # NOTE(review): get_word_object and check_if_always_capitalized are
        # not defined in this class; presumably inherited from
        # TranscriptConverter -- confirm.
        converted_words = []
        # This converter never sets leading punctuation.
        punc_before = False

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            # Full stops arrive as separate tokens; they are folded into the
            # previous word's 'punc_after' rather than emitted on their own.
            if word_obj.word == '.':
                continue

            punc_after = False
            if word_obj.next_word:
                if self.get_word_word(word_obj.next_word) == '.':
                    punc_after = '.'

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words
def speechmatics_aligned_text_converter(data):
    """Convert a Speechmatics aligned-text transcript into word dicts.

    Only the first line of *data* is parsed. The line is scanned for
    ``time=<number>>`` markers; each word is the text between the ``>``
    closing one marker and the next ``<``, with the preceding marker giving
    its start and the following marker its end.

    Args:
        data: an object with a ``readlines()`` method (e.g. an open text
            file).

    Returns:
        A list of dicts with the keys ``start``, ``end``, ``confidence``
        (always ``1``), ``word``, ``always_capitalized``, ``index``,
        ``punc_before`` and ``punc_after``. An empty list is returned when
        *data* has no lines or the first line contains no complete word.
    """
    lines = data.readlines()
    if not lines:
        # Empty input: nothing to parse (previously an IndexError).
        return []
    transcript = lines[0]

    class Exhausted(Exception):
        """Raised when no further ``time=`` marker can be found."""

    Word = namedtuple('Word', 'start end word')

    def get_time(text, index):
        """Return (time value, index of the closing '>') for the next marker."""
        time_index = text.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = text.find('>', time_index)
        # 5 == len('time='): skip the marker name to reach the number.
        return float(text[time_index + 5: close_index]), close_index

    def find_next_word(text, start_index):
        """Return the next Word and the index at which to resume scanning."""
        start, end_of_start_index = get_time(text, start_index)
        word_start_index = end_of_start_index + 1
        word_end_index = text.find('<', word_start_index)
        word = text[word_start_index: word_end_index]
        end, close_index = get_time(text, word_end_index)
        return Word(start, end, word), close_index

    words = []
    next_index = 0
    while True:
        try:
            word, next_index = find_next_word(transcript, next_index)
        except Exhausted:
            break
        words.append(word)

    if not words:
        # Nothing was parsed, so there is nothing to tag or convert.
        return []

    tagged_words = helpers.tag_words([w.word for w in words])

    converted_words = []
    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)

        the_word = word.word
        if punc_before or punc_after:
            # Strip every known punctuation mark from the emitted token.
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')

        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # This is a module-level function, so there is no ``self`` to
            # call check_if_always_capitalized on (doing so raised
            # NameError); use the proper-noun flag computed above instead.
            'always_capitalized': is_proper_noun,
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words