Updated tests to the current converter format; updated static/non-static decorators on the base class to match its children; added GoogleConverter; moved one or two methods to the base class because they all work the same way.
This commit is contained in:
133
transcript_processing/converters/speechmatics.py
Normal file
133
transcript_processing/converters/speechmatics.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from collections import namedtuple
|
||||
import json
|
||||
|
||||
from ..converter import TranscriptConverter
|
||||
from .. import helpers
|
||||
|
||||
|
||||
|
||||
class SpeechmaticsConverter(TranscriptConverter):
    """Converter for Speechmatics JSON transcripts.

    The accessors below read the ``'words'`` list from the parsed JSON and,
    for each word object, its ``'time'``, ``'duration'``, ``'confidence'``
    and ``'name'`` entries.
    """

    name = 'speechmatics'

    def __init__(self, path):
        """Initialise the converter by delegating to the base class.

        Args:
            path: passed through unchanged to ``TranscriptConverter``.
        """
        super().__init__(path)

    def get_word_objects(self, json_data):
        """Return the value stored under ``json_data['words']``."""
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        """Return ``word_object['time']`` converted to ``float``."""
        return float(word_object['time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the word's start plus ``word_object['duration']``."""
        return (SpeechmaticsConverter.get_word_start(word_object)
                + float(word_object['duration']))

    @staticmethod
    def get_word_confidence(word_object):
        """Return ``word_object['confidence']`` converted to ``float``."""
        return float(word_object['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the word text, stored under ``word_object['name']``."""
        return word_object['name']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts from *word_objects*.

        A ``'.'`` entry is not emitted as a word of its own; instead the
        word that precedes it gets ``'punc_after': '.'``.

        Args:
            word_objects: iterable of raw word objects, each handed to
                ``self.get_word_object`` (not defined in this class, so
                expected to come from the base class).
            words: accepted for interface compatibility; not used here.
            tagged_words: forwarded to ``self.get_word_object`` and
                ``self.check_if_always_capitalized``.

        Returns:
            A list of dicts with the keys ``start``, ``end``,
            ``confidence``, ``word``, ``always_capitalized``,
            ``punc_after`` and ``punc_before`` (the latter always
            ``False``).
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            # Full stops are folded into the preceding word below.
            if word_obj.word == '.':
                continue

            punc_after = False
            if word_obj.next_word:
                if self.get_word_word(word_obj.next_word) == '.':
                    punc_after = '.'

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': False,
            })

        return converted_words
|
||||
|
||||
|
||||
def speechmatics_aligned_text_converter(data):
    """Convert a Speechmatics aligned-text transcript into word dicts.

    Only the first line of *data* is parsed. The line is scanned for
    ``time=`` markers: the number between ``time=`` and the next ``>`` is a
    timestamp, the text between that ``>`` and the next ``<`` is the word,
    and the following ``time=`` marker supplies the word's end.

    Args:
        data: file-like object exposing ``readlines()``.

    Returns:
        A list of dicts with the keys ``start``, ``end``, ``confidence``
        (always ``1``), ``word`` (punctuation removed when any was
        detected), ``always_capitalized``, ``index``, ``punc_before`` and
        ``punc_after``. An empty file yields an empty list.
    """
    lines = data.readlines()
    # An empty file has no first line; treat it as an empty transcript
    # rather than failing with IndexError.
    transcript = lines[0] if lines else ''

    class Exhausted(Exception):
        """Raised when no further ``time=`` marker can be found."""

    Word = namedtuple('Word', 'start end word')

    def get_time(text, index):
        """Return (timestamp, index of closing '>') for the next marker."""
        time_index = text.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = text.find('>', time_index)
        # 5 == len('time='): skip the marker name to reach the number.
        return float(text[time_index + 5: close_index]), close_index

    def find_next_word(text, start_index):
        """Return (Word, index to resume from) for the next word."""
        start, end_of_start_index = get_time(text, start_index)

        word_start_index = end_of_start_index + 1
        word_end_index = text.find('<', word_start_index)
        word_text = text[word_start_index: word_end_index]

        end, close_index = get_time(text, word_end_index)

        return Word(start, end, word_text), close_index

    words = []
    next_index = 0

    while True:
        try:
            word, next_index = find_next_word(transcript, next_index)
        except Exhausted:
            break
        words.append(word)

    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []

    for i, word in enumerate(words):
        # This is a module-level function, so there is no converter
        # instance to ask; the proper-noun tag check is used directly.
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)

        the_word = word.word
        if punc_before or punc_after:
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')

        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            'always_capitalized': is_proper_noun,
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words
|
||||
Reference in New Issue
Block a user