updated tests to current format of converters, updated static/not-static decorators on base class to match its children, added GoogleConverter, moved one or two methods to the base class because they all work the same.
This commit is contained in:
11
transcript_processing/converters/__init__.py
Normal file
11
transcript_processing/converters/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from .amazon import AmazonConverter
|
||||
from .speechmatics import SpeechmaticsConverter
|
||||
from .gentle import GentleConverter
|
||||
from .google import GoogleConverter
|
||||
|
||||
# Registry mapping a service's string identifier to its converter class.
# Keys match each converter's ``name`` class attribute.
services = {
    'amazon': AmazonConverter,
    'gentle': GentleConverter,
    'speechmatics': SpeechmaticsConverter,
    'google': GoogleConverter,
}
|
||||
81
transcript_processing/converters/amazon.py
Normal file
81
transcript_processing/converters/amazon.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import json
|
||||
|
||||
from ..converter import TranscriptConverter
|
||||
from .. import helpers
|
||||
|
||||
|
||||
|
||||
class AmazonConverter(TranscriptConverter):
    """Convert Amazon Transcribe JSON output into the common word format."""

    name = 'amazon'

    def __init__(self, json_data):
        super().__init__(json_data)

    def get_word_objects(self, json_data):
        # Amazon nests both word and punctuation items under results.items.
        return json_data['results']['items']

    @staticmethod
    def get_word_start(word_object):
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object) -> str:
        """Return the word text, fixing Amazon's lowercase bare 'i'."""
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts, skipping punctuation
        items and attaching '.'/',' from following items as punc_after.

        Also applies a heuristic that surrounds the filler phrase
        "you know" with commas.
        """
        converted_words = []

        punc_before = False
        punc_after = False
        # Comma queued by the "you know" heuristic for the NEXT real word.
        # BUG FIX: this must be initialized OUTSIDE the loop -- the
        # original reset it to None at the top of every iteration, which
        # made the `elif next_word_punc_after:` branch unreachable and
        # silently dropped the queued comma.
        next_word_punc_after = None

        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ['.', ',']:
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None

                # Heuristic: put a comma before and after "you know".
                if word_obj.word.lower() == 'you' and next_word == 'know':
                    # Guard i > 0 / non-empty output: the original indexed
                    # word_objects[i - 1] and converted_words[-1]
                    # unconditionally, which wraps around (or raises) when
                    # the transcript starts with "you know".
                    if i > 0 and converted_words:
                        prev_word = word_objects[i - 1]
                        if prev_word['type'] != 'punctuation':
                            converted_words[-1]['punc_after'] = ','
                    if next_word_type != 'punctuation':
                        next_word_punc_after = ','

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            punc_after = False

        return converted_words
|
||||
55
transcript_processing/converters/gentle.py
Normal file
55
transcript_processing/converters/gentle.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from ..converter import TranscriptConverter
|
||||
|
||||
|
||||
|
||||
|
||||
class GentleConverter(TranscriptConverter):
    """Convert Gentle forced-aligner JSON output into the common word format."""

    name = 'gentle'

    def __init__(self, json_data):
        super().__init__(json_data)

    def get_word_objects(self, json_data):
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        return word_object['start']

    @staticmethod
    def get_word_end(word_object):
        return word_object['end']

    @staticmethod
    def get_word_confidence(word_object):
        # Gentle does not report a confidence score; treat every word
        # as fully confident.
        return 1

    @staticmethod
    def get_word_word(word_object):
        return word_object['alignedWord']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        Gentle provides no punctuation information, so punc_before and
        punc_after are always False.  (Removed the unused ``num_words``
        local and a dead ``punc_after = False`` assignment from the
        original.)
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': False,
                'punc_before': False,
            })

        return converted_words
|
||||
|
||||
145
transcript_processing/converters/google.py
Normal file
145
transcript_processing/converters/google.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
from ..converter import TranscriptConverter
|
||||
from .. import helpers
|
||||
|
||||
|
||||
|
||||
class GoogleConverter(TranscriptConverter):
    """Convert Google Speech-to-Text output into the common word format.

    The raw transcript arrives as a protobuf-style text dump; it is first
    rewritten into valid JSON by ``make_json_friendly`` and then parsed.
    """

    # CONSISTENCY FIX: every sibling converter (amazon, gentle,
    # speechmatics) declares a ``name`` class attribute, and the package
    # __init__ registers this class under 'google' -- the original class
    # was missing it.
    name = 'google'

    def __init__(self, transcript_data: str):
        super().__init__(transcript_data)
        self.json_data = self.pre_process(transcript_data)

    def pre_process(self, transcript_data):
        """Turn the protobuf-style dump into parsed JSON data."""
        friendly = make_json_friendly(transcript_data)
        return json.loads(friendly)

    def get_word_objects(self, json_data):
        # After pre-processing, json_data is already the list of words.
        return json_data

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts, splitting any leading /
        trailing punctuation off the word text into punc_before/punc_after.
        """
        converted_words = []

        punc_before = False
        punc_after = False

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            punc_before = helpers.get_punc_before(word_obj.word) or False
            punc_after = helpers.get_punc_after(word_obj.word) or False

            # Strip the detected punctuation off the stored word text.
            the_word = word_obj.word
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
                the_word = the_word[:-len(punc_after)]

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': the_word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words

    @classmethod
    def get_word_start(cls, word_object):
        return cls.get_seconds(word_object['start_time'])

    @classmethod
    def get_word_end(cls, word_object):
        return cls.get_seconds(word_object['end_time'])

    @staticmethod
    def get_seconds(time: dict) -> float:
        """Collapse Google's {'seconds': ..., 'nanos': ...} timestamp
        into a float number of seconds; either key may be absent.
        """
        seconds = 0
        if 'seconds' in time:
            seconds = time['seconds']
        if 'nanos' in time:
            seconds += time['nanos'] / 1_000_000_000
        return seconds

    @staticmethod
    def get_word_confidence(word_object):
        return word_object['confidence']

    @staticmethod
    def get_word_word(word_object):
        # BUG FIX: removed a stray debug print(word_object) left in the
        # original.
        return word_object['word']
|
||||
|
||||
|
||||
|
||||
def make_json_friendly(json_string):
    """Rewrite Google's protobuf-style text dump into a JSON array string.

    The input is a line-oriented dump whose lines look like ``words {``,
    ``start_time {``, ``seconds: 1``, ``word: "hi"`` ...  This walks the
    expected field sequence per word object and emits a ``[{...}, {...}]``
    string that ``json.loads`` accepts.
    """
    # NOTE(review): this splits on the literal two-character sequence
    # backslash + 'n', NOT on real newlines -- presumably the input is a
    # repr()'d/escaped string.  Confirm against the caller.
    lines = [line.strip() for line in json_string.split('\\n')]

    # Field markers expected, in this order, for each word object.
    fields = [
        'words {',
        'start_time {',
        '}',
        'end_time {',
        '}',
        'word: ',
        'confidence: '
    ]

    # Index into `fields` of the marker we expect to see next.
    current_field_index = 0
    new_string = ''

    for line in lines:

        current_field = fields[current_field_index]

        if current_field in line:
            # Advance to the next expected field, wrapping after the last.
            if current_field_index == len(fields) - 1:
                current_field_index = 0
            else:
                current_field_index += 1
            if current_field_index == 1:
                # Start of a new word object: close the previous one and
                # open the next.
                new_string += '}, {'
                # "words" was found, don't want to append that
                continue

        else:
            if current_field_index == 0:
                # haven't found the beginning of the next word object
                continue

        # add quotes around keys
        line = re.sub('^(?!")([0-9a-zA-Z_]+)',
                      '"\\1"',
                      line)

        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')

        # use first two decimals of confidence
        # NOTE(review): truncating to 20 characters assumes the prefixed
        # ', "confidence": 0.' shape -- verify against real dumps.
        if 'confidence' in current_field:
            line = ', ' + line
            line = line[:20]

        if current_field == '}':
            line = line + ', '

        new_string += line

    # cleanup
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    # Drop trailing commas inside objects and any leftover backslashes.
    new_string = new_string.replace(', }', '}').replace('\\', '')

    return new_string
|
||||
133
transcript_processing/converters/speechmatics.py
Normal file
133
transcript_processing/converters/speechmatics.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from collections import namedtuple
|
||||
import json
|
||||
|
||||
from ..converter import TranscriptConverter
|
||||
from .. import helpers
|
||||
|
||||
|
||||
|
||||
class SpeechmaticsConverter(TranscriptConverter):
    """Convert Speechmatics JSON output into the common word format."""

    name = 'speechmatics'

    def __init__(self, path):
        super().__init__(path)

    def get_word_objects(self, json_data):
        return json_data['words']

    @staticmethod
    def get_word_start(word_object):
        return float(word_object['time'])

    @staticmethod
    def get_word_end(word_object):
        # Speechmatics stores a duration rather than an absolute end time.
        return (SpeechmaticsConverter.get_word_start(word_object)
                + float(word_object['duration']))

    @staticmethod
    def get_word_confidence(word_object):
        return float(word_object['confidence'])

    @staticmethod
    def get_word_word(word_object):
        return word_object['name']

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        Full stops appear as their own word objects; they are skipped
        here and attached to the preceding word as punc_after instead.
        (Removed the unused ``num_words`` local from the original.)
        """
        converted_words = []
        punc_before = False
        punc_after = False

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            if word_obj.word == '.':
                continue

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                if next_word == '.':
                    punc_after = '.'

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            punc_after = False

        return converted_words
|
||||
|
||||
|
||||
def speechmatics_aligned_text_converter(data):
    """Convert a Speechmatics aligned-text stream into converted word dicts.

    ``data`` is a file-like object whose first line contains the whole
    transcript, with each word wrapped in ``<time=...>`` markers giving
    its start and end times.
    """
    data = data.readlines()[0]

    class Exhausted(Exception):
        # Raised when no further 'time=' markers remain in the transcript.
        pass

    Word = namedtuple('Word', 'start end word')

    def get_time(transcript, index):
        # Return (time value, index of the closing '>') for the next
        # 'time=' marker at or after `index`; raise Exhausted when none.
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        # Parse one <time=a>word<time=b> group starting at start_index.
        start, end_of_start_index = get_time(transcript, start_index)

        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        word = transcript[word_start_index: word_end_index]

        end, close_index = get_time(transcript, word_end_index)

        return Word(start, end, word), close_index

    words = []
    next_index = 0
    word = None

    while True:
        try:
            word, next_index = find_next_word(data, next_index)
        except Exhausted:
            break
        else:
            words.append(word)

    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []

    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)
        the_word = word.word
        if punc_before or punc_after:
            # Strip every punctuation character from the word text.
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')
        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # BUG FIX: the original called self.check_if_always_capitalized
            # here, but this is a module-level function with no `self`, so
            # that line always raised NameError.  Use the proper-noun flag
            # computed (and previously unused) above instead -- presumably
            # the intended value; confirm against the class-based
            # converters' behavior.
            'always_capitalized': is_proper_noun,
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words
|
||||
Reference in New Issue
Block a user