147 lines
4.5 KiB
Python
147 lines
4.5 KiB
Python
import json
|
|
|
|
from transcript_processing import helpers
|
|
|
|
|
|
|
|
# NOTE(review): TranscriptConverter is not imported in the visible part of
# this file -- confirm it is brought into scope before this class is defined.
class AmazonConverter(TranscriptConverter):
    """Convert the items of an Amazon transcript JSON into word dicts.

    The accessors below read the per-item layout used by the transcript:
    ``item['type']``, ``item['start_time']``, ``item['end_time']`` and
    ``item['alternatives'][0]`` (``content`` and ``confidence``).
    """

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        """Return the list of item dicts found at ``results -> items``."""
        return json_data['results']['items']

    def get_words(self, word_objects):
        """Return the text of every item (words and punctuation), in order."""
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        """Return the item's ``start_time`` as a float."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the item's ``end_time`` as a float."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Return the confidence of the item's first alternative as a float."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the text of the item's first alternative.

        A lone lowercase ``'i'`` is returned as ``'I'``.
        """
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build one output dict per non-punctuation item.

        Args:
            word_objects: item dicts as returned by ``get_word_objects``.
            words: item texts as returned by ``get_words``. Not used here:
                the loop needs each item's ``'type'``, which only the item
                dicts carry. Kept so the method signature is unchanged.
            tagged_words: forwarded untouched to ``self.get_word_object``.

        Returns:
            A list of dicts with the keys ``start``, ``end``, ``confidence``,
            ``word``, ``always_capitalized``, ``index``, ``punc_after`` and
            ``punc_before``. ``punc_after`` is ``'.'`` or ``','`` when the
            following item is that punctuation mark, otherwise ``False``;
            ``punc_before`` is always ``False``.
        """
        converted_words = []

        punc_before = False
        punc_after = False
        index = 0

        # Iterate the item dicts (not the plain strings in ``words``) because
        # every step below indexes items by key.
        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
                continue

            # Reset per item so a word without a successor never sees values
            # left over from an earlier iteration (or unbound names).
            next_word = None
            next_word_type = None
            # NOTE(review): this is reset on every iteration, so the ``elif``
            # below can never fire and the comma meant to follow "you know"
            # is never emitted. Left as-is to keep the output unchanged --
            # confirm the intended behaviour before hoisting it out of the
            # loop.
            next_word_punc_after = None

            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ('.', ','):
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None

            if word_obj.word.lower() == 'you' and next_word == 'know':
                # Put a comma before "you know" unless punctuation is already
                # there. ``converted_words`` is empty when the phrase opens
                # the transcript, in which case there is nothing to amend.
                if (converted_words
                        and word_objects[i - 1]['type'] != 'punctuation'):
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
                    next_word_punc_after = ','

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            index += 1
            punc_after = False

        return converted_words
|
|
|
|
|
|
def amazon_converter(data: dict):
    """Convert an Amazon transcript into a list of word dicts.

    Args:
        data: either the already-parsed transcript (a mapping with
            ``data['results']['items']``) or an open file-like object whose
            ``read()`` yields the transcript JSON.

    Returns:
        A list with one dict per non-punctuation item, holding the keys
        ``start``, ``end``, ``confidence``, ``word``, ``always_capitalized``,
        ``index``, ``punc_after`` and ``punc_before``. ``punc_after`` is
        ``'.'`` or ``','`` when the following item is that punctuation mark,
        otherwise ``False``; ``punc_before`` is always ``False``.
    """
    # The signature advertises a dict while the original body required a
    # file object; accept both so either kind of caller works.
    if hasattr(data, 'read'):
        data = json.load(data)

    converted_words = []
    words = data['results']['items']
    # Tags are computed over every item (punctuation included) so that
    # ``tagged_words[i]`` lines up with ``words[i]``.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
    index = 0

    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue

        # NOTE(review): this is reset on every iteration, so the ``elif``
        # below can never fire and the comma meant to follow "you know" is
        # never emitted. Left as-is to keep the output unchanged -- confirm
        # the intended behaviour before hoisting it out of the loop.
        next_word_punc_after = None

        first_alternative = w['alternatives'][0]
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(first_alternative['confidence'])
        word = first_alternative['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        next_word_type = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
            if next_word in ('.', ','):
                punc_after = next_word
            elif next_word_punc_after:
                punc_after = next_word_punc_after
                next_word_punc_after = None

        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        if word.lower() == 'you' and next_word == 'know':
            # Put a comma before "you know" unless punctuation is already
            # there. ``converted_words`` is empty when the phrase opens the
            # transcript; without this guard ``words[i - 1]`` would wrap
            # around to the last item and ``converted_words[-1]`` could raise
            # IndexError.
            if converted_words and words[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                next_word_punc_after = ','

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })

        index += 1
        punc_after = False

    return converted_words
|