194 lines
5.3 KiB
Python
194 lines
5.3 KiB
Python
"""
Converters from vendor transcript formats to a common list of word dicts.

fields for converted transcript:

    start
    end
    word
    confidence
    index
    always_capitalized
    punc_before
    punc_after

"""
|
|
|
|
from collections import namedtuple
|
|
from decimal import Decimal
|
|
import json
|
|
from typing import Dict, Union, List
|
|
|
|
import helpers
|
|
|
|
|
|
def amazon_converter(data: dict):
    """Convert an Amazon transcript into the common list-of-word-dicts format.

    Args:
        data: The transcript, either already parsed into a dict or as a
            readable file-like object holding the JSON document (the only
            form accepted before dict support was added). Items are read
            from ``data['results']['items']``.

    Returns:
        A list with one dict per non-punctuation item, in transcript order.
        Each dict has the keys ``start``, ``end``, ``confidence``, ``word``,
        ``always_capitalized``, ``index``, ``punc_after`` and
        ``punc_before``. ``punc_after`` is ``'.'``, ``','`` or ``False``;
        ``punc_before`` is always ``False`` for this converter.

    Raises:
        KeyError, IndexError, ValueError: if an item lacks the expected
            fields or its timing/confidence values are not numeric.
    """
    if not isinstance(data, dict):
        data = json.load(data)

    items = data['results']['items']
    if not items:
        return []

    # tag_words receives every item (punctuation included) so its result can
    # be indexed with the same position ``i`` used for ``items`` below.
    tagged_words = helpers.tag_words(
        [item['alternatives'][0]['content'] for item in items])
    last_position = len(items) - 1
    converted_words = []

    # Comma owed to the *next* spoken word (set while handling "you" in
    # "you know"). It must outlive a single iteration, so it is initialised
    # once here rather than inside the loop.
    carried_punc_after = None

    for i, item in enumerate(items):
        if item['type'] == 'punctuation':
            # Punctuation is folded into the preceding word's punc_after.
            continue

        best = item['alternatives'][0]
        word = best['content']
        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        next_word = None
        if i < last_position:
            next_word = items[i + 1]['alternatives'][0]['content']

        # NOTE(review): only '.' and ',' are carried over, as before; any
        # other punctuation item is dropped -- confirm that is intended.
        punc_after = False
        if next_word in ('.', ','):
            punc_after = next_word
        elif carried_punc_after:
            punc_after = carried_punc_after
        carried_punc_after = None

        if word.lower() == 'you' and next_word == 'know':
            # Set the filler phrase "you know" off with commas.
            # ``i > 0`` keeps ``items[i - 1]`` from wrapping around to the
            # last item; a non-punctuation predecessor guarantees that
            # ``converted_words`` is non-empty.
            if i > 0 and items[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if items[i + 1]['type'] != 'punctuation':
                carried_punc_after = ','

        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        converted_words.append({
            'start': float(item['start_time']),
            'end': float(item['end_time']),
            'confidence': float(best['confidence']),
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            # Position among spoken words only (punctuation is not counted).
            'index': len(converted_words),
            'punc_after': punc_after,
            'punc_before': False,
        })

    return converted_words
|
|
|
|
|
|
def speechmatics_converter(data: dict):
    """Convert a Speechmatics JSON transcript into the common word-dict format.

    Args:
        data: The transcript, either already parsed into a dict or as a
            readable file-like object holding the JSON document (the only
            form accepted before dict support was added). Words are read
            from ``data['words']``.

    Returns:
        A list with one dict per entry whose ``name`` is not ``'.'``, in
        transcript order. Each dict has the keys ``start``, ``end``,
        ``confidence``, ``word``, ``always_capitalized``, ``index``,
        ``punc_after`` and ``punc_before``. ``end`` is ``time + duration``;
        ``punc_after`` is ``'.'`` when the next entry is a period, otherwise
        ``False``; ``punc_before`` is always ``False`` for this converter.

    Raises:
        KeyError, ValueError: if an entry lacks the expected fields or its
            numeric fields cannot be parsed as floats.
    """
    if not isinstance(data, dict):
        data = json.load(data)

    entries = data['words']
    if not entries:
        return []

    # Every entry (periods included) is tagged so the result can be indexed
    # with the same position ``i`` used for ``entries`` below.
    tagged_words = helpers.tag_words([entry['name'] for entry in entries])
    last_position = len(entries) - 1
    converted_words = []

    for i, entry in enumerate(entries):
        word = entry['name']
        if word == '.':
            # Periods are folded into the preceding word's punc_after, so
            # skip them before parsing their timing fields.
            continue

        word_start = float(entry['time'])
        followed_by_period = (
            i < last_position and entries[i + 1]['name'] == '.')
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        converted_words.append({
            'start': word_start,
            'end': word_start + float(entry['duration']),
            'confidence': float(entry['confidence']),
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            # Position among spoken words only (periods are not counted).
            'index': len(converted_words),
            'punc_after': '.' if followed_by_period else False,
            'punc_before': False,
        })

    return converted_words
|
|
|
|
|
|
def speechmatics_aligned_text_converter(data):
    """Convert Speechmatics aligned text into the common word-dict format.

    The input is scanned for runs of the form
    ``time=<start>>WORD<...time=<end>>``: the text between the ``>`` that
    closes one ``time=`` tag and the next ``<`` is taken as the word, and
    the following ``time=`` value as its end time.

    Args:
        data: A file-like object providing ``readlines()``.
            NOTE(review): only the first line is parsed, as before --
            confirm aligned-text files are always a single line.

    Returns:
        A list with one dict per parsed word, in order, with the keys
        ``start``, ``end``, ``confidence`` (always ``1``), ``word`` (with
        punctuation removed when any was detected), ``always_capitalized``,
        ``index``, ``punc_before`` and ``punc_after``. An empty file, or a
        first line without any ``time=`` tag, yields ``[]``.

    Raises:
        ValueError: if the text after a ``time=`` tag is not a valid float.
    """
    lines = data.readlines()
    if not lines:
        return []
    transcript = lines[0]

    Word = namedtuple('Word', 'start end word')

    class Exhausted(Exception):
        """Signals that no further ``time=`` tag exists in the transcript."""

    def get_time(text, index):
        """Return (time value, index of its closing '>') at or after index."""
        time_index = text.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = text.find('>', time_index)
        # len('time=') == 5: the value sits between the '=' and the '>'.
        return float(text[time_index + 5: close_index]), close_index

    def find_next_word(text, start_index):
        """Return (Word, index of the '>' closing the word's end tag)."""
        start, end_of_start_index = get_time(text, start_index)

        word_start_index = end_of_start_index + 1
        word_end_index = text.find('<', word_start_index)
        word = text[word_start_index: word_end_index]

        end, close_index = get_time(text, word_end_index)
        return Word(start, end, word), close_index

    words = []
    next_index = 0
    while True:
        try:
            parsed, next_index = find_next_word(transcript, next_index)
        except Exhausted:
            break
        words.append(parsed)

    if not words:
        return []

    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []

    for i, parsed in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(parsed.word)
        punc_after = helpers.get_punc_after(parsed.word)

        the_word = parsed.word
        if punc_before or punc_after:
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')

        converted_words.append({
            'start': parsed.start,
            'end': parsed.end,
            'confidence': 1,
            'word': the_word,
            # Compare the stripped text: comparing the Word namedtuple
            # itself with 'I' could never be true.
            'always_capitalized': is_proper_noun or the_word == 'I',
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words
|
|
|
|
|
|
# Registry mapping each supported transcript source name to the function
# that converts that source's output into the common word-dict format.
converters = dict(
    speechmatics=speechmatics_converter,
    speechmatics_align=speechmatics_aligned_text_converter,
    amazon=amazon_converter,
)
|