Files
tpro/converters.py
2018-11-09 04:33:43 -05:00

66 lines
1.5 KiB
Python

"""
fields for converted transcript:
wordStart
wordEnd
word
confidence
index
alwaysCapitalized
puncBefore
puncAfter
"""
from decimal import Decimal
from typing import Dict, Union, List
from helpers import tag_words, PROPER_NOUN_TAGS
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
    """Convert a Speechmatics transcript into a list of converted-word dicts.

    Reads ``data['words']``, where each entry is expected to provide the
    keys ``'name'``, ``'time'``, ``'duration'`` and ``'confidence'``.
    Entries whose name is ``'.'`` are not emitted as words; instead, the
    word immediately preceding such an entry gets ``'puncAfter'`` set to
    ``'.'``.

    Returns a list with one dict per emitted word, holding the keys
    ``wordStart``, ``wordEnd``, ``confidence``, ``word``,
    ``alwaysCapitalized``, ``index``, ``puncAfter`` and ``puncBefore``.
    ``puncAfter`` is ``'.'`` or ``False``; ``puncBefore`` is always
    ``False`` in this converter.
    """
    entries = data['words']
    # NOTE(review): tag_words comes from helpers; element [1] of each result
    # is presumably a part-of-speech tag -- confirm against helpers.
    tags = tag_words([entry['name'] for entry in entries])
    last_position = len(entries) - 1
    result = []

    for position, entry in enumerate(entries):
        # Numeric fields are parsed for every entry, punctuation included,
        # before the punctuation check below.
        start = float(entry['time'])
        end = start + float(entry['duration'])
        score = float(entry['confidence'])
        token = entry['name']
        if token == '.':
            continue

        proper_noun = tags[position][1] in PROPER_NOUN_TAGS
        # Only the directly following entry is inspected for a period.
        followed_by_period = (
            position < last_position
            and entries[position + 1]['name'] == '.'
        )

        result.append({
            'wordStart': start,
            'wordEnd': end,
            'confidence': score,
            'word': token,
            'alwaysCapitalized': proper_noun or token == 'I',
            # Counts emitted words only, so skipped periods leave no gaps.
            'index': len(result),
            'puncAfter': '.' if followed_by_period else False,
            'puncBefore': False,
        })

    return result
# Registry mapping a transcription service name to the function that
# converts that service's output into the converted transcript format.
converters = {'speechmatics': speechmatics_converter}