64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
"""
|
|
|
|
fields for converted transcript:
|
|
|
|
wordStart
|
|
wordEnd
|
|
word
|
|
confidence
|
|
index
|
|
alwaysCapitalized
|
|
puncBefore
|
|
puncAfter
|
|
|
|
"""
|
|
|
|
from decimal import Decimal
|
|
from typing import Dict, Union, List
|
|
|
|
from helpers import tag_words, PROPER_NOUN_TAGS
|
|
|
|
|
|
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
    """
    Convert the ``words`` list of a Speechmatics payload into word entries.

    Each item of ``data['words']`` is read for its ``name``, ``time``,
    ``duration`` and ``confidence`` values. Items whose name is ``'.'`` are
    not emitted as entries of their own; instead, the word immediately
    before such an item gets ``puncAfter`` set to ``'.'``. For every other
    word ``puncAfter`` is ``False``, and ``puncBefore`` is always ``False``.

    ``index`` is the item's position in the input list, so skipped ``'.'``
    items leave gaps in the emitted indices.

    Returns a list of dicts, one per emitted word.
    """
    entries = data['words']
    tags = tag_words([entry['name'] for entry in entries])
    total = len(entries)
    result = []

    for position, entry in enumerate(entries):
        start = float(entry['time'])
        end = start + float(entry['duration'])
        score = float(entry['confidence'])
        text = entry['name']

        # A full stop is never a word of its own; it is recorded on the
        # preceding word through the look-ahead below.
        if text == '.':
            continue

        proper_noun = tags[position][1] in PROPER_NOUN_TAGS

        # Peek at the following item (if any) to see whether it is a full stop.
        has_successor = position < total - 1
        successor = entries[position + 1]['name'] if has_successor else None
        trailing_punc = '.' if successor == '.' else False

        result.append({
            'wordStart': start,
            'wordEnd': end,
            'confidence': score,
            'word': text,
            'alwaysCapitalized': proper_noun or text == 'I',
            'index': position,
            'puncAfter': trailing_punc,
            'puncBefore': False,
        })

    return result
|
|
|
|
|
|
# Lookup table from a converter's name to its conversion function.
converters = dict(
    speechmatics=speechmatics_converter,
)
|