from collections import namedtuple import json from transcript_processing import helpers Word = namedtuple('Word', 'start end word') def speechmatics_converter(data): data = json.load(data) converted_words = [] words = data['words'] tagged_words = helpers.tag_words([w['name'] for w in words]) punc_before = False punc_after = False num_words = len(words) index = 0 for i, w in enumerate(words): word_start = float(w['time']) word_end = word_start + float(w['duration']) confidence = float(w['confidence']) word = w['name'] if word == '.': continue is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS next_word = None if i < num_words - 1: next_word = words[i + 1]['name'] if next_word == '.': punc_after = '.' converted_words.append({ 'start': word_start, 'end': word_end, 'confidence': confidence, 'word': word, 'always_capitalized': is_proper_noun or word == 'I', 'index': index, 'punc_after': punc_after, 'punc_before': punc_before, }) index += 1 punc_after = False return converted_words def speechmatics_aligned_text_converter(data): data = data.readlines()[0] class Exhausted(Exception): pass def get_time(transcript, index): time_index = transcript.find('time=', index) if time_index == -1: raise Exhausted close_index = transcript.find('>', time_index) return float(transcript[time_index + 5: close_index]), close_index def find_next_word(transcript, start_index): start, end_of_start_index = get_time(transcript, start_index) word_start_index = end_of_start_index + 1 word_end_index = transcript.find('<', word_start_index) word = transcript[word_start_index: word_end_index] end, close_index = get_time(transcript, word_end_index) return Word(start, end, word), close_index words = [] next_index = 0 word = None while True: try: word, next_index = find_next_word(data, next_index) except Exhausted: break else: words.append(word) tagged_words = helpers.tag_words([w.word for w in words]) converted_words = [] for i, word in enumerate(words): is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS punc_before = helpers.get_punc_before(word.word) punc_after = helpers.get_punc_after(word.word) the_word = word.word if punc_before or punc_after: for p in helpers.PUNCTUATION: the_word = the_word.replace(p, '') converted_words.append({ 'start': word.start, 'end': word.end, 'confidence': 1, 'word': the_word, 'always_capitalized': is_proper_noun or word == 'I', 'index': i, 'punc_before': punc_before, 'punc_after': punc_after, }) return converted_words def gentle_converter