# tpro/converters.py

"""

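Converters from speech-to-text provider output to a common word list.

Each converter returns a list of dicts, one per word, with these fields: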

    start
    end
    word
    confidence
    index
    always_capitalized
    punc_before
    punc_after

"""

from collections import namedtuple
from decimal import Decimal
import json
from typing import Dict, IO, List, TextIO, Union

import helpers


def amazon_converter(data: IO) -> List[dict]:
    """Convert an Amazon Transcribe JSON transcript into the common word list.

    Args:
        data: Open file-like object holding the JSON transcript; it is parsed
            with ``json.load``.  The word items are read from
            ``['results']['items']``.

    Returns:
        One dict per non-punctuation item, with the keys ``start``, ``end``,
        ``confidence``, ``word``, ``always_capitalized``, ``index``,
        ``punc_after`` and ``punc_before``.  ``punc_after`` is ``'.'`` or
        ``','`` when that punctuation applies to the word and ``False``
        otherwise; ``punc_before`` is always ``False``.

    Only ``'.'`` and ``','`` punctuation items are carried over to the
    preceding word.
    NOTE(review): other punctuation items (e.g. ``'?'``) are dropped --
    confirm whether that is intended.
    """
    data = json.load(data)
    items = data['results']['items']
    # tag_words receives every item (punctuation included) so that its
    # result can be indexed with the same position as ``items``.
    tagged_words = helpers.tag_words(
        [item['alternatives'][0]['content'] for item in items])
    last_position = len(items) - 1
    converted_words = []
    # Comma owed to the *next* word; set when "you know" is detected so that
    # "know" gets a trailing comma.  It must live outside the loop body,
    # otherwise it is wiped before the next word can pick it up.
    pending_punc_after = None

    for i, item in enumerate(items):
        if item['type'] == 'punctuation':
            continue

        best = item['alternatives'][0]
        word_start = float(item['start_time'])
        word_end = float(item['end_time'])
        confidence = float(best['confidence'])
        word = best['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        next_word_type = None
        if i < last_position:
            following = items[i + 1]
            next_word = following['alternatives'][0]['content']
            next_word_type = following['type']

        # Always consume the carried comma here, so it can never leak on to
        # a later word when real punctuation takes precedence.
        carried_punc, pending_punc_after = pending_punc_after, None
        if next_word in ('.', ','):
            punc_after = next_word
        elif carried_punc:
            punc_after = carried_punc
        else:
            punc_after = False

        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        if word.lower() == 'you' and next_word == 'know':
            # Set "you know" off with commas: one after the previous word
            # and one after "know" (applied on the next iteration).
            # ``i > 0`` keeps ``items[i - 1]`` from wrapping around to the
            # last item and guarantees ``converted_words`` is non-empty.
            if i > 0 and items[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                pending_punc_after = ','

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            # Position among emitted words (punctuation items are skipped).
            'index': len(converted_words),
            'punc_after': punc_after,
            'punc_before': False,
        })

    return converted_words


def speechmatics_converter(data: IO) -> List[dict]:
    """Convert a Speechmatics JSON transcript into the common word list.

    Args:
        data: Open file-like object holding the JSON transcript; it is parsed
            with ``json.load``.  The entries are read from ``['words']``.

    Returns:
        One dict per entry whose ``name`` is not ``'.'``, with the keys
        ``start``, ``end``, ``confidence``, ``word``, ``always_capitalized``,
        ``index``, ``punc_after`` and ``punc_before``.  ``end`` is
        ``time + duration``; ``punc_after`` is ``'.'`` when the next entry is
        a full stop and ``False`` otherwise; ``punc_before`` is always
        ``False``.
    """
    data = json.load(data)
    entries = data['words']
    # tag_words receives every entry (full stops included) so that its
    # result can be indexed with the same position as ``entries``.
    tagged_words = helpers.tag_words([entry['name'] for entry in entries])
    last_position = len(entries) - 1
    converted_words = []

    for i, entry in enumerate(entries):
        word = entry['name']
        if word == '.':
            # Full stops are attached to the preceding word below; skip
            # them before parsing their numeric fields.
            continue

        word_start = float(entry['time'])
        word_end = word_start + float(entry['duration'])
        confidence = float(entry['confidence'])
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        followed_by_period = (
            i < last_position and entries[i + 1]['name'] == '.')

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            # Position among emitted words (full stops are skipped).
            'index': len(converted_words),
            'punc_after': '.' if followed_by_period else False,
            'punc_before': False,
        })

    return converted_words


def speechmatics_aligned_text_converter(data: TextIO) -> List[dict]:
    """Convert Speechmatics aligned text into the common word list.

    Only the first line of ``data`` is parsed.  Each word is expected to sit
    between two ``time=<number>>`` markers: the marker before the word gives
    its start time and the next marker gives its end time.

    Args:
        data: Open text file-like object; read with ``readlines()``.

    Returns:
        One dict per parsed word, with the keys ``start``, ``end``,
        ``confidence`` (always ``1``), ``word``, ``always_capitalized``,
        ``index``, ``punc_before`` and ``punc_after``.  Returns an empty
        list when the input is empty or contains no timed words.
    """
    lines = data.readlines()
    # An empty file has no first line; treat it as an empty transcript.
    transcript_line = lines[0] if lines else ''
    Word = namedtuple('Word', 'start end word')

    class Exhausted(Exception):
        """Raised when no further ``time=`` marker can be found."""

    def get_time(transcript, index):
        """Return (time value, index of closing '>') of the next marker."""
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        # 5 == len('time='): the number starts right after the marker name.
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        """Return the next Word and the index where scanning should resume."""
        start, end_of_start_index = get_time(transcript, start_index)

        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        text = transcript[word_start_index: word_end_index]

        end, close_index = get_time(transcript, word_end_index)

        return Word(start, end, text), close_index

    words = []
    next_index = 0
    while True:
        try:
            parsed, next_index = find_next_word(transcript_line, next_index)
        except Exhausted:
            break
        else:
            words.append(parsed)

    if not words:
        return []

    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []

    for i, parsed in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(parsed.word)
        punc_after = helpers.get_punc_after(parsed.word)
        the_word = parsed.word
        if punc_before or punc_after:
            # Punctuation is reported separately, so strip it from the text.
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')
        converted_words.append({
            'start': parsed.start,
            'end': parsed.end,
            'confidence': 1,
            'word': the_word,
            # Compare the cleaned text, not the Word tuple, against 'I'.
            'always_capitalized': is_proper_noun or the_word == 'I',
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words


# Registry mapping a provider name to the function that converts that
# provider's transcript into the common word list.
converters = dict(
    speechmatics=speechmatics_converter,
    speechmatics_align=speechmatics_aligned_text_converter,
    amazon=amazon_converter,
)