# tpro/converters/amazon.py

import json

from transcript_processing import helpers


class AmazonConverter(TranscriptConverter):
    """Converter for Amazon-style transcript JSON.

    The input is expected to look like ``{'results': {'items': [...]}}``
    where every item carries a ``'type'`` and an ``'alternatives'`` list
    whose first entry holds ``'content'`` and ``'confidence'``; spoken
    items additionally carry ``'start_time'`` and ``'end_time'``.

    NOTE(review): ``TranscriptConverter`` is not imported in the visible
    part of this module; confirm the import exists or add it.
    NOTE(review): ``get_word_object`` is presumably inherited from
    ``TranscriptConverter``; it is used here as returning an object with
    ``start``, ``end``, ``confidence``, ``word``, ``is_proper_noun`` and
    ``next_word`` attributes -- confirm against the base class.
    """

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        """Return the raw list of transcript items from the parsed JSON."""
        return json_data['results']['items']

    def get_words(self, word_objects):
        """Return the text of every item, in order, as a list of strings."""
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        """Return the item's ``'start_time'`` as a float."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the item's ``'end_time'`` as a float."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Return the confidence of the item's first alternative as a float."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the text of the item's first alternative.

        A lone lowercase ``'i'`` is normalised to ``'I'``.
        """
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts, folding in punctuation.

        Punctuation items are not emitted as words; a ``'.'`` or ``','``
        directly following a word is recorded as that word's
        ``'punc_after'``.  The phrase "you know" is additionally set off
        with commas: one after the word preceding "you" (unless the
        preceding item is punctuation) and one after "know" (unless it is
        already followed by ``'.'``/``','`` or is the final item).

        Args:
            word_objects: raw transcript items (dicts with ``'type'`` and
                ``'alternatives'``).  NOTE(review): assumed to be the list
                returned by ``get_word_objects`` -- confirm with the caller.
            words: accepted for interface compatibility; the item dicts in
                ``word_objects`` are what is iterated, because the
                ``'type'`` lookups require dicts rather than plain strings.
            tagged_words: passed through unchanged to ``get_word_object``.

        Returns:
            A list of dicts with the keys ``start``, ``end``,
            ``confidence``, ``word``, ``always_capitalized``, ``index``,
            ``punc_after`` and ``punc_before``.
        """
        converted_words = []
        punc_before = False
        index = 0
        # Comma queued by a "you know" match; it belongs to the *next*
        # emitted word ("know"), so it has to survive across iterations.
        pending_punc_after = None

        for i, item in enumerate(word_objects):
            if item['type'] == 'punctuation':
                continue

            word_obj = self.get_word_object(
                item, i, tagged_words, word_objects)

            # Always bind these so the "you know" check below never reads
            # an unbound or stale value when there is no following item.
            next_word = None
            next_word_type = None
            punc_after = False

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ('.', ','):
                    punc_after = next_word
                elif pending_punc_after:
                    punc_after = pending_punc_after
            # The queued comma only ever applies to the word right after
            # "you"; drop it whether or not it was used.
            pending_punc_after = None

            if word_obj.word.lower() == 'you' and next_word == 'know':
                # Only look back when there is a previous item; index -1
                # would wrap around to the end of the transcript.
                if i > 0 and word_objects[i - 1]['type'] != 'punctuation':
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
                    pending_punc_after = ','

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            index += 1

        return converted_words


def amazon_converter(data):
    """Convert Amazon-style transcript JSON into a list of word dicts.

    Punctuation items are not emitted as words; a ``'.'`` or ``','``
    directly following a word is recorded as that word's ``'punc_after'``.
    The phrase "you know" is additionally set off with commas: one after
    the word preceding "you" (unless the preceding item is punctuation)
    and one after "know" (unless it is already followed by ``'.'``/``','``
    or is the final item).

    Args:
        data: either an open file-like object containing the JSON
            document, or the already-parsed dict.  The document must have
            the shape ``{'results': {'items': [...]}}``.

    Returns:
        A list of dicts with the keys ``start``, ``end``, ``confidence``,
        ``word``, ``always_capitalized``, ``index``, ``punc_after`` and
        ``punc_before``.
    """
    if not isinstance(data, dict):
        data = json.load(data)

    words = data['results']['items']
    # tag_words output is indexed in parallel with ``words``; element [1]
    # of each entry is compared against helpers.PROPER_NOUN_TAGS below.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])

    converted_words = []
    punc_before = False
    num_words = len(words)
    index = 0
    # Comma queued by a "you know" match; it belongs to the *next* emitted
    # word ("know"), so it has to survive across iterations.
    pending_punc_after = None

    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue

        alternative = w['alternatives'][0]
        word = alternative['content']
        if word == 'i':
            # weird Amazon quirk
            word = 'I'
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        next_word_type = None
        if i < num_words - 1:
            following = words[i + 1]
            next_word = following['alternatives'][0]['content']
            next_word_type = following['type']

        punc_after = False
        if next_word in ('.', ','):
            punc_after = next_word
        elif pending_punc_after and next_word is not None:
            punc_after = pending_punc_after
        # The queued comma only ever applies to the word right after
        # "you"; drop it whether or not it was used.
        pending_punc_after = None

        if word.lower() == 'you' and next_word == 'know':
            # Only look back when there is a previous item; index -1 would
            # wrap around to the end of the transcript.
            if i > 0 and words[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                pending_punc_after = ','

        converted_words.append({
            'start': float(w['start_time']),
            'end': float(w['end_time']),
            'confidence': float(alternative['confidence']),
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })

        index += 1

    return converted_words