added amazon converter

2018-11-28 02:04:52 -05:00
parent d5a37df5a8
commit c9c4cbe550
8 changed files with 12813 additions and 122 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/Lelandmp3.json
+++ b/Lelandmp3.json
--- a/Lelandmp3_processed.json
+++ b/Lelandmp3_processed.json
--- a/pycache/converters.cpython-36.pyc
+++ b/pycache/converters.cpython-36.pyc
--- a/pycache/models.cpython-36.pyc
+++ b/pycache/models.cpython-36.pyc
--- a/converters.py
+++ b/converters.py
@@ -21,6 +21,67 @@ from typing import Dict, Union, List
 import helpers
 def amazon_converter(data: dict):
    """Convert an Amazon transcript into the internal list of word dicts.

    NOTE: despite the ``dict`` annotation, ``data`` is handed straight to
    ``json.load`` and therefore must be a readable file-like object
    containing the transcript JSON.

    The JSON is expected to hold ``results.items``, each item carrying a
    ``type`` and an ``alternatives`` list whose first entry has ``content``;
    non-punctuation items are also read for ``start_time``, ``end_time``
    and a ``confidence``.

    Punctuation items are not emitted as words. Instead a following ``.``
    or ``,`` is attached to the preceding word as ``punc_after``. The
    filler phrase "you know" is additionally set off with commas: one on
    the word before "you" and one after "know", unless explicit
    punctuation is already present at that position.

    Returns:
        A list of dicts with the keys ``start``, ``end``, ``confidence``,
        ``word``, ``always_capitalized``, ``index``, ``punc_after`` and
        ``punc_before``. ``punc_after`` is ``False`` when the word has no
        trailing punctuation; ``punc_before`` is always ``False``.
    """
    data = json.load(data)
    words = data['results']['items']
    # Tag every item (punctuation included) so that tagged_words[i] lines
    # up with words[i] inside the loop below.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    num_words = len(words)
    converted_words = []
    # Comma owed to the next spoken word. It must live outside the loop:
    # it is set while processing "you" and consumed while processing "know".
    carried_punc_after = None
    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue
        # Take ownership of any carried comma so it can never leak past
        # the word it was meant for.
        pending_punc_after = carried_punc_after
        carried_punc_after = None

        best = w['alternatives'][0]
        word = best['content']
        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        next_word = None
        next_word_type = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']

        # Explicit punctuation in the transcript wins over a carried comma.
        punc_after = False
        if next_word == '.':
            punc_after = '.'
        elif next_word == ',':
            punc_after = ','
        elif pending_punc_after:
            punc_after = pending_punc_after

        if word.lower() == 'you' and next_word == 'know':
            # Only look back when there is a previous item; at i == 0 the
            # index -1 would wrap around to the end of the transcript and
            # converted_words would still be empty.
            if i > 0 and words[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                carried_punc_after = ','

        converted_words.append({
            'start': float(w['start_time']),
            'end': float(w['end_time']),
            'confidence': float(best['confidence']),
            'word': word,
            'always_capitalized': (
                tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
                or word == 'I'),
            # Position among emitted words (punctuation items excluded).
            'index': len(converted_words),
            'punc_after': punc_after,
            'punc_before': False,
        })
    return converted_words
 def speechmatics_converter(data: dict):
    data = json.load(data)
    converted_words = []
@@ -128,4 +189,5 @@ def speechmatics_aligned_text_converter(data):
 # Registry mapping a converter name to the function that implements it.
 converters = {
    'speechmatics': speechmatics_converter,
    'speechmatics_align': speechmatics_aligned_text_converter,
    'amazon': amazon_converter,
 }
--- a/leland_transcript.json
+++ b/leland_transcript.json
@@ -2458,131 +2458,11 @@
      "name": "texture",
      "time": "138.05"
    },
    {
      "duration": "0.39",
      "confidence": "1.000",
      "name": "so",
      "time": "138.77"
    },
    {
      "duration": "0.30",
      "confidence": "1.000",
      "name": "making",
      "time": "139.16"
    },
    {
      "duration": "0.30",
      "confidence": "1.000",
      "name": "sure",
      "time": "139.46"
    },
    {
      "duration": "0.15",
      "confidence": "1.000",
      "name": "that",
      "time": "139.76"
    },
    {
      "duration": "0.15",
      "confidence": "1.000",
      "name": "we",
      "time": "139.91"
    },
    {
      "duration": "0.33",
      "confidence": "1.000",
      "name": "have",
      "time": "140.06"
    },
    {
      "duration": "0.36",
      "confidence": "1.000",
      "name": "something",
      "time": "140.39"
    },
    {
      "duration": "0.12",
      "confidence": "1.000",
      "name": "that",
      "time": "140.75"
    },
    {
      "duration": "0.30",
      "confidence": "1.000",
      "name": "people",
      "time": "140.87"
    },
    {
      "duration": "0.18",
      "confidence": "1.000",
      "name": "don't",
      "time": "141.17"
    },
    {
      "duration": "0.21",
      "confidence": "1.000",
      "name": "want",
      "time": "141.35"
    },
    {
      "duration": "0.09",
      "confidence": "1.000",
-      "name": "to",
+      "name": ".",
-      "time": "141.56"
+      "time": "138.77"
    },
    {
      "duration": "0.39",
      "confidence": "1.000",
      "name": "eat",
      "time": "141.65"
    },
    {
      "duration": "0.21",
      "confidence": "1.000",
      "name": "and",
      "time": "142.10"
    },
    {
      "duration": "0.18",
      "confidence": "0.560",
      "name": "we'll",
      "time": "142.31"
    },
    {
      "duration": "0.39",
      "confidence": "1.000",
      "name": "eat",
      "time": "142.49"
    },
    {
      "duration": "0.27",
      "confidence": "1.000",
      "name": "to",
      "time": "143.24"
    },
    {
      "duration": "0.21",
      "confidence": "1.000",
      "name": "stay",
      "time": "143.51"
    },
    {
      "duration": "0.45",
      "confidence": "1.000",
      "name": "healthy",
      "time": "143.72"
    },
    {
      "duration": "0.15",
      "confidence": "1.000",
      "name": "on",
      "time": "144.41"
    },
    {
      "duration": "0.28",
      "confidence": "1.000",
      "name": "this",
      "time": "144.56"
    }
  ],
  "format": "1.0"
--- a/leland_transcript_processed.json
+++ b/leland_transcript_processed.json