added amazon converter

2018-11-28 02:04:52 -05:00
parent d5a37df5a8
commit c9c4cbe550
8 changed files with 12813 additions and 122 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/Lelandmp3.json
+++ b/Lelandmp3.json
--- a/Lelandmp3_processed.json
+++ b/Lelandmp3_processed.json
--- a/pycache/converters.cpython-36.pyc
+++ b/pycache/converters.cpython-36.pyc
--- a/pycache/models.cpython-36.pyc
+++ b/pycache/models.cpython-36.pyc
--- a/converters.py
+++ b/converters.py
@@ -21,6 +21,67 @@ from typing import Dict, Union, List
 import helpers


+def amazon_converter(data: dict):
+    """Convert an Amazon transcript JSON document into a list of word dicts.
+
+    NOTE: despite the ``dict`` annotation (kept to match the sibling
+    converters), ``data`` is handed straight to ``json.load`` and therefore
+    must be a readable file-like object.
+
+    Punctuation items are not emitted as words; a '.' or ',' directly after a
+    word is recorded in that word's ``punc_after``. The filler phrase
+    "you know" is set off with commas on both sides.
+
+    Returns:
+        A list with one dict per spoken word, holding the keys ``start``,
+        ``end``, ``confidence``, ``word``, ``always_capitalized``, ``index``,
+        ``punc_after`` and ``punc_before``.
+    """
+    data = json.load(data)
+    words = data['results']['items']
+    # Every item (punctuation included) is tagged so that tagged_words[i]
+    # stays aligned with words[i] inside the loop below.
+    tagged_words = helpers.tag_words(
+        [w['alternatives'][0]['content'] for w in words])
+    num_words = len(words)
+    converted_words = []
+    # Punctuation owed to the *next* emitted word. It has to live outside
+    # the loop so that it survives from one iteration to the following one.
+    pending_punc_after = None
+
+    for i, w in enumerate(words):
+        if w['type'] == 'punctuation':
+            continue
+
+        best = w['alternatives'][0]
+        word = best['content']
+        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
+
+        next_word = None
+        next_word_type = None
+        if i < num_words - 1:
+            next_word = words[i + 1]['alternatives'][0]['content']
+            next_word_type = words[i + 1]['type']
+
+        # Punctuation present in the transcript wins over a scheduled comma.
+        if next_word in ('.', ','):
+            punc_after = next_word
+        elif pending_punc_after:
+            punc_after = pending_punc_after
+        else:
+            punc_after = False
+        # A scheduled comma only ever applies to the word right after "you".
+        pending_punc_after = None
+
+        if word == 'i':
+            # weird Amazon quirk
+            word = 'I'
+
+        if word.lower() == 'you' and next_word == 'know':
+            # Comma before "you", unless the phrase opens the transcript or
+            # punctuation already precedes it. When words[i - 1] is not
+            # punctuation it was appended, so converted_words is non-empty.
+            if i > 0 and words[i - 1]['type'] != 'punctuation':
+                converted_words[-1]['punc_after'] = ','
+            # Comma after "know", applied on the next iteration.
+            if next_word_type != 'punctuation':
+                pending_punc_after = ','
+
+        converted_words.append({
+            'start': float(w['start_time']),
+            'end': float(w['end_time']),
+            'confidence': float(best['confidence']),
+            'word': word,
+            'always_capitalized': is_proper_noun or word == 'I',
+            # Position among emitted words (punctuation items are skipped).
+            'index': len(converted_words),
+            'punc_after': punc_after,
+            'punc_before': False,
+        })
+
+    return converted_words
+
+
 def speechmatics_converter(data: dict):
    data = json.load(data)
    converted_words = []
@@ -128,4 +189,5 @@ def speechmatics_aligned_text_converter(data):
 converters = {
    'speechmatics': speechmatics_converter,
    'speechmatics_align': speechmatics_aligned_text_converter,
+    'amazon': amazon_converter,
 }
--- a/leland_transcript.json
+++ b/leland_transcript.json
@@ -2458,131 +2458,11 @@
      "name": "texture",
      "time": "138.05"
    },
-    {
-      "duration": "0.39",
-      "confidence": "1.000",
-      "name": "so",
-      "time": "138.77"
-    },
-    {
-      "duration": "0.30",
-      "confidence": "1.000",
-      "name": "making",
-      "time": "139.16"
-    },
-    {
-      "duration": "0.30",
-      "confidence": "1.000",
-      "name": "sure",
-      "time": "139.46"
-    },
-    {
-      "duration": "0.15",
-      "confidence": "1.000",
-      "name": "that",
-      "time": "139.76"
-    },
-    {
-      "duration": "0.15",
-      "confidence": "1.000",
-      "name": "we",
-      "time": "139.91"
-    },
-    {
-      "duration": "0.33",
-      "confidence": "1.000",
-      "name": "have",
-      "time": "140.06"
-    },
-    {
-      "duration": "0.36",
-      "confidence": "1.000",
-      "name": "something",
-      "time": "140.39"
-    },
-    {
-      "duration": "0.12",
-      "confidence": "1.000",
-      "name": "that",
-      "time": "140.75"
-    },
-    {
-      "duration": "0.30",
-      "confidence": "1.000",
-      "name": "people",
-      "time": "140.87"
-    },
-    {
-      "duration": "0.18",
-      "confidence": "1.000",
-      "name": "don't",
-      "time": "141.17"
-    },
-    {
-      "duration": "0.21",
-      "confidence": "1.000",
-      "name": "want",
-      "time": "141.35"
-    },
    {
      "duration": "0.09",
      "confidence": "1.000",
-      "name": "to",
-      "time": "141.56"
-    },
-    {
-      "duration": "0.39",
-      "confidence": "1.000",
-      "name": "eat",
-      "time": "141.65"
-    },
-    {
-      "duration": "0.21",
-      "confidence": "1.000",
-      "name": "and",
-      "time": "142.10"
-    },
-    {
-      "duration": "0.18",
-      "confidence": "0.560",
-      "name": "we'll",
-      "time": "142.31"
-    },
-    {
-      "duration": "0.39",
-      "confidence": "1.000",
-      "name": "eat",
-      "time": "142.49"
-    },
-    {
-      "duration": "0.27",
-      "confidence": "1.000",
-      "name": "to",
-      "time": "143.24"
-    },
-    {
-      "duration": "0.21",
-      "confidence": "1.000",
-      "name": "stay",
-      "time": "143.51"
-    },
-    {
-      "duration": "0.45",
-      "confidence": "1.000",
-      "name": "healthy",
-      "time": "143.72"
-    },
-    {
-      "duration": "0.15",
-      "confidence": "1.000",
-      "name": "on",
-      "time": "144.41"
-    },
-    {
-      "duration": "0.28",
-      "confidence": "1.000",
-      "name": "this",
-      "time": "144.56"
+      "name": ".",
+      "time": "138.77"
    }
  ],
  "format": "1.0"
--- a/leland_transcript_processed.json
+++ b/leland_transcript_processed.json