Added Amazon converter

This commit is contained in:
2018-11-28 02:04:52 -05:00
parent d5a37df5a8
commit c9c4cbe550
8 changed files with 12813 additions and 122 deletions

BIN
.DS_Store vendored

Binary file not shown.

4915
Lelandmp3.json Normal file

File diff suppressed because it is too large Load Diff

3882
Lelandmp3_processed.json Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -21,6 +21,67 @@ from typing import Dict, Union, List
import helpers
def amazon_converter(data: dict):
    """Convert an Amazon transcript into a list of per-word dicts.

    Args:
        data: Passed straight to ``json.load``, so despite the ``dict``
            annotation it must be a readable file-like object containing
            the transcript JSON. Items are read from
            ``data['results']['items']``.

    Returns:
        A list with one dict per non-punctuation item, in input order, with
        the keys ``start``, ``end``, ``confidence``, ``word``,
        ``always_capitalized``, ``index``, ``punc_after`` and
        ``punc_before``. ``punc_after`` is ``'.'``, ``','`` or ``False``;
        ``punc_before`` is always ``False`` for this converter.

    Punctuation items are never emitted themselves; a ``'.'`` or ``','``
    directly following a word is folded into that word's ``punc_after``.

    The filler phrase "you know" is set off with commas: the word before
    "you" gets a trailing comma (unless a punctuation item already sits
    there or "you" is the first item), and "know" gets a trailing comma
    unless a punctuation item already follows it.
    """
    data = json.load(data)
    words = data['results']['items']
    # NOTE(review): tag_words is assumed to return one (word, tag) pair per
    # input item, aligned by position -- it is indexed with the item index
    # below. Confirm against helpers.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    num_words = len(words)

    converted_words = []
    punc_before = False
    index = 0
    # Comma owed to the *next* emitted word (the "know" of "you know").
    # It must live outside the loop body: resetting it on every iteration
    # would discard it before it could ever be applied.
    pending_punc_after = None

    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue

        # Claim any carried punctuation now so it can only ever apply to
        # this word and never leaks further down the transcript.
        carried_punc_after, pending_punc_after = pending_punc_after, None

        best = w['alternatives'][0]
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(best['confidence'])
        word = best['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        punc_after = False
        next_word = None
        if i < num_words - 1:
            next_item = words[i + 1]
            next_word = next_item['alternatives'][0]['content']
            if next_word == '.':
                punc_after = '.'
            elif next_word == ',':
                punc_after = ','
            elif carried_punc_after and next_item['type'] != 'punctuation':
                # Only add the carried comma when no punctuation item
                # already follows this word.
                punc_after = carried_punc_after

        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        if word.lower() == 'you' and next_word == 'know':
            # i > 0 guards the start of the transcript: words[-1] would wrap
            # around to the last item and converted_words would be empty.
            if i > 0 and words[i - 1]['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            pending_punc_after = ','

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })
        index += 1

    return converted_words
def speechmatics_converter(data: dict):
data = json.load(data)
converted_words = []
@@ -128,4 +189,5 @@ def speechmatics_aligned_text_converter(data):
# Registry mapping a transcript-format name to the function that converts it.
converters = dict(
    speechmatics=speechmatics_converter,
    speechmatics_align=speechmatics_aligned_text_converter,
    amazon=amazon_converter,
)

View File

@@ -2458,131 +2458,11 @@
"name": "texture",
"time": "138.05"
},
{
"duration": "0.39",
"confidence": "1.000",
"name": "so",
"time": "138.77"
},
{
"duration": "0.30",
"confidence": "1.000",
"name": "making",
"time": "139.16"
},
{
"duration": "0.30",
"confidence": "1.000",
"name": "sure",
"time": "139.46"
},
{
"duration": "0.15",
"confidence": "1.000",
"name": "that",
"time": "139.76"
},
{
"duration": "0.15",
"confidence": "1.000",
"name": "we",
"time": "139.91"
},
{
"duration": "0.33",
"confidence": "1.000",
"name": "have",
"time": "140.06"
},
{
"duration": "0.36",
"confidence": "1.000",
"name": "something",
"time": "140.39"
},
{
"duration": "0.12",
"confidence": "1.000",
"name": "that",
"time": "140.75"
},
{
"duration": "0.30",
"confidence": "1.000",
"name": "people",
"time": "140.87"
},
{
"duration": "0.18",
"confidence": "1.000",
"name": "don't",
"time": "141.17"
},
{
"duration": "0.21",
"confidence": "1.000",
"name": "want",
"time": "141.35"
},
{
"duration": "0.09",
"confidence": "1.000",
"name": "to",
"time": "141.56"
},
{
"duration": "0.39",
"confidence": "1.000",
"name": "eat",
"time": "141.65"
},
{
"duration": "0.21",
"confidence": "1.000",
"name": "and",
"time": "142.10"
},
{
"duration": "0.18",
"confidence": "0.560",
"name": "we'll",
"time": "142.31"
},
{
"duration": "0.39",
"confidence": "1.000",
"name": "eat",
"time": "142.49"
},
{
"duration": "0.27",
"confidence": "1.000",
"name": "to",
"time": "143.24"
},
{
"duration": "0.21",
"confidence": "1.000",
"name": "stay",
"time": "143.51"
},
{
"duration": "0.45",
"confidence": "1.000",
"name": "healthy",
"time": "143.72"
},
{
"duration": "0.15",
"confidence": "1.000",
"name": "on",
"time": "144.41"
},
{
"duration": "0.28",
"confidence": "1.000",
"name": "this",
"time": "144.56"
"name": ".",
"time": "138.77"
}
],
"format": "1.0"

File diff suppressed because it is too large Load Diff