added amazon converter
This commit is contained in:
4915
Lelandmp3.json
Normal file
4915
Lelandmp3.json
Normal file
File diff suppressed because it is too large
Load Diff
3882
Lelandmp3_processed.json
Normal file
3882
Lelandmp3_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
@@ -21,6 +21,67 @@ from typing import Dict, Union, List
|
||||
import helpers
|
||||
|
||||
|
||||
def amazon_converter(data: dict):
    """Convert an Amazon Transcribe JSON transcript into a list of word dicts.

    ``data`` is passed straight to ``json.load``, so despite the ``dict``
    annotation (kept so the signature matches ``speechmatics_converter``) it
    must be an open file-like object. The parsed JSON is expected to hold the
    transcript items under ``results -> items``.

    Punctuation items are not emitted as entries of their own. A ``.`` or
    ``,`` item that directly follows a word is recorded in that word's
    ``punc_after``. The filler phrase "you know" is additionally set off with
    commas: one after the word preceding "you" and one after "know", unless
    punctuation is already present at that position.

    Returns:
        A list of dicts, one per non-punctuation item, with the keys
        ``start``, ``end``, ``confidence``, ``word``, ``always_capitalized``,
        ``index``, ``punc_after`` and ``punc_before``. ``punc_after`` is
        ``'.'``, ``','`` or ``False``; ``punc_before`` is always ``False``.
    """
    items = json.load(data)['results']['items']
    # Presumably returns one (word, tag) pair per input token, in order;
    # it is indexed below with the position of the item in ``items``.
    tagged_words = helpers.tag_words(
        [item['alternatives'][0]['content'] for item in items])
    last_position = len(items) - 1

    converted_words = []
    # Set while handling "you" of "you know": the closing comma belongs after
    # the *next* emitted word ("know"), so the flag has to survive into the
    # following iteration instead of being reset at the top of the loop.
    pending_comma = False

    for i, item in enumerate(items):
        if item['type'] == 'punctuation':
            continue

        best = item['alternatives'][0]
        word = best['content']
        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        next_word = None
        next_is_punctuation = False
        if i < last_position:
            following = items[i + 1]
            next_word = following['alternatives'][0]['content']
            next_is_punctuation = following['type'] == 'punctuation'

        punc_after = False
        if next_word in ('.', ','):
            punc_after = next_word
        elif pending_comma and next_word is not None and not next_is_punctuation:
            # Close the "you know" aside, but never stack a comma on top of
            # punctuation that already follows this word.
            punc_after = ','
        # The pending comma applies to exactly one word; drop it either way so
        # it cannot leak onto a later, unrelated word.
        pending_comma = False

        if word.lower() == 'you' and next_word == 'know':
            # Open the aside with a comma after the previous emitted word.
            # ``i > 0`` stops ``items[i - 1]`` from wrapping around to the last
            # item, and the emptiness check avoids indexing an empty result
            # list when the transcript starts with "you know".
            if (i > 0 and converted_words
                    and items[i - 1]['type'] != 'punctuation'):
                converted_words[-1]['punc_after'] = ','
            if not next_is_punctuation:
                pending_comma = True

        converted_words.append({
            'start': float(item['start_time']),
            'end': float(item['end_time']),
            'confidence': float(best['confidence']),
            'word': word,
            'always_capitalized': (
                tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
                or word == 'I'),
            # Position among emitted words only (punctuation is skipped).
            'index': len(converted_words),
            'punc_after': punc_after,
            'punc_before': False,
        })

    return converted_words
|
||||
|
||||
|
||||
def speechmatics_converter(data: dict):
|
||||
data = json.load(data)
|
||||
converted_words = []
|
||||
@@ -128,4 +189,5 @@ def speechmatics_aligned_text_converter(data):
|
||||
# Lookup table from transcript-source name to the converter function that
# handles that source's JSON format.
converters = {
    'speechmatics': speechmatics_converter,
    'speechmatics_align': speechmatics_aligned_text_converter,
    'amazon': amazon_converter,
}
|
||||
|
||||
@@ -2458,131 +2458,11 @@
|
||||
"name": "texture",
|
||||
"time": "138.05"
|
||||
},
|
||||
{
|
||||
"duration": "0.39",
|
||||
"confidence": "1.000",
|
||||
"name": "so",
|
||||
"time": "138.77"
|
||||
},
|
||||
{
|
||||
"duration": "0.30",
|
||||
"confidence": "1.000",
|
||||
"name": "making",
|
||||
"time": "139.16"
|
||||
},
|
||||
{
|
||||
"duration": "0.30",
|
||||
"confidence": "1.000",
|
||||
"name": "sure",
|
||||
"time": "139.46"
|
||||
},
|
||||
{
|
||||
"duration": "0.15",
|
||||
"confidence": "1.000",
|
||||
"name": "that",
|
||||
"time": "139.76"
|
||||
},
|
||||
{
|
||||
"duration": "0.15",
|
||||
"confidence": "1.000",
|
||||
"name": "we",
|
||||
"time": "139.91"
|
||||
},
|
||||
{
|
||||
"duration": "0.33",
|
||||
"confidence": "1.000",
|
||||
"name": "have",
|
||||
"time": "140.06"
|
||||
},
|
||||
{
|
||||
"duration": "0.36",
|
||||
"confidence": "1.000",
|
||||
"name": "something",
|
||||
"time": "140.39"
|
||||
},
|
||||
{
|
||||
"duration": "0.12",
|
||||
"confidence": "1.000",
|
||||
"name": "that",
|
||||
"time": "140.75"
|
||||
},
|
||||
{
|
||||
"duration": "0.30",
|
||||
"confidence": "1.000",
|
||||
"name": "people",
|
||||
"time": "140.87"
|
||||
},
|
||||
{
|
||||
"duration": "0.18",
|
||||
"confidence": "1.000",
|
||||
"name": "don't",
|
||||
"time": "141.17"
|
||||
},
|
||||
{
|
||||
"duration": "0.21",
|
||||
"confidence": "1.000",
|
||||
"name": "want",
|
||||
"time": "141.35"
|
||||
},
|
||||
{
|
||||
"duration": "0.09",
|
||||
"confidence": "1.000",
|
||||
"name": "to",
|
||||
"time": "141.56"
|
||||
},
|
||||
{
|
||||
"duration": "0.39",
|
||||
"confidence": "1.000",
|
||||
"name": "eat",
|
||||
"time": "141.65"
|
||||
},
|
||||
{
|
||||
"duration": "0.21",
|
||||
"confidence": "1.000",
|
||||
"name": "and",
|
||||
"time": "142.10"
|
||||
},
|
||||
{
|
||||
"duration": "0.18",
|
||||
"confidence": "0.560",
|
||||
"name": "we'll",
|
||||
"time": "142.31"
|
||||
},
|
||||
{
|
||||
"duration": "0.39",
|
||||
"confidence": "1.000",
|
||||
"name": "eat",
|
||||
"time": "142.49"
|
||||
},
|
||||
{
|
||||
"duration": "0.27",
|
||||
"confidence": "1.000",
|
||||
"name": "to",
|
||||
"time": "143.24"
|
||||
},
|
||||
{
|
||||
"duration": "0.21",
|
||||
"confidence": "1.000",
|
||||
"name": "stay",
|
||||
"time": "143.51"
|
||||
},
|
||||
{
|
||||
"duration": "0.45",
|
||||
"confidence": "1.000",
|
||||
"name": "healthy",
|
||||
"time": "143.72"
|
||||
},
|
||||
{
|
||||
"duration": "0.15",
|
||||
"confidence": "1.000",
|
||||
"name": "on",
|
||||
"time": "144.41"
|
||||
},
|
||||
{
|
||||
"duration": "0.28",
|
||||
"confidence": "1.000",
|
||||
"name": "this",
|
||||
"time": "144.56"
|
||||
"name": ".",
|
||||
"time": "138.77"
|
||||
}
|
||||
],
|
||||
"format": "1.0"
|
||||
|
||||
3952
leland_transcript_processed.json
Normal file
3952
leland_transcript_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user