finished refactoring to a single repo, and to OOP for straightforward addition of new ASR APIs. added Gentle, and added viral_overlay JSON output. added tests
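The core of the refactor: each ASR backend becomes a small converter subclass that only knows how to read its own JSON, while the shared conversion loop lives in a base class. Below is a minimal, self-contained sketch of that pattern; BaseConverter is a stand-in and NOT the repo's actual TranscriptConverter (whose full API is not shown in this diff), and FakeApiConverter's field names ('tokens', 'begin', 'finish', 'score', 'text') are made up for illustration.

# Sketch only: illustrates subclass-per-ASR-API with a shared conversion driver.
class BaseConverter:
    def __init__(self, json_data):
        self.json_data = json_data

    def convert(self):
        # Shared driver: map backend-specific word objects onto a common schema.
        return [{
            'start': self.get_word_start(w),
            'end': self.get_word_end(w),
            'word': self.get_word_word(w),
            'confidence': self.get_word_confidence(w),
            'index': i,
        } for i, w in enumerate(self.get_word_objects(self.json_data))]


class FakeApiConverter(BaseConverter):
    """Hypothetical backend whose JSON looks like {'tokens': [{'begin': ..., 'text': ...}]}."""

    def get_word_objects(self, json_data):
        return json_data['tokens']

    @staticmethod
    def get_word_start(word_object):
        return float(word_object['begin'])

    @staticmethod
    def get_word_end(word_object):
        return float(word_object['finish'])

    @staticmethod
    def get_word_confidence(word_object):
        return float(word_object['score'])

    @staticmethod
    def get_word_word(word_object):
        return word_object['text']


if __name__ == '__main__':
    sample = {'tokens': [{'begin': 0.0, 'finish': 0.4, 'score': 0.93, 'text': 'hello'}]}
    print(FakeApiConverter(sample).convert())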
@@ -1,24 +0,0 @@
-"""
-
-fields for converted transcript:
-
-start
-end
-word
-confidence
-index
-always_capitalized
-punc_before
-punc_after
-
-"""
-
-from transcript_processing.converters.amazon import amazon_converter
-from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
-
-
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
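For reference, one entry of the converted transcript carries exactly the fields listed in the deleted docstring above; the values here are illustrative only (types follow the converter code later in this diff: floats for times and confidence, False or a punctuation string for punc_before/punc_after).

# Illustrative converted-word entry; field names come from the docstring above.
{
    'start': 1.02,
    'end': 1.31,
    'word': 'hello',
    'confidence': 0.97,
    'index': 3,
    'always_capitalized': False,
    'punc_before': False,
    'punc_after': ',',   # False when no punctuation follows
}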
@@ -1,6 +1,7 @@
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
 
@@ -10,11 +11,11 @@ class AmazonConverter(TranscriptConverter):
         super().__init__(path, output_target)
 
     def get_word_objects(self, json_data):
-        return data['results']['items']
+        return json_data['results']['items']
 
     def get_words(self, word_objects):
         return [self.get_word_word(w)
-                for w in word_objects])
+                for w in word_objects]
 
     @staticmethod
     def get_word_start(word_object):
@@ -30,7 +31,7 @@ class AmazonConverter(TranscriptConverter):
 
     @staticmethod
    def get_word_word(word_object):
-        word_word = w['alternatives'][0]['content']
+        word_word = word_object['alternatives'][0]['content']
         if word_word == 'i':
             # weird Amazon quirk
             word_word = 'I'
@@ -44,11 +45,11 @@ class AmazonConverter(TranscriptConverter):
         num_words = len(words)
         index = 0
 
-        for i, w in enumerate(words):
+        for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
                 continue
             next_word_punc_after = None
-            word_obj = self.get_word_object(w, i, tagged_words, words)
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
 
             if word_obj.next_word:
                 next_word = self.get_word_word(word_obj.next_word)
@@ -60,7 +61,7 @@ class AmazonConverter(TranscriptConverter):
                     next_word_punc_after = None
 
             if word_obj.word.lower() == 'you' and next_word == 'know':
-                prev_word = words[i - 1]
+                prev_word = word_objects[i - 1]
                 if prev_word['type'] != 'punctuation':
                     converted_words[-1]['punc_after'] = ','
                 if next_word_type != 'punctuation':
@@ -83,64 +84,3 @@ class AmazonConverter(TranscriptConverter):
             punc_after = False
 
         return converted_words
-
-
-def amazon_converter(data: dict):
-    data = json.load(data)
-    converted_words = []
-    words = data['results']['items']
-    tagged_words = helpers.tag_words(
-        [w['alternatives'][0]['content'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        if w['type'] == 'punctuation':
-            continue
-        next_word_punc_after = None
-        word_start = float(w['start_time'])
-        word_end = float(w['end_time'])
-        confidence = float(w['alternatives'][0]['confidence'])
-        word = w['alternatives'][0]['content']
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['alternatives'][0]['content']
-            next_word_type = words[i + 1]['type']
-            if next_word == '.':
-                punc_after = '.'
-            elif next_word == ',':
-                punc_after = ','
-            elif next_word_punc_after:
-                punc_after = next_word_punc_after
-                next_word_punc_after = None
-
-        if word == 'i':
-            # weird Amazon quirk
-            word = 'I'
-
-        if word.lower() == 'you' and next_word == 'know':
-            prev_word = words[i - 1]
-            if prev_word['type'] != 'punctuation':
-                converted_words[-1]['punc_after'] = ','
-            if next_word_type != 'punctuation':
-                next_word_punc_after = ','
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
converters/gentle.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+from converter import TranscriptConverter
+
+
+
+class GentleConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return word_object['start']
+
+    @staticmethod
+    def get_word_end(word_object):
+        return word_object['end']
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return 1
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['alignedWord']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
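For context: the GentleConverter above reads a top-level 'words' list from Gentle's alignment JSON and uses only 'start', 'end', and 'alignedWord' from each entry, hard-coding confidence to 1 since no confidence is read. A roughly-shaped example entry, showing only the keys this converter touches, with illustrative values:

# Illustrative Gentle word entry; only the keys read by GentleConverter are shown.
{
    'start': 4.37,
    'end': 4.58,
    'alignedWord': 'hello',
}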
@@ -1,10 +1,74 @@
 from collections import namedtuple
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
-Word = namedtuple('Word', 'start end word')
+
+class SpeechmaticsConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return float(word_object['time'])
+
+    @staticmethod
+    def get_word_end(word_object):
+        return (SpeechmaticsConverter.get_word_start(word_object)
+                + float(word_object['duration']))
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return float(word_object['confidence'])
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['name']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            if word_obj.word == '.':
+                continue
+
+            if word_obj.next_word:
+                next_word = self.get_word_word(word_obj.next_word)
+                if next_word == '.':
+                    punc_after = '.'
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
 
 
 def speechmatics_converter(data):
@@ -55,6 +119,8 @@ def speechmatics_aligned_text_converter(data):
     class Exhausted(Exception):
         pass
 
+    Word = namedtuple('Word', 'start end word')
+
     def get_time(transcript, index):
         time_index = transcript.find('time=', index)
         if time_index == -1:
@@ -108,6 +174,3 @@ def speechmatics_aligned_text_converter(data):
         })
 
     return converted_words
-
-
-def gentle_converter