Brought over changes from when transcript_processing was nested inside transcribely's back_end package. Started refactoring the converters into an OOP design.

This commit is contained in:
2019-02-06 20:57:21 -05:00
parent c9c4cbe550
commit 84fe4d2fd4
19 changed files with 277 additions and 135802 deletions

24
converters/__init__.py Normal file
View File

@@ -0,0 +1,24 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
from transcript_processing.converters.amazon import amazon_converter
from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
# Registry mapping a service identifier (as used by callers selecting a
# transcription backend) to its converter callable.
converters = dict(
    speechmatics=speechmatics_converter,
    speechmatics_align=speechmatics_aligned_text_converter,
    amazon=amazon_converter,
)

146
converters/amazon.py Normal file
View File

@@ -0,0 +1,146 @@
import json
from transcript_processing import helpers
class AmazonConverter(TranscriptConverter):
    """Converter for Amazon Transcribe JSON output.

    Produces word dicts with the common fields: start, end, word,
    confidence, index, always_capitalized, punc_before, punc_after.

    NOTE(review): relies on TranscriptConverter (not visible in this file)
    for construction plumbing and ``get_word_object`` — confirm those
    contracts against the base class.
    """

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        """Return the raw per-token items from a parsed Transcribe result."""
        # BUG FIX: originally referenced undefined name `data` instead of
        # the `json_data` parameter.
        return json_data['results']['items']

    def get_words(self, word_objects):
        """Return the plain word text for each raw token."""
        # BUG FIX: original line had an unbalanced closing parenthesis.
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        """Start time of the token, in seconds."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """End time of the token, in seconds."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Recognition confidence of the token's top alternative."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Text of the token's top alternative, with 'i' normalized to 'I'."""
        # BUG FIX: originally referenced undefined name `w` instead of
        # the `word_object` parameter.
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Convert raw Transcribe tokens into the common word-dict format.

        Punctuation tokens are folded into the neighbouring word's
        ``punc_after`` rather than emitted as entries of their own.
        """
        converted_words = []
        punc_before = False
        punc_after = False
        index = 0
        # BUG FIX: this was reset to None at the top of every iteration,
        # so the comma queued by the "you know" rule below could never be
        # applied; it must survive into the following word's iteration.
        next_word_punc_after = None
        for i, w in enumerate(words):
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, words)
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ['.', ',']:
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None
                if word_obj.word.lower() == 'you' and next_word == 'know':
                    prev_word = words[i - 1]
                    if prev_word['type'] != 'punctuation':
                        # attach a comma to the word preceding "you"
                        converted_words[-1]['punc_after'] = ','
                    if next_word_type != 'punctuation':
                        # queue a comma to follow "know"
                        next_word_punc_after = ','
            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
            index += 1
            punc_after = False
        return converted_words
def amazon_converter(data):
    """Convert an Amazon Transcribe result file into the common word format.

    Args:
        data: a readable file-like object containing the Amazon Transcribe
            JSON output. (The original annotation said ``dict``, but
            ``json.load`` requires a file-like object.)

    Returns:
        list of dicts with keys: start, end, word, confidence, index,
        always_capitalized, punc_before, punc_after. Punctuation tokens
        are folded into the previous word's ``punc_after``.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    # Tag every token (punctuation included) so tag indices stay aligned
    # with `words` indices.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
    index = 0
    # BUG FIX: this was reset to None at the top of every iteration, so
    # the comma queued by the "you know" rule below was silently dropped;
    # it must persist into the following word's iteration.
    next_word_punc_after = None
    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        next_word = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
        if next_word == '.':
            punc_after = '.'
        elif next_word == ',':
            punc_after = ','
        elif next_word_punc_after:
            punc_after = next_word_punc_after
            next_word_punc_after = None
        if word == 'i':
            # weird Amazon quirk
            word = 'I'
        if word.lower() == 'you' and next_word == 'know':
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                # attach a comma to the word preceding "you"
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                # queue a comma to follow "know"
                next_word_punc_after = ','
        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })
        index += 1
        punc_after = False
    return converted_words

113
converters/speechmatics.py Normal file
View File

@@ -0,0 +1,113 @@
from collections import namedtuple
import json
from transcript_processing import helpers
# Lightweight record for a single aligned word: start/end times (seconds)
# and the word text.
Word = namedtuple('Word', ['start', 'end', 'word'])
def speechmatics_converter(data):
    """Convert raw Speechmatics JSON output into the common word format.

    Args:
        data: readable file-like object containing the Speechmatics JSON.

    Returns:
        list of word dicts (start, end, confidence, word,
        always_capitalized, index, punc_after, punc_before); '.' tokens
        are folded into the preceding word's ``punc_after``.
    """
    payload = json.load(data)
    raw_words = payload['words']
    # Tag every token so tag indices line up with raw_words indices.
    tags = helpers.tag_words([entry['name'] for entry in raw_words])
    last = len(raw_words) - 1
    converted = []
    out_index = 0
    for pos, entry in enumerate(raw_words):
        token = entry['name']
        if token == '.':
            # period tokens become punc_after on the previous word
            continue
        begin = float(entry['time'])
        following = raw_words[pos + 1]['name'] if pos < last else None
        converted.append({
            'start': begin,
            'end': begin + float(entry['duration']),
            'confidence': float(entry['confidence']),
            'word': token,
            'always_capitalized': (
                tags[pos][1] in helpers.PROPER_NOUN_TAGS or token == 'I'),
            'index': out_index,
            'punc_after': '.' if following == '.' else False,
            'punc_before': False,
        })
        out_index += 1
    return converted
def speechmatics_aligned_text_converter(data):
    """Convert Speechmatics aligned-text output into the common word format.

    The aligned-text format embeds timestamps inline as ``<time=1.23>``
    markers around each word; this parser walks those markers.

    Args:
        data: readable file-like object holding the aligned transcript.
            NOTE(review): only the first line is read — confirm the
            aligned output is always a single line.

    Returns:
        list of word dicts; confidence is fixed at 1 since alignment
        carries no per-word confidence.
    """
    data = data.readlines()[0]

    class Exhausted(Exception):
        """Raised when no further time= marker exists in the transcript."""
        pass

    def get_time(transcript, index):
        # Locate the next `time=<float>` attribute at/after `index` and
        # return (value, index of its closing '>').
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        # A word is the text between its start marker's '>' and the '<'
        # opening its end marker.
        start, end_of_start_index = get_time(transcript, start_index)
        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        word = transcript[word_start_index: word_end_index]
        end, close_index = get_time(transcript, word_end_index)
        return Word(start, end, word), close_index

    words = []
    next_index = 0
    while True:
        try:
            word, next_index = find_next_word(data, next_index)
        except Exhausted:
            break
        else:
            words.append(word)
    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []
    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)
        the_word = word.word
        if punc_before or punc_after:
            # strip the punctuation out of the word text itself
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')
        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # BUG FIX: original compared the Word namedtuple itself to 'I'
            # (always False); compare the cleaned word text, matching the
            # other converters.
            'always_capitalized': is_proper_noun or the_word == 'I',
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })
    return converted_words
def gentle_converter