Brought over changes from when transcript_processing was nested inside transcribely's back_end package. Started refactoring the converters into an OOP design.

This commit is contained in:
2019-02-06 20:57:21 -05:00
parent c9c4cbe550
commit 84fe4d2fd4
19 changed files with 277 additions and 135802 deletions

24
converters/__init__.py Normal file
View File

@@ -0,0 +1,24 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
from transcript_processing.converters.amazon import amazon_converter
from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
# Registry mapping a service identifier (as used by callers selecting a
# transcription backend) to its converter callable.
converters = dict(
    speechmatics=speechmatics_converter,
    speechmatics_align=speechmatics_aligned_text_converter,
    amazon=amazon_converter,
)

146
converters/amazon.py Normal file
View File

@@ -0,0 +1,146 @@
import json
from transcript_processing import helpers
class AmazonConverter(TranscriptConverter):
    """Converter for Amazon Transcribe JSON output.

    Produces word dicts with the common fields: start, end, word,
    confidence, index, always_capitalized, punc_before, punc_after.

    NOTE(review): relies on TranscriptConverter (not visible in this file)
    for construction plumbing and ``get_word_object`` — confirm those
    contracts against the base class.
    """

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        """Return the raw per-token items from a parsed Transcribe result."""
        # BUG FIX: originally referenced undefined name `data` instead of
        # the `json_data` parameter.
        return json_data['results']['items']

    def get_words(self, word_objects):
        """Return the plain word text for each raw token."""
        # BUG FIX: original line had an unbalanced closing parenthesis.
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        """Start time of the token, in seconds."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """End time of the token, in seconds."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Recognition confidence of the token's top alternative."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Text of the token's top alternative, with 'i' normalized to 'I'."""
        # BUG FIX: originally referenced undefined name `w` instead of
        # the `word_object` parameter.
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Convert raw Transcribe tokens into the common word-dict format.

        Punctuation tokens are folded into the neighbouring word's
        ``punc_after`` rather than emitted as entries of their own.
        """
        converted_words = []
        punc_before = False
        punc_after = False
        index = 0
        # BUG FIX: this was reset to None at the top of every iteration,
        # so the comma queued by the "you know" rule below could never be
        # applied; it must survive into the following word's iteration.
        next_word_punc_after = None
        for i, w in enumerate(words):
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, words)
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ['.', ',']:
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None
                if word_obj.word.lower() == 'you' and next_word == 'know':
                    prev_word = words[i - 1]
                    if prev_word['type'] != 'punctuation':
                        # attach a comma to the word preceding "you"
                        converted_words[-1]['punc_after'] = ','
                    if next_word_type != 'punctuation':
                        # queue a comma to follow "know"
                        next_word_punc_after = ','
            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
            index += 1
            punc_after = False
        return converted_words
def amazon_converter(data):
    """Convert an Amazon Transcribe result file into the common word format.

    Args:
        data: a readable file-like object containing the Amazon Transcribe
            JSON output. (The original annotation said ``dict``, but
            ``json.load`` requires a file-like object.)

    Returns:
        list of dicts with keys: start, end, word, confidence, index,
        always_capitalized, punc_before, punc_after. Punctuation tokens
        are folded into the previous word's ``punc_after``.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    # Tag every token (punctuation included) so tag indices stay aligned
    # with `words` indices.
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
    index = 0
    # BUG FIX: this was reset to None at the top of every iteration, so
    # the comma queued by the "you know" rule below was silently dropped;
    # it must persist into the following word's iteration.
    next_word_punc_after = None
    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        next_word = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
        if next_word == '.':
            punc_after = '.'
        elif next_word == ',':
            punc_after = ','
        elif next_word_punc_after:
            punc_after = next_word_punc_after
            next_word_punc_after = None
        if word == 'i':
            # weird Amazon quirk
            word = 'I'
        if word.lower() == 'you' and next_word == 'know':
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                # attach a comma to the word preceding "you"
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                # queue a comma to follow "know"
                next_word_punc_after = ','
        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })
        index += 1
        punc_after = False
    return converted_words

113
converters/speechmatics.py Normal file
View File

@@ -0,0 +1,113 @@
from collections import namedtuple
import json
from transcript_processing import helpers
# Lightweight record for a single aligned word: start/end times (seconds)
# and the word text.
Word = namedtuple('Word', ['start', 'end', 'word'])
def speechmatics_converter(data):
    """Convert raw Speechmatics JSON output into the common word format.

    Args:
        data: readable file-like object containing the Speechmatics JSON.

    Returns:
        list of word dicts (start, end, confidence, word,
        always_capitalized, index, punc_after, punc_before); '.' tokens
        are folded into the preceding word's ``punc_after``.
    """
    payload = json.load(data)
    raw_words = payload['words']
    # Tag every token so tag indices line up with raw_words indices.
    tags = helpers.tag_words([entry['name'] for entry in raw_words])
    last = len(raw_words) - 1
    converted = []
    out_index = 0
    for pos, entry in enumerate(raw_words):
        token = entry['name']
        if token == '.':
            # period tokens become punc_after on the previous word
            continue
        begin = float(entry['time'])
        following = raw_words[pos + 1]['name'] if pos < last else None
        converted.append({
            'start': begin,
            'end': begin + float(entry['duration']),
            'confidence': float(entry['confidence']),
            'word': token,
            'always_capitalized': (
                tags[pos][1] in helpers.PROPER_NOUN_TAGS or token == 'I'),
            'index': out_index,
            'punc_after': '.' if following == '.' else False,
            'punc_before': False,
        })
        out_index += 1
    return converted
def speechmatics_aligned_text_converter(data):
    """Convert Speechmatics aligned-text output into the common word format.

    The aligned-text format embeds timestamps inline as ``<time=1.23>``
    markers around each word; this parser walks those markers.

    Args:
        data: readable file-like object holding the aligned transcript.
            NOTE(review): only the first line is read — confirm the
            aligned output is always a single line.

    Returns:
        list of word dicts; confidence is fixed at 1 since alignment
        carries no per-word confidence.
    """
    data = data.readlines()[0]

    class Exhausted(Exception):
        """Raised when no further time= marker exists in the transcript."""
        pass

    def get_time(transcript, index):
        # Locate the next `time=<float>` attribute at/after `index` and
        # return (value, index of its closing '>').
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        # A word is the text between its start marker's '>' and the '<'
        # opening its end marker.
        start, end_of_start_index = get_time(transcript, start_index)
        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        word = transcript[word_start_index: word_end_index]
        end, close_index = get_time(transcript, word_end_index)
        return Word(start, end, word), close_index

    words = []
    next_index = 0
    while True:
        try:
            word, next_index = find_next_word(data, next_index)
        except Exhausted:
            break
        else:
            words.append(word)
    tagged_words = helpers.tag_words([w.word for w in words])
    converted_words = []
    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)
        the_word = word.word
        if punc_before or punc_after:
            # strip the punctuation out of the word text itself
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')
        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # BUG FIX: original compared the Word namedtuple itself to 'I'
            # (always False); compare the cleaned word text, matching the
            # other converters.
            'always_capitalized': is_proper_noun or the_word == 'I',
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })
    return converted_words
def gentle_converter