Finished refactoring into a single repo and to an OOP design that makes adding new ASR APIs straightforward. Added Gentle support and viral_overlay JSON output. Added tests.

2019-02-06 22:28:08 -05:00
parent 84fe4d2fd4
commit e36c8ba30e
11 changed files with 347 additions and 103 deletions
--- a/converters/__init__.py
+++ b/converters/__init__.py
@@ -1,24 +0,0 @@
-"""
-
-fields for converted transcript:
-
-    start
-    end
-    word
-    confidence
-    index
-    always_capitalized
-    punc_before
-    punc_after
-
-"""
-
-from transcript_processing.converters.amazon import amazon_converter
-from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
-
-
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
--- a/converters/amazon.py
+++ b/converters/amazon.py
@@ -1,6 +1,7 @@
 import json

-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers



@@ -10,11 +11,11 @@ class AmazonConverter(TranscriptConverter):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
-        return data['results']['items']
+        return json_data['results']['items']

    def get_words(self, word_objects):
        return [self.get_word_word(w)
-                for w in word_objects])
+                for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
@@ -30,7 +31,7 @@ class AmazonConverter(TranscriptConverter):

    @staticmethod
    def get_word_word(word_object):
-        word_word = w['alternatives'][0]['content']
+        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
@@ -44,11 +45,11 @@ class AmazonConverter(TranscriptConverter):
        num_words = len(words)
        index = 0

-        for i, w in enumerate(words):
+        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
                continue
            next_word_punc_after = None
-            word_obj = self.get_word_object(w, i, tagged_words, words)
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
@@ -60,7 +61,7 @@ class AmazonConverter(TranscriptConverter):
                    next_word_punc_after = None

            if word_obj.word.lower() == 'you' and next_word == 'know':
-                prev_word = words[i - 1]
+                prev_word = word_objects[i - 1]
                if prev_word['type'] != 'punctuation':
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
@@ -83,64 +84,3 @@ class AmazonConverter(TranscriptConverter):
            punc_after = False

        return converted_words
-
-
-def amazon_converter(data: dict):
-    data = json.load(data)
-    converted_words = []
-    words = data['results']['items']
-    tagged_words = helpers.tag_words(
-        [w['alternatives'][0]['content'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        if w['type'] == 'punctuation':
-            continue
-        next_word_punc_after = None
-        word_start = float(w['start_time'])
-        word_end = float(w['end_time'])
-        confidence = float(w['alternatives'][0]['confidence'])
-        word = w['alternatives'][0]['content']
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['alternatives'][0]['content']
-            next_word_type = words[i + 1]['type']
-        if next_word == '.':
-            punc_after = '.'
-        elif next_word == ',':
-            punc_after = ','
-        elif next_word_punc_after:
-            punc_after = next_word_punc_after
-            next_word_punc_after = None
-
-        if word == 'i':
-            # weird Amazon quirk
-            word = 'I'
-
-        if word.lower() == 'you' and next_word == 'know':
-            prev_word = words[i - 1]
-            if prev_word['type'] != 'punctuation':
-                converted_words[-1]['punc_after'] = ','
-            if next_word_type != 'punctuation':
-                next_word_punc_after = ','
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
--- a/converters/gentle.py
+++ b/converters/gentle.py
@@ -0,0 +1,60 @@
+from converter import TranscriptConverter
+
+
+
+class GentleConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return word_object['start']
+
+    @staticmethod
+    def get_word_end(word_object):
+        return word_object['end']
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return 1
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['alignedWord']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun 
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
+
--- a/converters/speechmatics.py
+++ b/converters/speechmatics.py
@@ -1,10 +1,74 @@
 from collections import namedtuple
 import json

-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers


-Word = namedtuple('Word', 'start end word')
+
+class SpeechmaticsConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return float(word_object['time'])
+
+    @staticmethod
+    def get_word_end(word_object):
+        return (SpeechmaticsConverter.get_word_start(word_object) 
+                + float(word_object['duration']))
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return float(word_object['confidence'])
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['name']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            if word_obj.word == '.':
+                continue
+
+            if word_obj.next_word:
+                next_word = self.get_word_word(word_obj.next_word)
+                if next_word == '.':
+                    punc_after = '.'
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun 
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words


 def speechmatics_converter(data):
@@ -55,6 +119,8 @@ def speechmatics_aligned_text_converter(data):
    class Exhausted(Exception):
        pass

+    Word = namedtuple('Word', 'start end word')
+
    def get_time(transcript, index):
        time_index = transcript.find('time=', index)
        if time_index == -1:
@@ -108,6 +174,3 @@ def speechmatics_aligned_text_converter(data):
        })

    return converted_words
-
-
-def gentle_converter