Made Google work again after a change in the formatting of the saved transcript. Added speaker_id support to the base class and Google.

2019-03-07 22:55:37 -05:00
parent ed2ece3dfb
commit 3fc6dacfde
9 changed files with 75 additions and 62 deletions
--- a/README.md
+++ b/README.md
@@ -53,3 +53,4 @@ formats.
 - Word (`.doc`, `.docx`)
 - text files
 - SRT (subtitles)
 - Draft.js JSON
--- a/README_PYPI.md
+++ b/README_PYPI.md
@@ -111,3 +111,4 @@ formats.
 - Word (`.doc`, `.docx`)
 - text files
 - SRT (subtitles)
 - Draft.js JSON
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
 setup(
    name="tpro",
-    version="0.11",
+    version="0.12",
    url='https://github.com/zevaverbach/tpro',
    install_requires=[
        'Click',
--- a/tests/test_conversion.py
+++ b/tests/test_conversion.py
@@ -79,13 +79,14 @@ def test_google():
    g = GoogleConverter(transcript_data)
    g.convert()
    print(g.converted_words[0])
    assert g.converted_words[0] == {
-            'start': 4,
+        'start': 0.4, 
-            'end': 5.5,
+        'end': 2.1,
-            'confidence': 0.88,
+        'confidence': 0.9128385782241821,
-            'word': 'Testing',
+        'word': 'Okay',
-            'always_capitalized': False,
+        'always_capitalized': False,
-            'punc_after': [','],
+        'punc_after': [','],
-            'punc_before': False,
+        'punc_before': False,
-            }
+        'speaker_id': None
-
+        }
--- a/tests/test_convert_google.py
+++ b/tests/test_convert_google.py
@@ -27,6 +27,7 @@ def test_pre_process(transcript):
    g = GoogleConverter(transcript_data)
    assert g.json_data
    print(g.json_data)
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -7,7 +7,9 @@ from . import converters
-Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
+Word = namedtuple(
        'Word', 
        'start end confidence word always_capitalized next_word speaker_id')
 class TranscriptConverter:
@@ -58,6 +60,11 @@ class TranscriptConverter:
    def get_word_confidence(word_object):
        pass
    @staticmethod
    @abc.abstractmethod
    def get_speaker_id(word_object):
        pass
    @staticmethod
    @abc.abstractmethod
    def get_word_word(word_object):
@@ -78,7 +85,8 @@ class TranscriptConverter:
            self.get_word_confidence(word_object),
            word,
            self.check_if_always_capitalized(word, index, tagged_words),
-            self.get_next_word(word_objects, index)
+            self.get_next_word(word_objects, index),
            self.get_speaker_id(word_object),
                ) 
    def get_next_word(self, word_objects, index):
--- a/transcript_processing/converters/amazon.py
+++ b/transcript_processing/converters/amazon.py
@@ -35,6 +35,10 @@ class AmazonConverter(TranscriptConverter):
            word_word = 'I'
        return word_word
    @staticmethod
    def get_speaker_id(word_object):
        return None
    def convert_words(self, word_objects, words, tagged_words=None):
        converted_words = []
@@ -74,6 +78,7 @@ class AmazonConverter(TranscriptConverter):
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
                'speaker_id': word_obj.speaker_id,
            })
            punc_after = False
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -14,7 +14,12 @@ class GoogleConverter(TranscriptConverter):
    def pre_process(self, transcript_data):
        friendly = make_json_friendly(transcript_data)
-        return json.loads(friendly)
+        json_data = json.loads(friendly)
        last_datum = json_data[-1]
        if last_datum.get('speaker_tag'):
            """Get rid of duplicate content that doesn't have speaker_tags"""
            json_data = [jd for jd in json_data if jd.get('speaker_tag')]
        return json_data
    def get_word_objects(self, json_data):
        return json_data
@@ -47,6 +52,7 @@ class GoogleConverter(TranscriptConverter):
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
                'speaker_id': word_obj.speaker_id,
            })
        return converted_words
@@ -74,13 +80,13 @@ class GoogleConverter(TranscriptConverter):
    @staticmethod
    def get_word_word(word_object):
        print(word_object)
        return word_object['word']
 def make_json_friendly(json_string):
-    lines = [line.strip() for line in json_string.split('\\n')]
+    lines = [line.strip() for line in json_string.split('\n')]
    new_string = '['
    fields = [
        'words {', 
@@ -92,54 +98,41 @@ def make_json_friendly(json_string):
        'confidence: '
        ]
-    current_field_index = 0
+    start_field = 'words {'
    new_string = ''
-    for line in lines:
+    open_braces = 0
-        current_field = fields[current_field_index]
+    for index, line in enumerate(lines):
        if open_braces == 0:
            if start_field in line:
                open_braces = 1
                new_string += '{'
            continue
-        if current_field in line:
+        if '{' in line:
-            if current_field_index == len(fields) - 1:
+            open_braces += 1
-               current_field_index = 0
+        if '}' in line:
-            else:
+            open_braces -= 1
                current_field_index += 1
                if current_field_index == 1:
                    new_string += '}, {'
                    # "words" was found, don't want to append that
                    continue
-        else:
+        if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]:
            if current_field_index == 0:
                # haven't found the beginning of the next word object
                continue
        # add quotes around keys
        line = re.sub('^(?!")([0-9a-zA-Z_]+)', 
                        '"\\1"', 
                        line)
        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')
        # use first two decimals of confidence
        if 'confidence' in current_field:
            line = ', ' + line
            line = line[:20]
        if current_field == '}':
            line = line + ', '
        if open_braces == 0:
            new_string += '}, '
            continue
        line = re.sub('^(?!")([0-9a-zA-Z_]+)',
                '"\\1"',
                line)
        if 'start_time' in line:
            line = line.replace('"start_time"', '"start_time":')
        if 'end_time' in line:
            line = line.replace('"end_time"', '"end_time":')
        new_string += line
-    # cleanup
+    new_string = new_string.replace('\\', '')
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    new_string = new_string.replace(', }', '}').replace('\\', '')
-    return new_string
+
    return new_string[:-2] + ']'
--- a/transcript_processing/outputs.py
+++ b/transcript_processing/outputs.py
@@ -4,11 +4,14 @@ def universal_transcript(self, pretty=False):
    return json.dumps(self.converted_words, indent=4 if pretty else None)
 def viral_overlay(self, pretty=False):
-    return json.dumps([{
+    return json.dumps([
-  'start': word['start'],
+        {'start': word['start'],
-  'stop': word['end'],
+         'stop': word['end'],
-  'text': word['word'].title() if word['always_capitalized'] else word['word']}
+         'text': word['word'].title() 
             if word['always_capitalized'] else word['word']
        }
-              for word in self.converted_words], indent=4 if pretty else None
+                       for word in self.converted_words]
                       , indent=4 if pretty else None
            )