Made Google work again after a change in the formatting of the saved transcript. Added speaker_id support to the base class and the Google converter.

2019-03-07 22:55:37 -05:00
parent ed2ece3dfb
commit 3fc6dacfde
9 changed files with 75 additions and 62 deletions
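Roughly what the change buys at a call site, per the test and converter hunks below. This is a sketch, not code from the diff; the tpro.converters import path and the file name are assumptions.

from tpro.converters import GoogleConverter  # import path assumed, not shown in this diff

# transcript_data is the text Google saved to disk; its new on-disk formatting
# is what make_json_friendly (further down) was reworked to parse.
with open('saved_google_transcript.txt') as fin:  # file name is hypothetical
    transcript_data = fin.read()

g = GoogleConverter(transcript_data)
g.convert()

# Every converted word dict now carries a 'speaker_id' key; per the updated
# test it is None when the engine supplied no diarization info.
print(g.converted_words[0]['speaker_id'])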

View File

@@ -53,3 +53,4 @@ formats.
- Word (`.doc`, `.docx`)
- text files
- SRT (subtitles)
- Draft.js JSON

View File

@@ -111,3 +111,4 @@ formats.
- Word (`.doc`, `.docx`)
- text files
- SRT (subtitles)
- Draft.js JSON

View File

@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
setup(
name="tpro",
version="0.11",
version="0.12",
url='https://github.com/zevaverbach/tpro',
install_requires=[
'Click',

View File

@@ -79,13 +79,14 @@ def test_google():
g = GoogleConverter(transcript_data)
g.convert()
print(g.converted_words[0])
assert g.converted_words[0] == {
'start': 4,
'end': 5.5,
'confidence': 0.88,
'word': 'Testing',
'always_capitalized': False,
'punc_after': [','],
'punc_before': False,
}
'start': 0.4,
'end': 2.1,
'confidence': 0.9128385782241821,
'word': 'Okay',
'always_capitalized': False,
'punc_after': [','],
'punc_before': False,
'speaker_id': None
}

View File

@@ -27,6 +27,7 @@ def test_pre_process(transcript):
g = GoogleConverter(transcript_data)
assert g.json_data
print(g.json_data)

View File

@@ -7,7 +7,9 @@ from . import converters
Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
Word = namedtuple(
'Word',
'start end confidence word always_capitalized next_word speaker_id')
class TranscriptConverter:
@@ -58,6 +60,11 @@ class TranscriptConverter:
def get_word_confidence(word_object):
pass
@staticmethod
@abc.abstractmethod
def get_speaker_id(word_object):
pass
@staticmethod
@abc.abstractmethod
def get_word_word(word_object):
@@ -78,7 +85,8 @@ class TranscriptConverter:
self.get_word_confidence(word_object),
word,
self.check_if_always_capitalized(word, index, tagged_words),
self.get_next_word(word_objects, index)
self.get_next_word(word_objects, index),
self.get_speaker_id(word_object),
)
def get_next_word(self, word_objects, index):
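The extra field is why both converters below have to change too: Word now takes seven positional values, so any call site still passing six would raise a TypeError. A minimal, self-contained illustration, with values borrowed from the updated test:

from collections import namedtuple

# Same field list as the updated base class above.
Word = namedtuple(
    'Word',
    'start end confidence word always_capitalized next_word speaker_id')

# Seven values are now required; the old six-value call would raise TypeError.
w = Word(0.4, 2.1, 0.9128385782241821, 'Okay', False, None, None)
print(w.speaker_id)  # None here, mirroring the Amazon converter's default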

View File

@@ -35,6 +35,10 @@ class AmazonConverter(TranscriptConverter):
word_word = 'I'
return word_word
@staticmethod
def get_speaker_id(word_object):
return None
def convert_words(self, word_objects, words, tagged_words=None):
converted_words = []
@@ -74,6 +78,7 @@ class AmazonConverter(TranscriptConverter):
tagged_words),
'punc_after': punc_after,
'punc_before': punc_before,
'speaker_id': word_obj.speaker_id,
})
punc_after = False

View File

@@ -14,7 +14,12 @@ class GoogleConverter(TranscriptConverter):
def pre_process(self, transcript_data):
friendly = make_json_friendly(transcript_data)
return json.loads(friendly)
json_data = json.loads(friendly)
last_datum = json_data[-1]
if last_datum.get('speaker_tag'):
"""Get rid of duplicate content that doesn't have speaker_tags"""
json_data = [jd for jd in json_data if jd.get('speaker_tag')]
return json_data
def get_word_objects(self, json_data):
return json_data
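The new pre_process branch assumes that when Google's diarization is on, the saved transcript repeats every word, first without a speaker_tag and then again with one, and that only the tagged copies should survive. A small illustration of that filter; the sample dicts are made up:

# Made-up parsed json_data: each word appears once untagged, then again tagged.
json_data = [
    {'word': 'Okay', 'start_time': 0.4, 'end_time': 2.1},
    {'word': 'testing', 'start_time': 2.1, 'end_time': 2.6},
    {'word': 'Okay', 'start_time': 0.4, 'end_time': 2.1, 'speaker_tag': 1},
    {'word': 'testing', 'start_time': 2.1, 'end_time': 2.6, 'speaker_tag': 1},
]

# Same logic as the new branch above: if the trailing entries carry
# speaker_tags, keep only those and drop the untagged duplicates.
if json_data[-1].get('speaker_tag'):
    json_data = [jd for jd in json_data if jd.get('speaker_tag')]

print(len(json_data))  # 2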
@@ -47,6 +52,7 @@ class GoogleConverter(TranscriptConverter):
tagged_words),
'punc_after': punc_after,
'punc_before': punc_before,
'speaker_id': word_obj.speaker_id,
})
return converted_words
@@ -74,13 +80,13 @@ class GoogleConverter(TranscriptConverter):
@staticmethod
def get_word_word(word_object):
print(word_object)
return word_object['word']
def make_json_friendly(json_string):
lines = [line.strip() for line in json_string.split('\\n')]
lines = [line.strip() for line in json_string.split('\n')]
new_string = '['
fields = [
'words {',
@@ -92,54 +98,41 @@ def make_json_friendly(json_string):
'confidence: '
]
current_field_index = 0
new_string = ''
start_field = 'words {'
for line in lines:
open_braces = 0
current_field = fields[current_field_index]
for index, line in enumerate(lines):
if open_braces == 0:
if start_field in line:
open_braces = 1
new_string += '{'
continue
if current_field in line:
if current_field_index == len(fields) - 1:
current_field_index = 0
else:
current_field_index += 1
if current_field_index == 1:
new_string += '}, {'
# "words" was found, don't want to append that
continue
if '{' in line:
open_braces += 1
if '}' in line:
open_braces -= 1
else:
if current_field_index == 0:
# haven't found the beginning of the next word object
continue
# add quotes around keys
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
'"\\1"',
line)
# add colons after keys
if line.endswith('{'):
line = line.replace('" ', '": ')
# use first two decimals of confidence
if 'confidence' in current_field:
line = ', ' + line
line = line[:20]
if current_field == '}':
if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]:
line = line + ', '
if open_braces == 0:
new_string += '}, '
continue
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
'"\\1"',
line)
if 'start_time' in line:
line = line.replace('"start_time"', '"start_time":')
if 'end_time' in line:
line = line.replace('"end_time"', '"end_time":')
new_string += line
# cleanup
if new_string.startswith('}, '):
new_string = new_string[3:]
if not new_string.startswith('['):
new_string = '[' + new_string
if not new_string.endswith('}]'):
new_string = new_string + '}]'
new_string = new_string.replace(', }', '}').replace('\\', '')
new_string = new_string.replace('\\', '')
return new_string
return new_string[:-2] + ']'

View File

@@ -4,11 +4,14 @@ def universal_transcript(self, pretty=False):
return json.dumps(self.converted_words, indent=4 if pretty else None)
def viral_overlay(self, pretty=False):
return json.dumps([{
'start': word['start'],
'stop': word['end'],
'text': word['word'].title() if word['always_capitalized'] else word['word']}
return json.dumps([
{'start': word['start'],
'stop': word['end'],
'text': word['word'].title()
if word['always_capitalized'] else word['word']
}
for word in self.converted_words], indent=4 if pretty else None
for word in self.converted_words]
, indent=4 if pretty else None
)
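The viral_overlay change above is only a reflow of the comprehension; the payload it produces is unchanged. A standalone sketch of that mapping, using the first word from the Google test:

import json

# start -> start, end -> stop, word -> text (title-cased only when
# always_capitalized is set).
converted_words = [{'start': 0.4, 'end': 2.1, 'word': 'Okay',
                    'always_capitalized': False}]
overlay = json.dumps([
    {'start': w['start'],
     'stop': w['end'],
     'text': w['word'].title() if w['always_capitalized'] else w['word']}
    for w in converted_words])
print(overlay)  # [{"start": 0.4, "stop": 2.1, "text": "Okay"}]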