Updated tests to the current converter format; updated the static/non-static decorators on the base class to match its children; added GoogleConverter; and moved one or two methods to the base class because they work the same way in every converter.

2019-03-07 02:30:48 -05:00
parent 6333f55c87
commit d30ecad583
16 changed files with 346 additions and 46 deletions
--- a/Pipfile
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = "*"
+
+[packages]
+tpro = {editable = true,path = "."}
+
+[requires]
+python_version = "3.7"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -0,0 +1,104 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "49961036ff9465d1da8edf8b981512812678348e4baaa0c51841df64e80533ad"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "click": {
+            "hashes": [
+                "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+                "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+            ],
+            "version": "==7.0"
+        },
+        "nltk": {
+            "hashes": [
+                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
+            ],
+            "version": "==3.4"
+        },
+        "singledispatch": {
+            "hashes": [
+                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
+                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
+            ],
+            "version": "==3.4.0.3"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "tpro": {
+            "editable": true,
+            "path": "."
+        }
+    },
+    "develop": {
+        "atomicwrites": {
+            "hashes": [
+                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+            ],
+            "version": "==1.3.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+                "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+            ],
+            "version": "==19.1.0"
+        },
+        "more-itertools": {
+            "hashes": [
+                "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
+                "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
+            ],
+            "markers": "python_version > '2.7'",
+            "version": "==6.0.0"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:19ecf9ce9db2fce065a7a0586e07cfb4ac8614fe96edf628a264b1c70116cf8f",
+                "sha256:84d306a647cc805219916e62aab89caa97a33a1dd8c342e87a37f91073cd4746"
+            ],
+            "version": "==0.9.0"
+        },
+        "py": {
+            "hashes": [
+                "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+                "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+            ],
+            "version": "==1.8.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c",
+                "sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4"
+            ],
+            "index": "pypi",
+            "version": "==4.3.0"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        }
+    }
+}
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:

 setup(
    name="tpro",
-    version="0.08",
+    version="0.09",
    url='https://github.com/zevaverbach/tpro',
    install_requires=[
        'Click',
@@ -21,6 +21,6 @@ setup(
    long_description=long_description,
    entry_points='''
        [console_scripts]
-        tpro=tpro.tpro:cli
+        tpro=transcript_processing.tpro:cli
    ''',
        )
--- a/tests/test_conversion.py
+++ b/tests/test_conversion.py
@@ -3,9 +3,10 @@ import os

 import pytest

-from converters.amazon import AmazonConverter
-from converters.speechmatics import SpeechmaticsConverter
-from converters.gentle import GentleConverter
+from transcript_processing.converters.amazon import AmazonConverter
+from transcript_processing.converters.speechmatics import SpeechmaticsConverter
+from transcript_processing.converters.gentle import GentleConverter
+from transcript_processing.converters.google import GoogleConverter


@pytest.fixture
@@ -20,9 +21,10 @@ def test_json_transcript(json_transcript):


 def test_amazon():
-    a = AmazonConverter(
-            os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'),
-            'interactive_transcript')
+    with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), 'r') as fin:
+        json_data = json.load(fin)
+
+    a = AmazonConverter(json_data)
    a.convert()
    assert a.converted_words[0] == {
            'start': 5.49, 
@@ -30,16 +32,17 @@ def test_amazon():
            'confidence': 1.0,
            'word': 'So',
            'always_capitalized': False,
-            'index': 0,
            'punc_after': False,
            'punc_before': False
            }


 def test_speechmatics():
-    a = SpeechmaticsConverter(
-            os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'),
-            'interactive_transcript')
+    with open(os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), 'r') as fin:
+        json_data = json.load(fin)
+
+    a = SpeechmaticsConverter(json_data)
+            
    a.convert()
    assert a.converted_words[0] == {
            'start': 5.98,
@@ -47,16 +50,16 @@ def test_speechmatics():
            'confidence': 0.67,
            'word': 'For',
            'always_capitalized': False,
-            'index': 0,
            'punc_after': False,
            'punc_before': False,
            }


 def test_gentle():
-    a = GentleConverter(
-            os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
-            'interactive_transcript')
+    with open(os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), 'r') as fin:
+        json_data = json.load(fin)
+
+    a = GentleConverter(json_data)
    a.convert()
    assert a.converted_words[0] == {
            'start': 0.35,
@@ -64,7 +67,25 @@ def test_gentle():
            'confidence': 1, 
            'word': '[noise]', 
            'always_capitalized': False, 
-            'index': 0, 
            'punc_after': False,
            'punc_before': False
            }
+
+
+def test_google():
+    with open(os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE'), 'r') as fin:
+        transcript_data = fin.read()
+
+    g = GoogleConverter(transcript_data)
+            
+    g.convert()
+    assert g.converted_words[0] == {
+            'start': 4,
+            'end': 5.5,
+            'confidence': 0.88,
+            'word': 'Testing',
+            'always_capitalized': False,
+            'punc_after': [','],
+            'punc_before': False,
+            }
+
--- a/tests/test_convert_google.py
+++ b/tests/test_convert_google.py
@@ -0,0 +1,32 @@
+import json
+import os
+
+import pytest
+
+from transcript_processing.converters.google import (
+    make_json_friendly,
+    GoogleConverter,
+        )
+from transcript_processing.config import GOOGLE_TRANSCRIPT_TEST_FILE
+
+
+@pytest.fixture
+def transcript():
+    with open(GOOGLE_TRANSCRIPT_TEST_FILE, 'r') as fin:
+        return fin.read()
+
+
+def test_make_json_friendly(transcript):
+    friendly = make_json_friendly(transcript)
+    assert json.loads(friendly)
+
+
+def test_pre_process(transcript):
+    with open(os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE'), 'r') as fin:
+        transcript_data = fin.read()
+
+    g = GoogleConverter(transcript_data)
+    assert g.json_data
+            
+
+
--- a/transcript_processing/__init__.py
+++ b/transcript_processing/__init__.py
--- a/transcript_processing/config.py
+++ b/transcript_processing/config.py
@@ -2,3 +2,4 @@ import os


 AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')
+GOOGLE_TRANSCRIPT_TEST_FILE = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -14,7 +14,7 @@ class TranscriptConverter:

    __metaclass__ = abc.ABCMeta

-    def __init__(self, json_data):
+    def __init__(self, json_data: dict):
        self.json_data = json_data

    def convert(self):
@@ -31,19 +31,16 @@ class TranscriptConverter:
                tagged_words
                )

-    @staticmethod
    @abc.abstractmethod
-    def get_word_objects(json_data):
+    def get_word_objects(self, json_data):
        pass

-    @staticmethod
-    @abc.abstractmethod
-    def get_words(word_objects):
-        pass
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]

-    @staticmethod
    @abc.abstractmethod
-    def convert_words(word_objects, words, tagged_words=None):
+    def convert_words(self, word_objects, words, tagged_words=None):
        pass

    @staticmethod
--- a/transcript_processing/converters/__init__.py
+++ b/transcript_processing/converters/__init__.py
@@ -1,9 +1,11 @@
 from .amazon import AmazonConverter
 from .speechmatics import SpeechmaticsConverter
 from .gentle import GentleConverter
+from .google import GoogleConverter

 services = {
        'amazon': AmazonConverter,
        'gentle': GentleConverter,
        'speechmatics': SpeechmaticsConverter,
+        'google': GoogleConverter,
        }
--- a/transcript_processing/converters/amazon.py
+++ b/transcript_processing/converters/amazon.py
@@ -15,10 +15,6 @@ class AmazonConverter(TranscriptConverter):
    def get_word_objects(self, json_data):
        return json_data['results']['items']

-    def get_words(self, word_objects):
-        return [self.get_word_word(w)
-                for w in word_objects]
-
    @staticmethod
    def get_word_start(word_object):
        return float(word_object['start_time'])
@@ -32,7 +28,7 @@ class AmazonConverter(TranscriptConverter):
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
-    def get_word_word(word_object):
+    def get_word_word(word_object) -> str:
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
@@ -44,7 +40,6 @@ class AmazonConverter(TranscriptConverter):

        punc_before = False
        punc_after = False
-        num_words = len(words)

        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
--- a/transcript_processing/converters/gentle.py
+++ b/transcript_processing/converters/gentle.py
@@ -7,16 +7,12 @@ class GentleConverter(TranscriptConverter):

    name = 'gentle'

-    def __init__(self, path):
-        super().__init__(path)
+    def __init__(self, json_data):
+        super().__init__(json_data)

    def get_word_objects(self, json_data):
        return json_data['words']

-    def get_words(self, word_objects):
-        return [self.get_word_word(w)
-                for w in word_objects]
-
    @staticmethod
    def get_word_start(word_object):
        return word_object['start']
@@ -35,8 +31,6 @@ class GentleConverter(TranscriptConverter):

    def convert_words(self, word_objects, words, tagged_words=None):
        converted_words = []
-        punc_before = False
-        punc_after = False
        num_words = len(words)

        for i, w in enumerate(word_objects):
@@ -51,8 +45,8 @@ class GentleConverter(TranscriptConverter):
                    word_obj.word, 
                    i,
                    tagged_words),
-                'punc_after': punc_after,
-                'punc_before': punc_before,
+                'punc_after': False,
+                'punc_before': False,
            })

            punc_after = False
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -0,0 +1,145 @@
+import json
+import re
+
+from ..converter import TranscriptConverter
+from .. import helpers
+
+
+
+class GoogleConverter(TranscriptConverter):
+
+    def __init__(self, transcript_data: str):
+        super().__init__(transcript_data)
+        self.json_data = self.pre_process(transcript_data)
+
+    def pre_process(self, transcript_data):
+        friendly = make_json_friendly(transcript_data)
+        return json.loads(friendly)
+
+    def get_word_objects(self, json_data):
+        return json_data
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+
+        punc_before = False
+        punc_after = False
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            punc_before = helpers.get_punc_before(word_obj.word) or False
+            punc_after = helpers.get_punc_after(word_obj.word) or False
+
+            the_word = word_obj.word
+            if punc_before:
+                the_word = the_word[len(punc_before):]
+            if punc_after:
+                the_word = the_word[:-len(punc_after)]
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': the_word,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word, 
+                    i,
+                    tagged_words),
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+        return converted_words
+
+    @classmethod
+    def get_word_start(cls, word_object):
+        return cls.get_seconds(word_object['start_time'])
+
+    @classmethod
+    def get_word_end(cls, word_object):
+        return cls.get_seconds(word_object['end_time'])
+
+    @staticmethod
+    def get_seconds(time: dict) -> float:
+        seconds = 0
+        if 'seconds' in time:
+            seconds = time['seconds']
+        if 'nanos' in time:
+            seconds += time['nanos'] / 1_000_000_000
+        return seconds
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return word_object['confidence']
+
+    @staticmethod
+    def get_word_word(word_object):
+        print(word_object)
+        return word_object['word']
+
+
+
+def make_json_friendly(json_string):
+    lines = [line.strip() for line in json_string.split('\\n')]
+
+    fields = [
+        'words {', 
+        'start_time {', 
+        '}',
+        'end_time {', 
+        '}',
+        'word: ', 
+        'confidence: '
+        ]
+
+    current_field_index = 0
+    new_string = ''
+
+    for line in lines:
+
+        current_field = fields[current_field_index]
+
+        if current_field in line:
+            if current_field_index == len(fields) - 1:
+               current_field_index = 0
+            else:
+                current_field_index += 1
+                if current_field_index == 1:
+                    new_string += '}, {'
+                    # "words" was found, don't want to append that
+                    continue
+
+        else:
+            if current_field_index == 0:
+                # haven't found the beginning of the next word object
+                continue
+
+        # add quotes around keys
+        line = re.sub('^(?!")([0-9a-zA-Z_]+)', 
+                        '"\\1"', 
+                        line)
+
+        # add colons after keys
+        if line.endswith('{'):
+            line = line.replace('" ', '": ')
+
+        # use first two decimals of confidence
+        if 'confidence' in current_field:
+            line = ', ' + line
+            line = line[:20]
+
+        if current_field == '}':
+            line = line + ', '
+
+        new_string += line
+
+    # cleanup
+    if new_string.startswith('}, '):
+        new_string = new_string[3:]
+    if not new_string.startswith('['):
+        new_string = '[' + new_string
+    if not new_string.endswith('}]'):
+        new_string = new_string + '}]'
+    new_string = new_string.replace(', }', '}').replace('\\', '')
+
+    return new_string
--- a/transcript_processing/converters/speechmatics.py
+++ b/transcript_processing/converters/speechmatics.py
@@ -16,10 +16,6 @@ class SpeechmaticsConverter(TranscriptConverter):
    def get_word_objects(self, json_data):
        return json_data['words']

-    def get_words(self, word_objects):
-        return [self.get_word_word(w)
-                for w in word_objects]
-
    @staticmethod
    def get_word_start(word_object):
        return float(word_object['time'])
--- a/transcript_processing/helpers.py
+++ b/transcript_processing/helpers.py
--- a/transcript_processing/outputs.py
+++ b/transcript_processing/outputs.py
--- a/transcript_processing/tpro.py
+++ b/transcript_processing/tpro.py