From e36c8ba30e86fdae878b810b0345e68c14b5b674 Mon Sep 17 00:00:00 2001
From: zevav <zev@averba.ch>
Date: Wed, 6 Feb 2019 22:28:08 -0500
Subject: [PATCH] Finish refactoring into a single repo and to an OOP design
 so that new ASR APIs are straightforward to add. Add Gentle converter and
 viral_overlay JSON output. Add tests

---
 Pipfile                            | 13 +++++
 Pipfile.lock                       | 86 ++++++++++++++++++++++++++++++
 README.md                          |  6 +++
 __init__.py                        |  1 -
 converter.py                       | 18 +++++--
 converters/__init__.py             | 24 ---------
 converters/amazon.py               | 76 +++-----------------------
 converters/gentle.py               | 60 +++++++++++++++++++++
 converters/speechmatics.py         | 73 +++++++++++++++++++++++--
 tests/test_conversion.py           | 70 ++++++++++++++++++++++++
 tests/test_convert_viraloverlay.py | 23 ++++++++
 11 files changed, 347 insertions(+), 103 deletions(-)
 create mode 100644 Pipfile
 create mode 100644 Pipfile.lock
 create mode 100644 README.md
 delete mode 100644 __init__.py
 create mode 100644 converters/gentle.py
 create mode 100644 tests/test_conversion.py
 create mode 100644 tests/test_convert_viraloverlay.py

diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..9830ce9
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+nltk = "*"
+pytest = "*"
+
+[requires]
+python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..523f14c
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,86 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "atomicwrites": {
+            "hashes": [
+                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+            ],
+            "version": "==1.3.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
+                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
+            ],
+            "version": "==18.2.0"
+        },
+        "more-itertools": {
+            "hashes": [
+                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
+                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
+                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
+            ],
+            "version": "==5.0.0"
+        },
+        "nltk": {
+            "hashes": [
+                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
+            ],
+            "index": "pypi",
+            "version": "==3.4"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
+            ],
+            "version": "==0.8.1"
+        },
+        "py": {
+            "hashes": [
+                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
+                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
+            ],
+            "version": "==1.7.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
+                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
+            ],
+            "index": "pypi",
+            "version": "==4.2.0"
+        },
+        "singledispatch": {
+            "hashes": [
+                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
+                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
+            ],
+            "version": "==3.4.0.3"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        }
+    },
+    "develop": {}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6beb6ca
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# Non-pip Requirement:  Stanford NER JAR
+
+  - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
+  - put these files in /usr/local/bin/:
+    - stanford-ner.jar
+    - english.all.3class.distsim.crf.ser.gz
diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index ac7024f..0000000
--- a/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from transcript_processing.converter import TranscriptConverter
\ No newline at end of file
diff --git a/converter.py b/converter.py
index d9a8184..9e0ea41 100644
--- a/converter.py
+++ b/converter.py
@@ -4,7 +4,7 @@ from collections import namedtuple
 import os
 
 import helpers
-from transcript_processing.converters import converters
+import converters
 
 
 
@@ -27,8 +27,7 @@ class TranscriptConverter:
             word_objects = self.get_word_objects(data)
             words = self.get_words(word_objects)
 
-            if self.output_target == 'interactive_transcript':
-                tagged_words = helpers.tag_words(words)
+            tagged_words = helpers.tag_words(words)
 
             self.converted_words = self.convert_words(
                     word_objects,
@@ -89,10 +88,19 @@ class TranscriptConverter:
         if index < len(word_objects) - 1:
             return word_objects[index + 1]
 
-    def to_json(self):
+    def interactive_transcript(self):
         return json.dumps(self.converted_words, indent=4)
 
+    def viral_overlay(self):
+        return json.dumps(
+                [{'start': word['start'],
+                  'stop': word['end'],
+                  'word': word['word']}
+                  for word in self.converted_words],
+                indent=4
+                )
+
     def save(self, path):
         with open(path, 'w') as fout:
-            fout.write(self.to_json())
+            fout.write(getattr(self, self.output_target)())
         return path
diff --git a/converters/__init__.py b/converters/__init__.py
index 1b067ef..e69de29 100644
--- a/converters/__init__.py
+++ b/converters/__init__.py
@@ -1,24 +0,0 @@
-"""
-
-fields for converted transcript:
-
-    start
-    end
-    word
-    confidence
-    index
-    always_capitalized
-    punc_before
-    punc_after
-
-"""
-
-from transcript_processing.converters.amazon import amazon_converter
-from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
-
-
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
diff --git a/converters/amazon.py b/converters/amazon.py
index 7c5dd09..d5fc549 100644
--- a/converters/amazon.py
+++ b/converters/amazon.py
@@ -1,6 +1,7 @@
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
 
@@ -10,11 +11,11 @@ class AmazonConverter(TranscriptConverter):
         super().__init__(path, output_target)
 
     def get_word_objects(self, json_data):
-        return data['results']['items']
+        return json_data['results']['items']
 
     def get_words(self, word_objects):
         return [self.get_word_word(w)
-                for w in word_objects])
+                for w in word_objects]
 
     @staticmethod
     def get_word_start(word_object):
@@ -30,7 +31,7 @@ class AmazonConverter(TranscriptConverter):
 
     @staticmethod
     def get_word_word(word_object):
-        word_word = w['alternatives'][0]['content']
+        word_word = word_object['alternatives'][0]['content']
         if word_word == 'i':
             # weird Amazon quirk
             word_word = 'I'
@@ -44,11 +45,11 @@ class AmazonConverter(TranscriptConverter):
         num_words = len(words)
         index = 0
 
-        for i, w in enumerate(words):
+        for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
                 continue
             next_word_punc_after = None
-            word_obj = self.get_word_object(w, i, tagged_words, words)
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
 
             if word_obj.next_word:
                 next_word = self.get_word_word(word_obj.next_word)
@@ -60,7 +61,7 @@ class AmazonConverter(TranscriptConverter):
                     next_word_punc_after = None
 
             if word_obj.word.lower() == 'you' and next_word == 'know':
-                prev_word = words[i - 1]
+                prev_word = word_objects[i - 1]
                 if prev_word['type'] != 'punctuation':
                     converted_words[-1]['punc_after'] = ','
                 if next_word_type != 'punctuation':
@@ -83,64 +84,3 @@ class AmazonConverter(TranscriptConverter):
             punc_after = False
 
         return converted_words
-
-
-def amazon_converter(data: dict):
-    data = json.load(data)
-    converted_words = []
-    words = data['results']['items']
-    tagged_words = helpers.tag_words(
-        [w['alternatives'][0]['content'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        if w['type'] == 'punctuation':
-            continue
-        next_word_punc_after = None
-        word_start = float(w['start_time'])
-        word_end = float(w['end_time'])
-        confidence = float(w['alternatives'][0]['confidence'])
-        word = w['alternatives'][0]['content']
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['alternatives'][0]['content']
-            next_word_type = words[i + 1]['type']
-        if next_word == '.':
-            punc_after = '.'
-        elif next_word == ',':
-            punc_after = ','
-        elif next_word_punc_after:
-            punc_after = next_word_punc_after
-            next_word_punc_after = None
-
-        if word == 'i':
-            # weird Amazon quirk
-            word = 'I'
-
-        if word.lower() == 'you' and next_word == 'know':
-            prev_word = words[i - 1]
-            if prev_word['type'] != 'punctuation':
-                converted_words[-1]['punc_after'] = ','
-            if next_word_type != 'punctuation':
-                next_word_punc_after = ','
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
diff --git a/converters/gentle.py b/converters/gentle.py
new file mode 100644
index 0000000..e76ca05
--- /dev/null
+++ b/converters/gentle.py
@@ -0,0 +1,60 @@
+from converter import TranscriptConverter
+
+
+
+class GentleConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return word_object['start']
+
+    @staticmethod
+    def get_word_end(word_object):
+        return word_object['end']
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return 1
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['alignedWord']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun 
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
+
diff --git a/converters/speechmatics.py b/converters/speechmatics.py
index 5a2b7bf..71f7d0d 100644
--- a/converters/speechmatics.py
+++ b/converters/speechmatics.py
@@ -1,10 +1,74 @@
 from collections import namedtuple
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
-Word = namedtuple('Word', 'start end word')
+
+class SpeechmaticsConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return float(word_object['time'])
+
+    @staticmethod
+    def get_word_end(word_object):
+        return (SpeechmaticsConverter.get_word_start(word_object) 
+                + float(word_object['duration']))
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return float(word_object['confidence'])
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['name']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            if word_obj.word == '.':
+                continue
+
+            if word_obj.next_word:
+                next_word = self.get_word_word(word_obj.next_word)
+                if next_word == '.':
+                    punc_after = '.'
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun 
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
 
 
 def speechmatics_converter(data):
@@ -55,6 +119,8 @@ def speechmatics_aligned_text_converter(data):
     class Exhausted(Exception):
         pass
 
+    Word = namedtuple('Word', 'start end word')
+
     def get_time(transcript, index):
         time_index = transcript.find('time=', index)
         if time_index == -1:
@@ -108,6 +174,3 @@ def speechmatics_aligned_text_converter(data):
         })
 
     return converted_words
-
-
-def gentle_converter
diff --git a/tests/test_conversion.py b/tests/test_conversion.py
new file mode 100644
index 0000000..a63f8bc
--- /dev/null
+++ b/tests/test_conversion.py
@@ -0,0 +1,70 @@
+import json
+import os
+
+import pytest
+
+from converters.amazon import AmazonConverter
+from converters.speechmatics import SpeechmaticsConverter
+from converters.gentle import GentleConverter
+
+
+@pytest.fixture
+def json_transcript():
+    with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')) as fin:
+        transcript = json.load(fin)
+        yield transcript
+
+
+def test_json_transcript(json_transcript):
+    assert json_transcript["jobName"] == "Lelandmp3"
+
+
+def test_amazon():
+    a = AmazonConverter(
+            os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'),
+            'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+            'start': 5.49, 
+            'end': 5.97, 
+            'confidence': 1.0,
+            'word': 'So',
+            'always_capitalized': False,
+            'index': 0,
+            'punc_after': False,
+            'punc_before': False
+            }
+
+
+def test_speechmatics():
+    a = SpeechmaticsConverter(
+            os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'),
+            'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+            'start': 5.98,
+            'end': 6.11,
+            'confidence': 0.67,
+            'word': 'For',
+            'always_capitalized': False,
+            'index': 0,
+            'punc_after': False,
+            'punc_before': False,
+            }
+
+
+def test_gentle():
+    a = GentleConverter(
+            os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
+            'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+            'start': 0.35,
+            'end': 1.58, 
+            'confidence': 1, 
+            'word': '[noise]', 
+            'always_capitalized': False, 
+            'index': 0, 
+            'punc_after': False,
+            'punc_before': False
+            }
diff --git a/tests/test_convert_viraloverlay.py b/tests/test_convert_viraloverlay.py
new file mode 100644
index 0000000..df5e686
--- /dev/null
+++ b/tests/test_convert_viraloverlay.py
@@ -0,0 +1,23 @@
+
+
+import json
+import os
+
+import pytest
+
+from converters.amazon import AmazonConverter
+from converters.speechmatics import SpeechmaticsConverter
+from converters.gentle import GentleConverter
+
+
+
+def test_gentle():
+    a = GentleConverter(
+            os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
+            'viral_overlay')
+    a.convert()
+    assert json.loads(a.viral_overlay())[0] == {
+            'start': 0.35,
+            'stop': 1.58, 
+            'word': '[noise]', 
+            }