Finished refactoring to a single repo and to OOP, so new ASR APIs are straightforward to add. Added a Gentle converter, viral_overlay JSON output, and tests.
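The extension point this refactor is after, as a sketch: each ASR service becomes one subclass of TranscriptConverter that maps the service's JSON onto shared accessors (converters/gentle.py below is the real example added by this commit; the class and field names here are hypothetical, and a per-service convert_words would still handle punctuation quirks).

    from converter import TranscriptConverter

    # Hypothetical new backend illustrating the pattern; only the JSON
    # field names differ from service to service.
    class MyASRConverter(TranscriptConverter):

        def get_word_objects(self, json_data):
            # whatever key the service nests its word list under
            return json_data['words']

        def get_words(self, word_objects):
            return [self.get_word_word(w) for w in word_objects]

        @staticmethod
        def get_word_start(word_object):
            return float(word_object['start'])

        @staticmethod
        def get_word_end(word_object):
            return float(word_object['end'])

        @staticmethod
        def get_word_confidence(word_object):
            return float(word_object['confidence'])

        @staticmethod
        def get_word_word(word_object):
            return word_object['text']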
Pipfile (new file)
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+nltk = "*"
+pytest = "*"
+
+[requires]
+python_version = "3.7"
Pipfile.lock (new file, generated)
@@ -0,0 +1,86 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.7"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "atomicwrites": {
+            "hashes": [
+                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+            ],
+            "version": "==1.3.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
+                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
+            ],
+            "version": "==18.2.0"
+        },
+        "more-itertools": {
+            "hashes": [
+                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
+                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
+                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
+            ],
+            "version": "==5.0.0"
+        },
+        "nltk": {
+            "hashes": [
+                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
+            ],
+            "index": "pypi",
+            "version": "==3.4"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
+            ],
+            "version": "==0.8.1"
+        },
+        "py": {
+            "hashes": [
+                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
+                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
+            ],
+            "version": "==1.7.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
+                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
+            ],
+            "index": "pypi",
+            "version": "==4.2.0"
+        },
+        "singledispatch": {
+            "hashes": [
+                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
+                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
+            ],
+            "version": "==3.4.0.3"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        }
+    },
+    "develop": {}
+}
README.md (new file)
@@ -0,0 +1,6 @@
+# Non-pip Requirement: Stanford NER JAR
+
+- download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
+- put these files in /usr/local/bin/:
+    - stanford-ner.jar
+    - english.all.3class.distsim.crf.ser.gz
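Presumably these paths feed nltk's Stanford NER wrapper for the proper-noun detection in helpers; a minimal sketch of that wiring, assuming the install locations above (the actual helpers module is not shown in this commit):

    from nltk.tag import StanfordNERTagger

    # Paths follow the README's install location; the wrapper shells out
    # to Java, so a JRE must also be on the PATH.
    tagger = StanfordNERTagger(
        '/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
        '/usr/local/bin/stanford-ner.jar')

    print(tagger.tag('Barack Obama visited Paris'.split()))
    # e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'),
    #       ('visited', 'O'), ('Paris', 'LOCATION')]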
(deleted file)
@@ -1 +0,0 @@
-from transcript_processing.converter import TranscriptConverter
converter.py
@@ -4,7 +4,7 @@ from collections import namedtuple
 import os
 
 import helpers
-from transcript_processing.converters import converters
+import converters
 
 
 
@@ -27,8 +27,7 @@ class TranscriptConverter:
         word_objects = self.get_word_objects(data)
         words = self.get_words(word_objects)
 
-        if self.output_target == 'interactive_transcript':
-            tagged_words = helpers.tag_words(words)
+        tagged_words = helpers.tag_words(words)
 
         self.converted_words = self.convert_words(
             word_objects,
@@ -89,10 +88,19 @@
         if index < len(word_objects) - 1:
             return word_objects[index + 1]
 
-    def to_json(self):
+    def interactive_transcript(self):
         return json.dumps(self.converted_words, indent=4)
 
+    def viral_overlay(self):
+        return json.dumps(
+            [{'start': word['start'],
+              'stop': word['end'],
+              'word': word['word']}
+             for word in self.converted_words],
+            indent=4
+        )
+
     def save(self, path):
         with open(path, 'w') as fout:
-            fout.write(self.to_json())
+            fout.write(getattr(self, self.output_target)())
         return path
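save() now dispatches on output_target by method name via getattr, so each output format is just another serializer method on the base class. A minimal usage sketch, matching how the tests drive it (the input path is hypothetical):

    from converters.gentle import GentleConverter

    # 'viral_overlay' names the serializer method save() looks up with
    # getattr; 'interactive_transcript' is the other option in this commit.
    converter = GentleConverter('gentle_alignment.json', 'viral_overlay')
    converter.convert()
    converter.save('overlay.json')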
(deleted file)
@@ -1,24 +0,0 @@
-"""
-
-fields for converted transcript:
-
-start
-end
-word
-confidence
-index
-always_capitalized
-punc_before
-punc_after
-
-"""
-
-from transcript_processing.converters.amazon import amazon_converter
-from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
-
-
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
converters/amazon.py
@@ -1,6 +1,7 @@
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
 
@@ -10,11 +11,11 @@ class AmazonConverter(TranscriptConverter):
         super().__init__(path, output_target)
 
     def get_word_objects(self, json_data):
-        return data['results']['items']
+        return json_data['results']['items']
 
     def get_words(self, word_objects):
         return [self.get_word_word(w)
-                for w in word_objects])
+                for w in word_objects]
 
     @staticmethod
     def get_word_start(word_object):
@@ -30,7 +31,7 @@ class AmazonConverter(TranscriptConverter):
 
     @staticmethod
     def get_word_word(word_object):
-        word_word = w['alternatives'][0]['content']
+        word_word = word_object['alternatives'][0]['content']
         if word_word == 'i':
             # weird Amazon quirk
             word_word = 'I'
@@ -44,11 +45,11 @@ class AmazonConverter(TranscriptConverter):
         num_words = len(words)
         index = 0
 
-        for i, w in enumerate(words):
+        for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
                 continue
             next_word_punc_after = None
-            word_obj = self.get_word_object(w, i, tagged_words, words)
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
 
             if word_obj.next_word:
                 next_word = self.get_word_word(word_obj.next_word)
@@ -60,7 +61,7 @@ class AmazonConverter(TranscriptConverter):
                 next_word_punc_after = None
 
             if word_obj.word.lower() == 'you' and next_word == 'know':
-                prev_word = words[i - 1]
+                prev_word = word_objects[i - 1]
                 if prev_word['type'] != 'punctuation':
                     converted_words[-1]['punc_after'] = ','
                 if next_word_type != 'punctuation':
@@ -83,64 +84,3 @@ class AmazonConverter(TranscriptConverter):
             punc_after = False
 
         return converted_words
-
-
-def amazon_converter(data: dict):
-    data = json.load(data)
-    converted_words = []
-    words = data['results']['items']
-    tagged_words = helpers.tag_words(
-        [w['alternatives'][0]['content'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        if w['type'] == 'punctuation':
-            continue
-        next_word_punc_after = None
-        word_start = float(w['start_time'])
-        word_end = float(w['end_time'])
-        confidence = float(w['alternatives'][0]['confidence'])
-        word = w['alternatives'][0]['content']
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['alternatives'][0]['content']
-            next_word_type = words[i + 1]['type']
-            if next_word == '.':
-                punc_after = '.'
-            elif next_word == ',':
-                punc_after = ','
-            elif next_word_punc_after:
-                punc_after = next_word_punc_after
-                next_word_punc_after = None
-
-        if word == 'i':
-            # weird Amazon quirk
-            word = 'I'
-
-        if word.lower() == 'you' and next_word == 'know':
-            prev_word = words[i - 1]
-            if prev_word['type'] != 'punctuation':
-                converted_words[-1]['punc_after'] = ','
-            if next_word_type != 'punctuation':
-                next_word_punc_after = ','
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
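For reference, the Amazon Transcribe payload this converter reads has roughly this shape; values mirror the test fixture's first word, and times and confidences arrive as strings:

    # Illustrative shape of json_data['results']['items'].
    amazon_data = {
        'jobName': 'Lelandmp3',
        'results': {
            'items': [
                {'type': 'pronunciation',
                 'start_time': '5.49',
                 'end_time': '5.97',
                 'alternatives': [{'confidence': '1.0', 'content': 'So'}]},
                # punctuation items carry no timestamps, which is why the
                # converter skips them when indexing words
                {'type': 'punctuation',
                 'alternatives': [{'confidence': '0.0', 'content': '.'}]},
            ]
        }
    }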
converters/gentle.py (new file)
@@ -0,0 +1,60 @@
+from converter import TranscriptConverter
+
+
+
+class GentleConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return word_object['start']
+
+    @staticmethod
+    def get_word_end(word_object):
+        return word_object['end']
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return 1
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['alignedWord']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
+
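Gentle is a forced aligner rather than a recognizer, which is why get_word_confidence hard-codes 1; its alignment JSON looks roughly like this (the first entry mirrors the test fixture, the 'word' value is hypothetical):

    # Illustrative shape of json_data['words'] from Gentle; a real response
    # also carries the full 'transcript' text and phoneme-level detail.
    gentle_data = {
        'words': [
            {'alignedWord': '[noise]',
             'word': 'um',          # original transcript token, hypothetical
             'start': 0.35,
             'end': 1.58,
             'case': 'success'},
        ]
    }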
converters/speechmatics.py
@@ -1,10 +1,74 @@
 from collections import namedtuple
 import json
 
-from transcript_processing import helpers
+from converter import TranscriptConverter
+import helpers
 
 
-Word = namedtuple('Word', 'start end word')
+class SpeechmaticsConverter(TranscriptConverter):
+
+    def __init__(self, path, output_target):
+        super().__init__(path, output_target)
+
+    def get_word_objects(self, json_data):
+        return json_data['words']
+
+    def get_words(self, word_objects):
+        return [self.get_word_word(w)
+                for w in word_objects]
+
+    @staticmethod
+    def get_word_start(word_object):
+        return float(word_object['time'])
+
+    @staticmethod
+    def get_word_end(word_object):
+        return (SpeechmaticsConverter.get_word_start(word_object)
+                + float(word_object['duration']))
+
+    @staticmethod
+    def get_word_confidence(word_object):
+        return float(word_object['confidence'])
+
+    @staticmethod
+    def get_word_word(word_object):
+        return word_object['name']
+
+    def convert_words(self, word_objects, words, tagged_words=None):
+        converted_words = []
+        punc_before = False
+        punc_after = False
+        num_words = len(words)
+        index = 0
+
+        for i, w in enumerate(word_objects):
+            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
+            if word_obj.word == '.':
+                continue
+
+            if word_obj.next_word:
+                next_word = self.get_word_word(word_obj.next_word)
+                if next_word == '.':
+                    punc_after = '.'
+
+            converted_words.append({
+                'start': word_obj.start,
+                'end': word_obj.end,
+                'confidence': word_obj.confidence,
+                'word': word_obj.word,
+                'always_capitalized': (
+                    word_obj.is_proper_noun
+                    or word_obj.word == 'I'),
+                'index': index,
+                'punc_after': punc_after,
+                'punc_before': punc_before,
+            })
+
+            index += 1
+            punc_after = False
+
+        return converted_words
+
+
 def speechmatics_converter(data):
@@ -55,6 +119,8 @@ def speechmatics_aligned_text_converter(data):
     class Exhausted(Exception):
         pass
 
+    Word = namedtuple('Word', 'start end word')
+
     def get_time(transcript, index):
         time_index = transcript.find('time=', index)
         if time_index == -1:
@@ -108,6 +174,3 @@ def speechmatics_aligned_text_converter(data):
         })
 
     return converted_words
-
-
-def gentle_converter
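The Speechmatics word list the new class consumes derives start from 'time' and end from 'time' + 'duration'; roughly (values mirror the test fixture's first word, 5.98 + 0.13 = 6.11):

    # Illustrative shape of json_data['words'] from Speechmatics; the
    # numeric fields arrive as strings, hence the float() calls above.
    speechmatics_data = {
        'words': [
            {'name': 'For', 'time': '5.98',
             'duration': '0.13', 'confidence': '0.67'},
            {'name': '.', 'time': '6.11',
             'duration': '0.0', 'confidence': '1.0'},
        ]
    }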
tests/test_conversion.py (new file)
@@ -0,0 +1,70 @@
+import json
+import os
+
+import pytest
+
+from converters.amazon import AmazonConverter
+from converters.speechmatics import SpeechmaticsConverter
+from converters.gentle import GentleConverter
+
+
+@pytest.fixture
+def json_transcript():
+    with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')) as fin:
+        transcript = json.load(fin)
+    yield transcript
+
+
+def test_json_transcript(json_transcript):
+    assert json_transcript["jobName"] == "Lelandmp3"
+
+
+def test_amazon():
+    a = AmazonConverter(
+        os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'),
+        'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+        'start': 5.49,
+        'end': 5.97,
+        'confidence': 1.0,
+        'word': 'So',
+        'always_capitalized': False,
+        'index': 0,
+        'punc_after': False,
+        'punc_before': False
+    }
+
+
+def test_speechmatics():
+    a = SpeechmaticsConverter(
+        os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'),
+        'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+        'start': 5.98,
+        'end': 6.11,
+        'confidence': 0.67,
+        'word': 'For',
+        'always_capitalized': False,
+        'index': 0,
+        'punc_after': False,
+        'punc_before': False,
+    }
+
+
+def test_gentle():
+    a = GentleConverter(
+        os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
+        'interactive_transcript')
+    a.convert()
+    assert a.converted_words[0] == {
+        'start': 0.35,
+        'end': 1.58,
+        'confidence': 1,
+        'word': '[noise]',
+        'always_capitalized': False,
+        'index': 0,
+        'punc_after': False,
+        'punc_before': False
+    }
tests/test_convert_viraloverlay.py (new file)
@@ -0,0 +1,23 @@
+
+
+import json
+import os
+
+import pytest
+
+from converters.amazon import AmazonConverter
+from converters.speechmatics import SpeechmaticsConverter
+from converters.gentle import GentleConverter
+
+
+
+def test_gentle():
+    a = GentleConverter(
+        os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
+        'viral_overlay')
+    a.convert()
+    assert json.loads(a.viral_overlay())[0] == {
+        'start': 0.35,
+        'stop': 1.58,
+        'word': '[noise]',
+    }
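Both test modules resolve their fixture transcripts from environment variables rather than checked-in paths; a minimal sketch of a local run, with hypothetical fixture locations (the sample transcripts themselves are not part of this commit):

    import os
    import pytest

    # Point each converter's test at a sample output file from that service.
    os.environ['AMAZON_TRANSCRIPT_TEST_FILE'] = 'tests/fixtures/amazon.json'
    os.environ['SPEECHMATICS_TRANSCRIPT_TEST_FILE'] = 'tests/fixtures/speechmatics.json'
    os.environ['GENTLE_TRANSCRIPT_TEST_FILE'] = 'tests/fixtures/gentle.json'

    pytest.main(['tests'])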