From e36c8ba30e86fdae878b810b0345e68c14b5b674 Mon Sep 17 00:00:00 2001 From: zevav Date: Wed, 6 Feb 2019 22:28:08 -0500 Subject: [PATCH] finished refactoring to a single repo, and to OOP for straight-forward adding of new ASR APIs. added Gentle, and added viral_overlay JSON output. added tests --- Pipfile | 13 +++++ Pipfile.lock | 86 ++++++++++++++++++++++++++++++ README.md | 6 +++ __init__.py | 1 - converter.py | 18 +++++-- converters/__init__.py | 24 --------- converters/amazon.py | 76 +++----------------------- converters/gentle.py | 60 +++++++++++++++++++++ converters/speechmatics.py | 73 +++++++++++++++++++++++-- tests/test_conversion.py | 70 ++++++++++++++++++++++++ tests/test_convert_viraloverlay.py | 23 ++++++++ 11 files changed, 347 insertions(+), 103 deletions(-) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md delete mode 100644 __init__.py create mode 100644 converters/gentle.py create mode 100644 tests/test_conversion.py create mode 100644 tests/test_convert_viraloverlay.py diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..9830ce9 --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +nltk = "*" +pytest = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..523f14c --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,86 @@ +{ + "_meta": { + "hash": { + "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + 
"attrs": { + "hashes": [ + "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", + "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + ], + "version": "==18.2.0" + }, + "more-itertools": { + "hashes": [ + "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4", + "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc", + "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9" + ], + "version": "==5.0.0" + }, + "nltk": { + "hashes": [ + "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d" + ], + "index": "pypi", + "version": "==3.4" + }, + "pluggy": { + "hashes": [ + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" + ], + "version": "==0.8.1" + }, + "py": { + "hashes": [ + "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694", + "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6" + ], + "version": "==1.7.0" + }, + "pytest": { + "hashes": [ + "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07", + "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d" + ], + "index": "pypi", + "version": "==4.2.0" + }, + "singledispatch": { + "hashes": [ + "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c", + "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8" + ], + "version": "==3.4.0.3" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..6beb6ca --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +# Non-pip Requirement: Stanford NER JAR + + - download the .jar 
[here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download) + - put these files in /usr/local/bin/: + - stanford-ner.jar + - english.all.3class.distsim.crf.ser.gz diff --git a/__init__.py b/__init__.py deleted file mode 100644 index ac7024f..0000000 --- a/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from transcript_processing.converter import TranscriptConverter \ No newline at end of file diff --git a/converter.py b/converter.py index d9a8184..9e0ea41 100644 --- a/converter.py +++ b/converter.py @@ -4,7 +4,7 @@ from collections import namedtuple import os import helpers -from transcript_processing.converters import converters +import converters @@ -27,8 +27,7 @@ class TranscriptConverter: word_objects = self.get_word_objects(data) words = self.get_words(word_objects) - if self.output_target == 'interactive_transcript': - tagged_words = helpers.tag_words(words) + tagged_words = helpers.tag_words(words) self.converted_words = self.convert_words( word_objects, @@ -89,10 +88,19 @@ class TranscriptConverter: if index < len(word_objects) - 1: return word_objects[index + 1] - def to_json(self): + def interactive_transcript(self): return json.dumps(self.converted_words, indent=4) + def viral_overlay(self): + return json.dumps( + [{'start': word['start'], + 'stop': word['end'], + 'word': word['word']} + for word in self.converted_words], + indent=4 + ) + def save(self, path): with open(path, 'w') as fout: - fout.write(self.to_json()) + fout.write(getattr(self, self.output_target)()) return path diff --git a/converters/__init__.py b/converters/__init__.py index 1b067ef..e69de29 100644 --- a/converters/__init__.py +++ b/converters/__init__.py @@ -1,24 +0,0 @@ -""" - -fields for converted transcript: - - start - end - word - confidence - index - always_capitalized - punc_before - punc_after - -""" - -from transcript_processing.converters.amazon import amazon_converter -from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, 
speechmatics_converter - - -converters = { - 'speechmatics': speechmatics_converter, - 'speechmatics_align': speechmatics_aligned_text_converter, - 'amazon': amazon_converter, -} diff --git a/converters/amazon.py b/converters/amazon.py index 7c5dd09..d5fc549 100644 --- a/converters/amazon.py +++ b/converters/amazon.py @@ -1,6 +1,7 @@ import json -from transcript_processing import helpers +from converter import TranscriptConverter +import helpers @@ -10,11 +11,11 @@ class AmazonConverter(TranscriptConverter): super().__init__(path, output_target) def get_word_objects(self, json_data): - return data['results']['items'] + return json_data['results']['items'] def get_words(self, word_objects): return [self.get_word_word(w) - for w in word_objects]) + for w in word_objects] @staticmethod def get_word_start(word_object): @@ -30,7 +31,7 @@ class AmazonConverter(TranscriptConverter): @staticmethod def get_word_word(word_object): - word_word = w['alternatives'][0]['content'] + word_word = word_object['alternatives'][0]['content'] if word_word == 'i': # weird Amazon quirk word_word = 'I' @@ -44,11 +45,11 @@ class AmazonConverter(TranscriptConverter): num_words = len(words) index = 0 - for i, w in enumerate(words): + for i, w in enumerate(word_objects): if w['type'] == 'punctuation': continue next_word_punc_after = None - word_obj = self.get_word_object(w, i, tagged_words, words) + word_obj = self.get_word_object(w, i, tagged_words, word_objects) if word_obj.next_word: next_word = self.get_word_word(word_obj.next_word) @@ -60,7 +61,7 @@ class AmazonConverter(TranscriptConverter): next_word_punc_after = None if word_obj.word.lower() == 'you' and next_word == 'know': - prev_word = words[i - 1] + prev_word = word_objects[i - 1] if prev_word['type'] != 'punctuation': converted_words[-1]['punc_after'] = ',' if next_word_type != 'punctuation': @@ -83,64 +84,3 @@ class AmazonConverter(TranscriptConverter): punc_after = False return converted_words - - -def amazon_converter(data: 
dict): - data = json.load(data) - converted_words = [] - words = data['results']['items'] - tagged_words = helpers.tag_words( - [w['alternatives'][0]['content'] for w in words]) - punc_before = False - punc_after = False - num_words = len(words) - index = 0 - - for i, w in enumerate(words): - if w['type'] == 'punctuation': - continue - next_word_punc_after = None - word_start = float(w['start_time']) - word_end = float(w['end_time']) - confidence = float(w['alternatives'][0]['confidence']) - word = w['alternatives'][0]['content'] - is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS - - next_word = None - if i < num_words - 1: - next_word = words[i + 1]['alternatives'][0]['content'] - next_word_type = words[i + 1]['type'] - if next_word == '.': - punc_after = '.' - elif next_word == ',': - punc_after = ',' - elif next_word_punc_after: - punc_after = next_word_punc_after - next_word_punc_after = None - - if word == 'i': - # weird Amazon quirk - word = 'I' - - if word.lower() == 'you' and next_word == 'know': - prev_word = words[i - 1] - if prev_word['type'] != 'punctuation': - converted_words[-1]['punc_after'] = ',' - if next_word_type != 'punctuation': - next_word_punc_after = ',' - - converted_words.append({ - 'start': word_start, - 'end': word_end, - 'confidence': confidence, - 'word': word, - 'always_capitalized': is_proper_noun or word == 'I', - 'index': index, - 'punc_after': punc_after, - 'punc_before': punc_before, - }) - - index += 1 - punc_after = False - - return converted_words diff --git a/converters/gentle.py b/converters/gentle.py new file mode 100644 index 0000000..e76ca05 --- /dev/null +++ b/converters/gentle.py @@ -0,0 +1,60 @@ +from converter import TranscriptConverter + + + +class GentleConverter(TranscriptConverter): + + def __init__(self, path, output_target): + super().__init__(path, output_target) + + def get_word_objects(self, json_data): + return json_data['words'] + + def get_words(self, word_objects): + return 
[self.get_word_word(w) + for w in word_objects] + + @staticmethod + def get_word_start(word_object): + return word_object['start'] + + @staticmethod + def get_word_end(word_object): + return word_object['end'] + + @staticmethod + def get_word_confidence(word_object): + return 1 + + @staticmethod + def get_word_word(word_object): + return word_object['alignedWord'] + + def convert_words(self, word_objects, words, tagged_words=None): + converted_words = [] + punc_before = False + punc_after = False + num_words = len(words) + index = 0 + + for i, w in enumerate(word_objects): + word_obj = self.get_word_object(w, i, tagged_words, word_objects) + + converted_words.append({ + 'start': word_obj.start, + 'end': word_obj.end, + 'confidence': word_obj.confidence, + 'word': word_obj.word, + 'always_capitalized': ( + word_obj.is_proper_noun + or word_obj.word == 'I'), + 'index': index, + 'punc_after': punc_after, + 'punc_before': punc_before, + }) + + index += 1 + punc_after = False + + return converted_words + diff --git a/converters/speechmatics.py b/converters/speechmatics.py index 5a2b7bf..71f7d0d 100644 --- a/converters/speechmatics.py +++ b/converters/speechmatics.py @@ -1,10 +1,74 @@ from collections import namedtuple import json -from transcript_processing import helpers +from converter import TranscriptConverter +import helpers -Word = namedtuple('Word', 'start end word') + +class SpeechmaticsConverter(TranscriptConverter): + + def __init__(self, path, output_target): + super().__init__(path, output_target) + + def get_word_objects(self, json_data): + return json_data['words'] + + def get_words(self, word_objects): + return [self.get_word_word(w) + for w in word_objects] + + @staticmethod + def get_word_start(word_object): + return float(word_object['time']) + + @staticmethod + def get_word_end(word_object): + return (SpeechmaticsConverter.get_word_start(word_object) + + float(word_object['duration'])) + + @staticmethod + def get_word_confidence(word_object): + return 
float(word_object['confidence']) + + @staticmethod + def get_word_word(word_object): + return word_object['name'] + + def convert_words(self, word_objects, words, tagged_words=None): + converted_words = [] + punc_before = False + punc_after = False + num_words = len(words) + index = 0 + + for i, w in enumerate(word_objects): + word_obj = self.get_word_object(w, i, tagged_words, word_objects) + if word_obj.word == '.': + continue + + if word_obj.next_word: + next_word = self.get_word_word(word_obj.next_word) + if next_word == '.': + punc_after = '.' + + converted_words.append({ + 'start': word_obj.start, + 'end': word_obj.end, + 'confidence': word_obj.confidence, + 'word': word_obj.word, + 'always_capitalized': ( + word_obj.is_proper_noun + or word_obj.word == 'I'), + 'index': index, + 'punc_after': punc_after, + 'punc_before': punc_before, + }) + + index += 1 + punc_after = False + + return converted_words def speechmatics_converter(data): @@ -55,6 +119,8 @@ def speechmatics_aligned_text_converter(data): class Exhausted(Exception): pass + Word = namedtuple('Word', 'start end word') + def get_time(transcript, index): time_index = transcript.find('time=', index) if time_index == -1: @@ -108,6 +174,3 @@ def speechmatics_aligned_text_converter(data): }) return converted_words - - -def gentle_converter diff --git a/tests/test_conversion.py b/tests/test_conversion.py new file mode 100644 index 0000000..a63f8bc --- /dev/null +++ b/tests/test_conversion.py @@ -0,0 +1,70 @@ +import json +import os + +import pytest + +from converters.amazon import AmazonConverter +from converters.speechmatics import SpeechmaticsConverter +from converters.gentle import GentleConverter + + +@pytest.fixture +def json_transcript(): + with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')) as fin: + transcript = json.load(fin) + yield transcript + + +def test_json_transcript(json_transcript): + assert json_transcript["jobName"] == "Lelandmp3" + + +def test_amazon(): + a = AmazonConverter( + 
os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), + 'interactive_transcript') + a.convert() + assert a.converted_words[0] == { + 'start': 5.49, + 'end': 5.97, + 'confidence': 1.0, + 'word': 'So', + 'always_capitalized': False, + 'index': 0, + 'punc_after': False, + 'punc_before': False + } + + +def test_speechmatics(): + a = SpeechmaticsConverter( + os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), + 'interactive_transcript') + a.convert() + assert a.converted_words[0] == { + 'start': 5.98, + 'end': 6.11, + 'confidence': 0.67, + 'word': 'For', + 'always_capitalized': False, + 'index': 0, + 'punc_after': False, + 'punc_before': False, + } + + +def test_gentle(): + a = GentleConverter( + os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), + 'interactive_transcript') + a.convert() + assert a.converted_words[0] == { + 'start': 0.35, + 'end': 1.58, + 'confidence': 1, + 'word': '[noise]', + 'always_capitalized': False, + 'index': 0, + 'punc_after': False, + 'punc_before': False + } diff --git a/tests/test_convert_viraloverlay.py b/tests/test_convert_viraloverlay.py new file mode 100644 index 0000000..df5e686 --- /dev/null +++ b/tests/test_convert_viraloverlay.py @@ -0,0 +1,23 @@ + + +import json +import os + +import pytest + +from converters.amazon import AmazonConverter +from converters.speechmatics import SpeechmaticsConverter +from converters.gentle import GentleConverter + + + +def test_gentle(): + a = GentleConverter( + os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), + 'viral_overlay') + a.convert() + assert json.loads(a.viral_overlay())[0] == { + 'start': 0.35, + 'stop': 1.58, + 'word': '[noise]', + }