diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..308b264 --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pytest = "*" + +[packages] +tpro = {editable = true,path = "."} + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..75ad4bc --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,104 @@ +{ + "_meta": { + "hash": { + "sha256": "49961036ff9465d1da8edf8b981512812678348e4baaa0c51841df64e80533ad" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "click": { + "hashes": [ + "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", + "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" + ], + "version": "==7.0" + }, + "nltk": { + "hashes": [ + "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d" + ], + "version": "==3.4" + }, + "singledispatch": { + "hashes": [ + "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c", + "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8" + ], + "version": "==3.4.0.3" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "tpro": { + "editable": true, + "path": "." 
+ } + }, + "develop": { + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "more-itertools": { + "hashes": [ + "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40", + "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1" + ], + "markers": "python_version > '2.7'", + "version": "==6.0.0" + }, + "pluggy": { + "hashes": [ + "sha256:19ecf9ce9db2fce065a7a0586e07cfb4ac8614fe96edf628a264b1c70116cf8f", + "sha256:84d306a647cc805219916e62aab89caa97a33a1dd8c342e87a37f91073cd4746" + ], + "version": "==0.9.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pytest": { + "hashes": [ + "sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c", + "sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4" + ], + "index": "pypi", + "version": "==4.3.0" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + } +} diff --git a/setup.py b/setup.py index 1ff7d06..0cc2fbd 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('README_PYPI.md') as file: setup( name="tpro", - version="0.08", + version="0.09", url='https://github.com/zevaverbach/tpro', install_requires=[ 'Click', @@ -21,6 +21,6 @@ setup( long_description=long_description, entry_points=''' [console_scripts] - tpro=tpro.tpro:cli + 
tpro=transcript_processing.tpro:cli ''', ) diff --git a/tests/test_conversion.py b/tests/test_conversion.py index a63f8bc..015d681 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -3,9 +3,10 @@ import os import pytest -from converters.amazon import AmazonConverter -from converters.speechmatics import SpeechmaticsConverter -from converters.gentle import GentleConverter +from transcript_processing.converters.amazon import AmazonConverter +from transcript_processing.converters.speechmatics import SpeechmaticsConverter +from transcript_processing.converters.gentle import GentleConverter +from transcript_processing.converters.google import GoogleConverter @pytest.fixture @@ -20,9 +21,10 @@ def test_json_transcript(json_transcript): def test_amazon(): - a = AmazonConverter( - os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), - 'interactive_transcript') + with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), 'r') as fin: + json_data = json.load(fin) + + a = AmazonConverter(json_data) a.convert() assert a.converted_words[0] == { 'start': 5.49, @@ -30,16 +32,17 @@ def test_amazon(): 'confidence': 1.0, 'word': 'So', 'always_capitalized': False, - 'index': 0, 'punc_after': False, 'punc_before': False } def test_speechmatics(): - a = SpeechmaticsConverter( - os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), - 'interactive_transcript') + with open(os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), 'r') as fin: + json_data = json.load(fin) + + a = SpeechmaticsConverter(json_data) + a.convert() assert a.converted_words[0] == { 'start': 5.98, @@ -47,16 +50,16 @@ def test_speechmatics(): 'confidence': 0.67, 'word': 'For', 'always_capitalized': False, - 'index': 0, 'punc_after': False, 'punc_before': False, } def test_gentle(): - a = GentleConverter( - os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), - 'interactive_transcript') + with open(os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), 'r') as fin: + json_data = json.load(fin) + + a = GentleConverter(json_data) a.convert() assert 
a.converted_words[0] == { 'start': 0.35, @@ -64,7 +67,25 @@ def test_gentle(): 'confidence': 1, 'word': '[noise]', 'always_capitalized': False, - 'index': 0, 'punc_after': False, 'punc_before': False } + + +def test_google(): + with open(os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE'), 'r') as fin: + transcript_data = fin.read() + + g = GoogleConverter(transcript_data) + + g.convert() + assert g.converted_words[0] == { + 'start': 4, + 'end': 5.5, + 'confidence': 0.88, + 'word': 'Testing', + 'always_capitalized': False, + 'punc_after': [','], + 'punc_before': False, + } + diff --git a/tests/test_convert_google.py b/tests/test_convert_google.py new file mode 100644 index 0000000..7fa5f6c --- /dev/null +++ b/tests/test_convert_google.py @@ -0,0 +1,32 @@ +import json +import os + +import pytest + +from transcript_processing.converters.google import ( + make_json_friendly, + GoogleConverter, + ) +from transcript_processing.config import GOOGLE_TRANSCRIPT_TEST_FILE + + +@pytest.fixture +def transcript(): + with open(GOOGLE_TRANSCRIPT_TEST_FILE, 'r') as fin: + return fin.read() + + +def test_make_json_friendly(transcript): + friendly = make_json_friendly(transcript) + assert json.loads(friendly) + + +def test_pre_process(transcript): + with open(os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE'), 'r') as fin: + transcript_data = fin.read() + + g = GoogleConverter(transcript_data) + assert g.json_data + + + diff --git a/tpro/__init__.py b/transcript_processing/__init__.py similarity index 100% rename from tpro/__init__.py rename to transcript_processing/__init__.py diff --git a/tpro/config.py b/transcript_processing/config.py similarity index 53% rename from tpro/config.py rename to transcript_processing/config.py index d99b18c..97ab743 100644 --- a/tpro/config.py +++ b/transcript_processing/config.py @@ -2,3 +2,4 @@ import os AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE') +GOOGLE_TRANSCRIPT_TEST_FILE = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE') diff --git 
a/tpro/converter.py b/transcript_processing/converter.py similarity index 89% rename from tpro/converter.py rename to transcript_processing/converter.py index f300c48..28a1582 100644 --- a/tpro/converter.py +++ b/transcript_processing/converter.py @@ -14,7 +14,7 @@ class TranscriptConverter: __metaclass__ = abc.ABCMeta - def __init__(self, json_data): + def __init__(self, json_data: dict): self.json_data = json_data def convert(self): @@ -31,19 +31,16 @@ class TranscriptConverter: tagged_words ) - @staticmethod @abc.abstractmethod - def get_word_objects(json_data): + def get_word_objects(self, json_data): pass - @staticmethod - @abc.abstractmethod - def get_words(word_objects): - pass + def get_words(self, word_objects): + return [self.get_word_word(w) + for w in word_objects] - @staticmethod @abc.abstractmethod - def convert_words(word_objects, words, tagged_words=None): + def convert_words(self, word_objects, words, tagged_words=None): pass @staticmethod diff --git a/tpro/converters/__init__.py b/transcript_processing/converters/__init__.py similarity index 78% rename from tpro/converters/__init__.py rename to transcript_processing/converters/__init__.py index 62dcdde..22ac580 100644 --- a/tpro/converters/__init__.py +++ b/transcript_processing/converters/__init__.py @@ -1,9 +1,11 @@ from .amazon import AmazonConverter from .speechmatics import SpeechmaticsConverter from .gentle import GentleConverter +from .google import GoogleConverter services = { 'amazon': AmazonConverter, 'gentle': GentleConverter, 'speechmatics': SpeechmaticsConverter, + 'google': GoogleConverter, } diff --git a/tpro/converters/amazon.py b/transcript_processing/converters/amazon.py similarity index 93% rename from tpro/converters/amazon.py rename to transcript_processing/converters/amazon.py index 3b133e2..4196f2a 100644 --- a/tpro/converters/amazon.py +++ b/transcript_processing/converters/amazon.py @@ -15,10 +15,6 @@ class AmazonConverter(TranscriptConverter): def get_word_objects(self, 
json_data): return json_data['results']['items'] - def get_words(self, word_objects): - return [self.get_word_word(w) - for w in word_objects] - @staticmethod def get_word_start(word_object): return float(word_object['start_time']) @@ -32,7 +28,7 @@ class AmazonConverter(TranscriptConverter): return float(word_object['alternatives'][0]['confidence']) @staticmethod - def get_word_word(word_object): + def get_word_word(word_object) -> str: word_word = word_object['alternatives'][0]['content'] if word_word == 'i': # weird Amazon quirk @@ -44,7 +40,6 @@ class AmazonConverter(TranscriptConverter): punc_before = False punc_after = False - num_words = len(words) for i, w in enumerate(word_objects): if w['type'] == 'punctuation': diff --git a/tpro/converters/gentle.py b/transcript_processing/converters/gentle.py similarity index 79% rename from tpro/converters/gentle.py rename to transcript_processing/converters/gentle.py index 2f95f58..2d4defc 100644 --- a/tpro/converters/gentle.py +++ b/transcript_processing/converters/gentle.py @@ -7,16 +7,12 @@ class GentleConverter(TranscriptConverter): name = 'gentle' - def __init__(self, path): - super().__init__(path) + def __init__(self, json_data): + super().__init__(json_data) def get_word_objects(self, json_data): return json_data['words'] - def get_words(self, word_objects): - return [self.get_word_word(w) - for w in word_objects] - @staticmethod def get_word_start(word_object): return word_object['start'] @@ -35,8 +31,6 @@ class GentleConverter(TranscriptConverter): def convert_words(self, word_objects, words, tagged_words=None): converted_words = [] - punc_before = False - punc_after = False num_words = len(words) for i, w in enumerate(word_objects): @@ -51,8 +45,8 @@ class GentleConverter(TranscriptConverter): word_obj.word, i, tagged_words), - 'punc_after': punc_after, - 'punc_before': punc_before, + 'punc_after': False, + 'punc_before': False, }) punc_after = False diff --git a/transcript_processing/converters/google.py 
import json
import re

from ..converter import TranscriptConverter
from .. import helpers


class GoogleConverter(TranscriptConverter):
    """Converter for Google transcripts supplied as raw (non-JSON) text.

    The constructor takes the transcript text, rewrites it into JSON with
    ``make_json_friendly`` and stores the parsed result on ``json_data``.
    """

    def __init__(self, transcript_data: str):
        """Parse ``transcript_data`` and keep the result on ``self.json_data``.

        The base initialiser is still called with the raw text; the
        attribute is then overwritten with the parsed structure.
        """
        super().__init__(transcript_data)
        self.json_data = self.pre_process(transcript_data)

    def pre_process(self, transcript_data):
        """Return ``transcript_data`` rewritten as JSON and parsed.

        Raises whatever ``json.loads`` raises if the rewritten text is not
        valid JSON.
        """
        friendly = make_json_friendly(transcript_data)
        return json.loads(friendly)

    def get_word_objects(self, json_data):
        """Return ``json_data`` unchanged; it is used directly as the word list."""
        return json_data

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build one output dict per entry of ``word_objects``.

        Each dict carries ``start``, ``end``, ``confidence``, ``word``,
        ``always_capitalized``, ``punc_after`` and ``punc_before``.
        ``words`` is accepted for signature compatibility and is not read
        here.
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            # NOTE(review): get_word_object and check_if_always_capitalized
            # are not defined in this module; presumably inherited from
            # TranscriptConverter -- confirm.
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            # Falsy helper results are normalised to False so the output
            # always holds either the helper's value or False.
            punc_before = helpers.get_punc_before(word_obj.word) or False
            punc_after = helpers.get_punc_after(word_obj.word) or False

            # Strip len(punc_before) leading and len(punc_after) trailing
            # characters so 'word' holds the token without that punctuation.
            the_word = word_obj.word
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
                the_word = the_word[:-len(punc_after)]

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': the_word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words

    @classmethod
    def get_word_start(cls, word_object):
        """Return the word's ``start_time`` converted to seconds."""
        return cls.get_seconds(word_object['start_time'])

    @classmethod
    def get_word_end(cls, word_object):
        """Return the word's ``end_time`` converted to seconds."""
        return cls.get_seconds(word_object['end_time'])

    @staticmethod
    def get_seconds(time: dict) -> float:
        """Combine optional ``seconds`` and ``nanos`` entries into seconds.

        A missing key contributes nothing; with neither key present the
        result is the integer ``0``.
        """
        seconds = 0
        if 'seconds' in time:
            seconds = time['seconds']
        if 'nanos' in time:
            seconds += time['nanos'] / 1_000_000_000
        return seconds

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word's ``confidence`` value as stored."""
        return word_object['confidence']

    @staticmethod
    def get_word_word(word_object):
        """Return the word's text (the ``word`` entry)."""
        return word_object['word']


def make_json_friendly(json_string):
    """Rewrite a raw Google transcript dump as a JSON array of word objects.

    The input is split on the two-character sequence backslash + ``n``
    (not on real newlines) -- presumably because the dump stores its line
    breaks escaped; confirm against the producer of the file.  The lines
    are then walked while expecting the fields of one word entry in a
    fixed order (``words {``, ``start_time {``, ``}``, ``end_time {``,
    ``}``, ``word: ``, ``confidence: ``).  Lines seen before the first
    ``words {`` are dropped.

    Returns the rewritten text; it is not validated here.
    """
    lines = [line.strip() for line in json_string.split('\\n')]

    # Markers of one word entry, in the order they are expected to appear.
    fields = [
            'words {',
            'start_time {',
            '}',
            'end_time {',
            '}',
            'word: ',
            'confidence: '
            ]

    current_field_index = 0
    new_string = ''

    for line in lines:

        current_field = fields[current_field_index]

        if current_field in line:
            # Expected marker found: advance, wrapping after the last one.
            if current_field_index == len(fields) - 1:
                current_field_index = 0
            else:
                current_field_index += 1
                if current_field_index == 1:
                    # Close the previous object and open the next one.
                    new_string += '}, {'
                    # "words" was found, don't want to append that
                    continue

        else:
            if current_field_index == 0:
                # haven't found the beginning of the next word object
                continue

        # add quotes around keys
        line = re.sub('^(?!")([0-9a-zA-Z_]+)',
                '"\\1"',
                line)

        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')

        # use first two decimals of confidence
        # (20 chars == len(', "confidence": 0.88'); the value is truncated,
        # not rounded)
        if 'confidence' in current_field:
            line = ', ' + line
            line = line[:20]

        # Every line emitted while a closing brace is awaited gets a
        # trailing separator; surplus ones are removed in the cleanup below.
        if current_field == '}':
            line = line + ', '

        new_string += line

    # cleanup
    # Drop the separator emitted before the first object, wrap everything
    # in a list, remove trailing commas before '}' and strip backslashes.
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    new_string = new_string.replace(', }', '}').replace('\\', '')

    return new_string
[self.get_word_word(w) - for w in word_objects] - @staticmethod def get_word_start(word_object): return float(word_object['time']) diff --git a/tpro/helpers.py b/transcript_processing/helpers.py similarity index 100% rename from tpro/helpers.py rename to transcript_processing/helpers.py diff --git a/tpro/outputs.py b/transcript_processing/outputs.py similarity index 100% rename from tpro/outputs.py rename to transcript_processing/outputs.py diff --git a/tpro/tpro.py b/transcript_processing/tpro.py similarity index 100% rename from tpro/tpro.py rename to transcript_processing/tpro.py