diff --git a/.gitignore b/.gitignore
index 64cadab..0447b8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,116 @@
-.DS_Store
-.vscode
-__pycache__
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/Pipfile b/Pipfile
deleted file mode 100644
index 9830ce9..0000000
--- a/Pipfile
+++ /dev/null
@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"
diff --git a/Pipfile.lock b/Pipfile.lock
deleted file mode 100644
index 523f14c..0000000
--- a/Pipfile.lock
+++ /dev/null
@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}
diff --git a/README.md b/README.md
index 6beb6ca..c946cda 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
+
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
 
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in in /usr/local/bin/:
diff --git a/converters/__init__.py b/converters/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..3c29e60
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+
+with open('README.md') as file:
+    long_description = file.read()
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)
diff --git a/tpro/__init__.py b/tpro/__init__.py
new file mode 100644
index 0000000..dc4a683
--- /dev/null
+++ b/tpro/__init__.py
@@ -0,0 +1 @@
+name = 'tpro'
diff --git a/config.py b/tpro/config.py
similarity index 100%
rename from config.py
rename to tpro/config.py
diff --git a/converter.py b/tpro/converter.py
similarity index 52%
rename from converter.py
rename to tpro/converter.py
index fb97a18..f300c48 100644
--- a/converter.py
+++ b/tpro/converter.py
@@ -1,39 +1,35 @@
 import abc
-import json
 from collections import namedtuple
 import os
 
-import helpers
-import converters
+from . import helpers
+from . import converters
 
-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
 
 
 class TranscriptConverter:
     __metaclass__ = abc.ABCMeta
 
-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data
 
     def convert(self):
         tagged_words = None
-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
-            words = self.get_words(word_objects)
+        word_objects = self.get_word_objects(self.json_data)
+        words = self.get_words(word_objects)
 
-            tagged_words = helpers.tag_words(words)
+        tagged_words = helpers.tag_words(words)
 
-            self.converted_words = self.convert_words(
-                word_objects,
-                words,
-                tagged_words
-            )
+        self.converted_words = self.convert_words(
+            word_objects,
+            words,
+            tagged_words
+        )
 
     @staticmethod
     @abc.abstractmethod
@@ -71,16 +67,20 @@ class TranscriptConverter:
         pass
 
     @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS
 
     def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
         return Word(
             self.get_word_start(word_object),
             self.get_word_end(word_object),
             self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
             self.get_next_word(word_objects, index)
         )
 
@@ -88,19 +88,13 @@
         if index < len(word_objects) - 1:
            return word_objects[index + 1]
 
-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
         with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
         return path
+
+
+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
diff --git a/tpro/converters/__init__.py b/tpro/converters/__init__.py
new file mode 100644
index 0000000..62dcdde
--- /dev/null
+++ b/tpro/converters/__init__.py
@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+    }
diff --git a/converters/amazon.py b/tpro/converters/amazon.py
similarity index 86%
rename from converters/amazon.py
rename to tpro/converters/amazon.py
index d5fc549..3b133e2 100644
--- a/converters/amazon.py
+++ b/tpro/converters/amazon.py
@@ -1,14 +1,16 @@
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class AmazonConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)
 
     def get_word_objects(self, json_data):
         return json_data['results']['items']
 
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0
 
         for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
 
-            index += 1
             punc_after = False
 
         return converted_words
diff --git a/converters/gentle.py b/tpro/converters/gentle.py
similarity index 79%
rename from converters/gentle.py
rename to tpro/converters/gentle.py
index e76ca05..2f95f58 100644
--- a/converters/gentle.py
+++ b/tpro/converters/gentle.py
@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter
+
 
 
 class GentleConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)
 
     def get_word_objects(self, json_data):
         return json_data['words']
 
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0
 
         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
 
-            index += 1
             punc_after = False
 
         return converted_words
diff --git a/converters/speechmatics.py b/tpro/converters/speechmatics.py
similarity index 71%
rename from converters/speechmatics.py
rename to tpro/converters/speechmatics.py
index 71f7d0d..7ff7c87 100644
--- a/converters/speechmatics.py
+++ b/tpro/converters/speechmatics.py
@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class SpeechmaticsConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)
 
     def get_word_objects(self, json_data):
         return json_data['words']
 
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0
 
         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,59 +61,15 @@
                 'always_capitalized': (
                     word_obj.is_proper_noun
                     or word_obj.word == 'I'),
-                'index': index,
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
 
-            index += 1
             punc_after = False
 
         return converted_words
 
 
-def speechmatics_converter(data):
-    data = json.load(data)
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-        if word == '.':
-            continue
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-            if next_word == '.':
-                punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
-
-
 def speechmatics_aligned_text_converter(data):
     data = data.readlines()[0]
 
@@ -167,7 +124,10 @@
             'end': word.end,
             'confidence': 1,
             'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
             'index': i,
             'punc_before': punc_before,
             'punc_after': punc_after,
diff --git a/helpers.py b/tpro/helpers.py
similarity index 86%
rename from helpers.py
rename to tpro/helpers.py
index 4386a3f..e7d1ad2 100644
--- a/helpers.py
+++ b/tpro/helpers.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger
 
 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -34,4 +36,8 @@ def get_punc_after(word):
         if char.isalpha():
             return punc
         if char in PUNCTUATION:
-            punc.insert(0, char)
\ No newline at end of file
+            punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
diff --git a/tpro/outputs.py b/tpro/outputs.py
new file mode 100644
index 0000000..08f4b75
--- /dev/null
+++ b/tpro/outputs.py
@@ -0,0 +1,14 @@
+import json
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+
+        for word in self.converted_words], indent=4
+    )
+
diff --git a/tpro/tpro.py b/tpro/tpro.py
new file mode 100644
index 0000000..0c9ab24
--- /dev/null
+++ b/tpro/tpro.py
@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
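For reviewers, a minimal usage sketch of the reorganized package. It assumes `tpro` is installed per the README (including the Stanford NER jar in /usr/local/bin/), and `transcript.gentle.json` is a hypothetical input file; the `services` mapping, `convert()`, `save()`, and the `universal_transcript` output name are all taken from the diff above.

```python
import json

from tpro.converters import services  # {'amazon', 'gentle', 'speechmatics'}

# Hypothetical path; any transcript in Gentle's JSON format would do.
with open('transcript.gentle.json') as fin:
    json_data = json.load(fin)

converter = services['gentle'](json_data)  # -> GentleConverter(json_data)
converter.convert()                        # populates converter.converted_words

# Output methods live in tpro/outputs.py; converter.py attaches every callable
# in that module onto TranscriptConverter, so they appear here as bound methods.
print(converter.universal_transcript())

# Or write to disk, mirroring the new click CLI:
#     tpro -s out.json transcript.gentle.json gentle universal_transcript
converter.save('out.json', 'universal_transcript')
```

Note the design choice this sketch exercises: converters no longer open files or remember an output target; they take parsed JSON and the output format is chosen at `save()`/call time, so one converted transcript can be rendered to several formats.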