gitignore

2019-02-19 17:10:07 -05:00
parent 11776eaa07
commit 37c1a44b1d
16 changed files with 303 additions and 208 deletions

.gitignore vendored (119 changes)

@@ -1,3 +1,116 @@
 .DS_Store
 .vscode
 __pycache__
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/

Pipfile (13 changes)

@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"

Pipfile.lock generated (86 changes)

@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}

README.md

@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
+
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
+
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in /usr/local/bin/:
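
For reference, the console script that setup.py and tpro/tpro.py (both below) wire up would be invoked roughly like this; the transcript filename is illustrative:

    tpro transcript.json amazon universal_transcript
    tpro -s overlay.json transcript.json speechmatics viral_overlay

The positional arguments are the input JSON (a path or the raw data), the STT service it came from, and the output format; -s/--save writes the result to a file instead of printing it.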


setup.py Normal file (26 changes)

@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+with open('README.md') as file:
+    long_description = file.read()
+
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)

tpro/__init__.py Normal file (1 change)

@@ -0,0 +1 @@
+name = 'tpro'

tpro/converter.py

@@ -1,39 +1,35 @@
 import abc
 import json
 from collections import namedtuple
-import os
-import helpers
-import converters
+from . import helpers
+from . import converters

-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')


 class TranscriptConverter:

     __metaclass__ = abc.ABCMeta

-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data

     def convert(self):
         tagged_words = None

-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
-            words = self.get_words(word_objects)
+        word_objects = self.get_word_objects(self.json_data)
+        words = self.get_words(word_objects)

-            tagged_words = helpers.tag_words(words)
+        tagged_words = helpers.tag_words(words)

-            self.converted_words = self.convert_words(
-                word_objects,
-                words,
-                tagged_words
-            )
+        self.converted_words = self.convert_words(
+            word_objects,
+            words,
+            tagged_words
+        )

     @staticmethod
     @abc.abstractmethod
@@ -71,16 +67,20 @@ class TranscriptConverter:
         pass

     @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS

     def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
         return Word(
             self.get_word_start(word_object),
             self.get_word_end(word_object),
             self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
             self.get_next_word(word_objects, index)
         )
@@ -88,19 +88,13 @@ class TranscriptConverter:
         if index < len(word_objects) - 1:
             return word_objects[index + 1]

-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
         with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
         return path
+
+
+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
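
Because of the loop just above, every callable defined in tpro/outputs.py becomes a TranscriptConverter method, and, since tpro.py builds its choices the same way, it automatically appears as an output_format option in the CLI. A minimal sketch of adding a format, assuming the word-dict keys used elsewhere in this commit (plain_text is hypothetical, not part of the commit):

# in tpro/outputs.py -- hypothetical addition
def plain_text(self):
    # bound to TranscriptConverter by the setattr loop above;
    # converted_words is the list of word dicts built by convert()
    return ' '.join(word['word'] for word in self.converted_words)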

tpro/converters/__init__.py Normal file

@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+}
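
Hooking up one of the README's planned services would mean subclassing TranscriptConverter and registering it in this dict. The full abstract-method list is cut off in the converter.py hunks above, but judging from get_word_object and convert, a subclass supplies get_word_objects, the per-word accessors, and convert_words. A rough, hypothetical skeleton:

from ..converter import TranscriptConverter

class WatsonConverter(TranscriptConverter):  # hypothetical, not in this commit

    name = 'watson'

    def get_word_objects(self, json_data):
        # the 'results' key is an assumption about Watson's response shape
        return json_data['results']

    # ...plus get_word_start, get_word_end, get_word_confidence,
    # get_word_word, get_words, and convert_words, modeled on the
    # amazon/gentle/speechmatics converters below.

Registering it would then be one more entry here: services['watson'] = WatsonConverter.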

tpro/converters/amazon.py

@@ -1,14 +1,16 @@
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class AmazonConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)

     def get_word_objects(self, json_data):
         return json_data['results']['items']
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@ class AmazonConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words
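
The new check_if_always_capitalized path leans on helpers.tag_words, which runs the word list through the Stanford NER tagger; tagged_words is a sequence of (token, tag) pairs, so tagged_words[index][1] is the entity tag checked against PROPER_NOUN_TAGS. Roughly, with the 3-class model loaded in helpers.py (illustrative output, not from this repo):

tagged_words = helpers.tag_words(['i', 'met', 'barack', 'in', 'nairobi'])
# e.g. [('i', 'O'), ('met', 'O'), ('barack', 'PERSON'),
#       ('in', 'O'), ('nairobi', 'LOCATION')]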

tpro/converters/gentle.py

@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter


 class GentleConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@ class GentleConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words

tpro/converters/speechmatics.py

@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class SpeechmaticsConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,59 +61,15 @@ class SpeechmaticsConverter(TranscriptConverter):
                 'always_capitalized': (
                     word_obj.is_proper_noun
                     or word_obj.word == 'I'),
-                'index': index,
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words


-def speechmatics_converter(data):
-    data = json.load(data)
-
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-
-        if word == '.':
-            continue
-
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-        if next_word == '.':
-            punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-        index += 1
-        punc_after = False
-
-    return converted_words
-
-
 def speechmatics_aligned_text_converter(data):
     data = data.readlines()[0]
@@ -167,7 +124,10 @@ def speechmatics_aligned_text_converter(data):
             'end': word.end,
             'confidence': 1,
             'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
             'index': i,
             'punc_before': punc_before,
             'punc_after': punc_after,

tpro/helpers.py

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger

 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -34,4 +36,8 @@ def get_punc_after(word):
         if char.isalpha():
             return punc
         if char in PUNCTUATION:
-           punc.insert(0, char)
+            punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
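
is_path is what lets the CLI's first argument be either a file path or raw JSON: only a string that contains a slash and actually exists on disk counts as a path. For example (illustrative values):

is_path('/tmp/transcript.json')   # True only if that file exists
is_path('{"words": []}')          # False: no slash, so treated as raw JSON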

tpro/outputs.py Normal file (14 changes)

@@ -0,0 +1,14 @@
+import json
+
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+        for word in self.converted_words], indent=4
+    )
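
For a converted word list, viral_overlay therefore emits JSON along these lines (values invented for illustration):

[
    {
        "start": 0.42,
        "stop": 0.61,
        "text": "Nairobi"
    }
]

with text title-cased only when the converter flagged the word always_capitalized.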

tpro/tpro.py Normal file (37 changes)

@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
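
Because of the helpers.is_path check above, JSON_PATH_OR_DATA can also be the JSON itself rather than a file path, e.g. (schematic; a real Speechmatics payload would contain word entries):

tpro '{"words": []}' speechmatics universal_transcript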