Commit: gitignore
.gitignore (vendored, 119 lines changed)
@@ -1,3 +1,116 @@
-.DS_Store
-.vscode
-__pycache__
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
Pipfile (13 lines deleted)
@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"
Pipfile.lock (generated, 86 lines deleted)
@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}
README.md (35 lines changed)
@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
 
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
+
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in /usr/local/bin/:
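A quick way to sanity-check the Stanford NER requirement described above once the files are in place. This is a sketch: the model path is the one hard-coded in tpro/helpers.py below, but the jar path is a guess (the diff truncates the second argument to StanfordNERTagger), and running it needs Java installed:

    from nltk.tag.stanford import StanfordNERTagger

    # Model path matches tpro/helpers.py; the jar path is assumed, following the
    # README's "put these files in /usr/local/bin/" instruction.
    st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
                           '/usr/local/bin/stanford-ner.jar')
    print(st.tag('Paris is lovely'.split()))  # e.g. [('Paris', 'LOCATION'), ...]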
setup.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+
+with open('README.md') as file:
+    long_description = file.read()
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)
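The `entry_points` block above is what makes a `tpro` shell command exist after `pip install`; the wrapper script setuptools generates is roughly equivalent to this sketch (not the literal generated file):

    import sys

    from tpro.tpro import cli

    if __name__ == '__main__':
        sys.exit(cli())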
tpro/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+name = 'tpro'
tpro/converter.py
@@ -1,30 +1,26 @@
 import abc
-import json
 from collections import namedtuple
 import os
 
-import helpers
-import converters
+from . import helpers
+from . import converters
 
 
-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
 
 
 class TranscriptConverter:
 
     __metaclass__ = abc.ABCMeta
 
-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data
 
     def convert(self):
        tagged_words = None
 
-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
+        word_objects = self.get_word_objects(self.json_data)
        words = self.get_words(word_objects)
 
        tagged_words = helpers.tag_words(words)
@@ -71,16 +67,20 @@ class TranscriptConverter:
        pass
 
    @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS
 
    def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
        return Word(
            self.get_word_start(word_object),
            self.get_word_end(word_object),
            self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
            self.get_next_word(word_objects, index)
        )
 
@@ -88,19 +88,13 @@ class TranscriptConverter:
        if index < len(word_objects) - 1:
            return word_objects[index + 1]
 
-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
        with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
        return path
+
+
+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
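The new tail of converter.py grafts every callable in `outputs` onto `TranscriptConverter`, so each output format becomes an ordinary method. A self-contained sketch of that pattern, with invented `Greeter`/`shout` names standing in for the real classes:

    import types

    def shout(self):
        # a module-level function written with a `self` parameter on purpose,
        # like the functions in tpro/outputs.py
        return self.text.upper()

    fake_outputs = types.SimpleNamespace(shout=shout)

    class Greeter:
        def __init__(self, text):
            self.text = text

    for name, val in vars(fake_outputs).items():
        if callable(val):
            setattr(Greeter, name, val)  # a function set on a class binds as a method

    print(Greeter('hi').shout())  # -> HI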
tpro/converters/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+}
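The `services` dict gives the CLI a string-to-class dispatch table. A hedged sketch of the lookup; the payload shape is an invented minimal stand-in for real Amazon Transcribe output, and actually converting it would still need the NER jar:

    from tpro.converters import services

    payload = {'results': {'items': []}}  # invented; real payloads have more fields
    ConverterCls = services['amazon']     # string -> AmazonConverter
    converter = ConverterCls(payload)     # the new json_data-based constructor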
tpro/converters/amazon.py
@@ -1,14 +1,16 @@
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class AmazonConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)
 
    def get_word_objects(self, json_data):
        return json_data['results']['items']
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@ class AmazonConverter(TranscriptConverter):
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
            punc_after = False
 
        return converted_words
tpro/converters/gentle.py
@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter
 
 
 class GentleConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)
 
    def get_word_objects(self, json_data):
        return json_data['words']
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@ class GentleConverter(TranscriptConverter):
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
            punc_after = False
 
        return converted_words
tpro/converters/speechmatics.py
@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class SpeechmaticsConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)
 
    def get_word_objects(self, json_data):
        return json_data['words']
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,54 +61,10 @@ class SpeechmaticsConverter(TranscriptConverter):
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
-                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
-            punc_after = False
-
-        return converted_words
-
-
-def speechmatics_converter(data):
-    data = json.load(data)
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-        if word == '.':
-            continue
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-            if next_word == '.':
-                punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
            punc_after = False
 
        return converted_words
@@ -167,7 +124,10 @@ def speechmatics_aligned_text_converter(data):
            'end': word.end,
            'confidence': 1,
            'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
tpro/helpers.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger
 
 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -35,3 +37,7 @@ def get_punc_after(word):
            return punc
        if char in PUNCTUATION:
            punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
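`is_path` is how the new CLI decides whether its first argument is a file path or inline JSON. Its two-line body is copied here so it can be tried standalone; the sample strings are illustrative:

    from pathlib import Path

    def is_path(string):
        return '/' in string and Path(string).exists()

    print(is_path('/tmp'))           # True on most Unix systems
    print(is_path('{"words": []}'))  # False, so the CLI json.loads() it instead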
tpro/outputs.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+import json
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+
+        for word in self.converted_words], indent=4
+    )
+
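For concreteness, what `viral_overlay` emits for one word, using a hand-made `converted_words` entry in place of real converter output (all field values invented):

    import json

    converted_words = [{'start': 0.0, 'end': 0.42, 'confidence': 0.98,
                        'word': 'paris', 'always_capitalized': True,
                        'punc_before': False, 'punc_after': False}]

    overlay = [{'start': w['start'],
                'stop': w['end'],
                'text': w['word'].title() if w['always_capitalized'] else w['word']}
               for w in converted_words]

    print(json.dumps(overlay, indent=4))  # one entry: {"start": 0.0, "stop": 0.42, "text": "Paris"}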
tpro/tpro.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
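A way to exercise the new command without installing the console script, using click's test runner. The inline payload is a made-up minimal Speechmatics-style transcript, and a real conversion still needs the Stanford NER jar from the README:

    import json

    from click.testing import CliRunner

    from tpro.tpro import cli

    payload = json.dumps({'words': []})  # hypothetical minimal transcript
    result = CliRunner().invoke(
        cli, [payload, 'speechmatics', 'universal_transcript'])
    print(result.output)

From the shell, the equivalent would be `tpro '<json>' speechmatics universal_transcript`, with `-s out.json` to write to a file instead of stdout.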