gitignore

2019-02-19 17:10:07 -05:00
parent 11776eaa07
commit 37c1a44b1d
16 changed files with 303 additions and 208 deletions

.gitignore vendored (119 changes)

@@ -1,3 +1,116 @@
 .DS_Store
 .vscode
 __pycache__
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/

Pipfile (13 changes)

@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"

Pipfile.lock generated (86 changes)

@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}

README.md

@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
+
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
+
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in /usr/local/bin/:
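
For reference, the console script that setup.py and tpro/tpro.py (both below) wire up would be invoked roughly like this; the transcript filename is illustrative:

    tpro transcript.json amazon universal_transcript
    tpro -s overlay.json transcript.json speechmatics viral_overlay

The positional arguments are the input JSON (a path or the raw data), the STT service it came from, and the output format; -s/--save writes the result to a file instead of printing it.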


setup.py Normal file (26 changes)

@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+with open('README.md') as file:
+    long_description = file.read()
+
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)

tpro/__init__.py Normal file (1 change)

@@ -0,0 +1 @@
+name = 'tpro'

tpro/converter.py

@@ -1,39 +1,35 @@
 import abc
 import json
 from collections import namedtuple
-import os
-import helpers
-import converters
+from . import helpers
+from . import converters

-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')


 class TranscriptConverter:

     __metaclass__ = abc.ABCMeta

-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data

     def convert(self):
         tagged_words = None

-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
-            words = self.get_words(word_objects)
+        word_objects = self.get_word_objects(self.json_data)
+        words = self.get_words(word_objects)

-            tagged_words = helpers.tag_words(words)
+        tagged_words = helpers.tag_words(words)

-            self.converted_words = self.convert_words(
-                word_objects,
-                words,
-                tagged_words
-            )
+        self.converted_words = self.convert_words(
+            word_objects,
+            words,
+            tagged_words
+        )

     @staticmethod
     @abc.abstractmethod
@@ -71,16 +67,20 @@ class TranscriptConverter:
         pass

     @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS

     def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
         return Word(
             self.get_word_start(word_object),
             self.get_word_end(word_object),
             self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
             self.get_next_word(word_objects, index)
         )
@@ -88,19 +88,13 @@ class TranscriptConverter:
         if index < len(word_objects) - 1:
             return word_objects[index + 1]

-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
         with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
         return path
+
+
+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
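
Because of the loop just above, every callable defined in tpro/outputs.py becomes a TranscriptConverter method, and, since tpro.py builds its choices the same way, it automatically appears as an output_format option in the CLI. A minimal sketch of adding a format, assuming the word-dict keys used elsewhere in this commit (plain_text is hypothetical, not part of the commit):

# in tpro/outputs.py -- hypothetical addition
def plain_text(self):
    # bound to TranscriptConverter by the setattr loop above;
    # converted_words is the list of word dicts built by convert()
    return ' '.join(word['word'] for word in self.converted_words)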

tpro/converters/__init__.py Normal file

@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+}
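
Hooking up one of the README's planned services would mean subclassing TranscriptConverter and registering it in this dict. The full abstract-method list is cut off in the converter.py hunks above, but judging from get_word_object and convert, a subclass supplies get_word_objects, the per-word accessors, and convert_words. A rough, hypothetical skeleton:

from ..converter import TranscriptConverter

class WatsonConverter(TranscriptConverter):  # hypothetical, not in this commit

    name = 'watson'

    def get_word_objects(self, json_data):
        # the 'results' key is an assumption about Watson's response shape
        return json_data['results']

    # ...plus get_word_start, get_word_end, get_word_confidence,
    # get_word_word, get_words, and convert_words, modeled on the
    # amazon/gentle/speechmatics converters below.

Registering it would then be one more entry here: services['watson'] = WatsonConverter.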

tpro/converters/amazon.py

@@ -1,14 +1,16 @@
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class AmazonConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)

     def get_word_objects(self, json_data):
         return json_data['results']['items']
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@ class AmazonConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words
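
The new check_if_always_capitalized path leans on helpers.tag_words, which runs the word list through the Stanford NER tagger; tagged_words is a sequence of (token, tag) pairs, so tagged_words[index][1] is the entity tag checked against PROPER_NOUN_TAGS. Roughly, with the 3-class model loaded in helpers.py (illustrative output, not from this repo):

tagged_words = helpers.tag_words(['i', 'met', 'barack', 'in', 'nairobi'])
# e.g. [('i', 'O'), ('met', 'O'), ('barack', 'PERSON'),
#       ('in', 'O'), ('nairobi', 'LOCATION')]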

tpro/converters/gentle.py

@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter


 class GentleConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@ class GentleConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words

tpro/converters/speechmatics.py

@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class SpeechmaticsConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,59 +61,15 @@ class SpeechmaticsConverter(TranscriptConverter):
                 'always_capitalized': (
                     word_obj.is_proper_noun
                     or word_obj.word == 'I'),
-                'index': index,
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })
-            index += 1
             punc_after = False

         return converted_words


-def speechmatics_converter(data):
-    data = json.load(data)
-
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-
-        if word == '.':
-            continue
-
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-        if next_word == '.':
-            punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-        index += 1
-        punc_after = False
-
-    return converted_words
-
-
 def speechmatics_aligned_text_converter(data):
     data = data.readlines()[0]
@@ -167,7 +124,10 @@ def speechmatics_aligned_text_converter(data):
             'end': word.end,
             'confidence': 1,
             'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
             'index': i,
             'punc_before': punc_before,
             'punc_after': punc_after,

tpro/helpers.py

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger

 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -34,4 +36,8 @@ def get_punc_after(word):
         if char.isalpha():
             return punc
         if char in PUNCTUATION:
-           punc.insert(0, char)
+            punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
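
is_path is what lets the CLI's first argument be either a file path or raw JSON: only a string that contains a slash and actually exists on disk counts as a path. For example (illustrative values):

is_path('/tmp/transcript.json')   # True only if that file exists
is_path('{"words": []}')          # False: no slash, so treated as raw JSON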

tpro/outputs.py Normal file (14 changes)

@@ -0,0 +1,14 @@
+import json
+
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+        for word in self.converted_words], indent=4
+    )
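
For a converted word list, viral_overlay therefore emits JSON along these lines (values invented for illustration):

[
    {
        "start": 0.42,
        "stop": 0.61,
        "text": "Nairobi"
    }
]

with text title-cased only when the converter flagged the word always_capitalized.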

tpro/tpro.py Normal file (37 changes)

@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
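
Because of the helpers.is_path check above, JSON_PATH_OR_DATA can also be the JSON itself rather than a file path, e.g. (schematic; a real Speechmatics payload would contain word entries):

tpro '{"words": []}' speechmatics universal_transcript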