gitignore
.gitignore (vendored) | 119
@@ -1,3 +1,116 @@
-.DS_Store
-.vscode
-__pycache__
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
Pipfile | 13
@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"
Pipfile.lock (generated) | 86
@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}
README.md | 35
@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
+
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
+
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in /usr/local/bin/:
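
Once installed, the command this commit adds (see tpro/tpro.py below) would be invoked roughly as follows; the file name is illustrative and the argument order follows the click declarations in that module:

    tpro transcript.json speechmatics universal_transcript -s out.json
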
setup.py (new file) | 26
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+
+with open('README.md') as file:
+    long_description = file.read()
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)
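
The `entry_points` stanza asks setuptools to generate a `tpro` executable that imports and calls `cli` from `tpro/tpro.py`. A minimal sketch of what that generated launcher is equivalent to (not part of this commit):

    # hypothetical equivalent of the console_scripts launcher
    import sys
    from tpro.tpro import cli

    if __name__ == '__main__':
        sys.exit(cli())
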
tpro/__init__.py (new file) | 1
@@ -0,0 +1 @@
+name = 'tpro'
tpro/converter.py
@@ -1,30 +1,26 @@
 import abc
 import json
 from collections import namedtuple
 import os

-import helpers
-import converters
+from . import helpers
+from . import converters


-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')


 class TranscriptConverter:

     __metaclass__ = abc.ABCMeta

-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data

     def convert(self):
         tagged_words = None

-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
+        word_objects = self.get_word_objects(self.json_data)
         words = self.get_words(word_objects)

         tagged_words = helpers.tag_words(words)
@@ -71,16 +67,20 @@ class TranscriptConverter:
         pass

     @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS

     def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
         return Word(
             self.get_word_start(word_object),
             self.get_word_end(word_object),
             self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
             self.get_next_word(word_objects, index)
         )

@@ -88,19 +88,13 @@ class TranscriptConverter:
         if index < len(word_objects) - 1:
             return word_objects[index + 1]

-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
         with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
         return path


+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
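
The trailing loop grafts every callable defined in `tpro/outputs.py` onto `TranscriptConverter`, so plain module-level functions that take `self` become methods of every converter subclass. A standalone sketch of the same pattern, with hypothetical names:

    # functions assigned onto a class become bound methods on its instances
    class Converter:
        def __init__(self, words):
            self.converted_words = words

    def word_count(self):  # hypothetical output formatter
        return str(len(self.converted_words))

    outputs = {'word_count': word_count}
    for name, val in outputs.items():
        if callable(val):
            setattr(Converter, name, val)

    print(Converter(['hello', 'world']).word_count())  # prints 2
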
tpro/converters/__init__.py (new file) | 9
@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+}
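
`services` is a simple registry: the CLI looks a converter class up by the same string the user passes as `input_format`. A standalone sketch of the lookup, with stub classes standing in for the real imports:

    # hypothetical stubs; the real classes come from the modules above
    class AmazonConverter:
        name = 'amazon'

    class GentleConverter:
        name = 'gentle'

    services = {cls.name: cls for cls in (AmazonConverter, GentleConverter)}
    converter = services['gentle']()   # dispatch chosen by a CLI argument
    print(type(converter).__name__)    # GentleConverter
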
tpro/converters/amazon.py
@@ -1,14 +1,16 @@
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class AmazonConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)

     def get_word_objects(self, json_data):
         return json_data['results']['items']
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@ class AmazonConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })

-            index += 1
             punc_after = False

         return converted_words
tpro/converters/gentle.py
@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter


 class GentleConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@ class GentleConverter(TranscriptConverter):
                 'end': word_obj.end,
                 'confidence': word_obj.confidence,
                 'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })

-            index += 1
             punc_after = False

         return converted_words
tpro/converters/speechmatics.py
@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json

-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers


 class SpeechmaticsConverter(TranscriptConverter):

-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)

     def get_word_objects(self, json_data):
         return json_data['words']
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
         punc_before = False
         punc_after = False
         num_words = len(words)
-        index = 0

         for i, w in enumerate(word_objects):
             word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,54 +61,10 @@ class SpeechmaticsConverter(TranscriptConverter):
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                 'punc_after': punc_after,
                 'punc_before': punc_before,
             })

-            index += 1
             punc_after = False

         return converted_words
-
-
-def speechmatics_converter(data):
-    data = json.load(data)
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-        if word == '.':
-            continue
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-            if next_word == '.':
-                punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
-        punc_after = False
-
-    return converted_words
@@ -167,7 +124,10 @@ def speechmatics_aligned_text_converter(data):
             'end': word.end,
             'confidence': 1,
             'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
             'index': i,
             'punc_before': punc_before,
             'punc_after': punc_after,
tpro/helpers.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger

 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -35,3 +37,7 @@ def get_punc_after(word):
             return punc
         if char in PUNCTUATION:
             punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
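
`st` is NLTK's wrapper around the Stanford NER jar; given a token list it returns `(token, label)` pairs, which is presumably what `tag_words` checks against `PROPER_NOUN_TAGS`. A hedged example, assuming the jar and model are installed as the README instructs (Java must also be on the PATH):

    # st is the StanfordNERTagger configured above
    pairs = st.tag(['Barack', 'Obama', 'visited', 'Paris'])
    # expected shape: [('Barack', 'PERSON'), ('Obama', 'PERSON'),
    #                  ('visited', 'O'), ('Paris', 'LOCATION')]
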
tpro/outputs.py (new file) | 14
@@ -0,0 +1,14 @@
+import json
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+
+        for word in self.converted_words], indent=4
+    )
+
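
Because these functions get attached to the converter class (see tpro/converter.py above), `self.converted_words` is the list built during `convert()`. For a single illustrative word dict, `viral_overlay`'s comprehension produces:

    import json
    word = {'start': 0.5, 'end': 0.9, 'word': 'paris', 'always_capitalized': True}
    print(json.dumps([{'start': word['start'],
                       'stop': word['end'],
                       'text': word['word'].title()
                       if word['always_capitalized'] else word['word']}],
                     indent=4))
    # emits a JSON list containing {"start": 0.5, "stop": 0.9, "text": "Paris"}
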
tpro/tpro.py (new file) | 37
@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
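
A hedged smoke test of the command using click's own test runner; the inline JSON stands in for a real Speechmatics transcript, and a full run still needs the Stanford NER setup from the README:

    from click.testing import CliRunner
    from tpro.tpro import cli

    runner = CliRunner()
    result = runner.invoke(
        cli, ['{"words": []}', 'speechmatics', 'universal_transcript'])
    print(result.exit_code, result.output)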