Commit: gitignore
.gitignore (vendored, 119 lines changed)
@@ -1,3 +1,116 @@
-.DS_Store
-.vscode
-__pycache__
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
Pipfile (13 lines deleted)
@@ -1,13 +0,0 @@
-[[source]]
-name = "pypi"
-url = "https://pypi.org/simple"
-verify_ssl = true
-
-[dev-packages]
-
-[packages]
-nltk = "*"
-pytest = "*"
-
-[requires]
-python_version = "3.7"
Pipfile.lock (generated, 86 lines deleted)
@@ -1,86 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1b05879d48694b4c7fe1234da3a3744660dd38272835812cc05b270d6f9b06de"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3.7"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "atomicwrites": {
-            "hashes": [
-                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
-                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
-            ],
-            "version": "==1.3.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
-                "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
-            ],
-            "version": "==18.2.0"
-        },
-        "more-itertools": {
-            "hashes": [
-                "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4",
-                "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc",
-                "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9"
-            ],
-            "version": "==5.0.0"
-        },
-        "nltk": {
-            "hashes": [
-                "sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
-            ],
-            "index": "pypi",
-            "version": "==3.4"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
-                "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
-            ],
-            "version": "==0.8.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
-                "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
-            ],
-            "version": "==1.7.0"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
-                "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
-            ],
-            "index": "pypi",
-            "version": "==4.2.0"
-        },
-        "singledispatch": {
-            "hashes": [
-                "sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
-                "sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
-            ],
-            "version": "==3.4.0.3"
-        },
-        "six": {
-            "hashes": [
-                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
-                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
-            ],
-            "version": "==1.12.0"
-        }
-    },
-    "develop": {}
-}
README.md (35 lines changed)
@@ -1,4 +1,37 @@
-# Non-pip Requirement: Stanford NER JAR
+# tpro
 
+Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+various speech-to-text services and converts them to various standardized
+formats.
+
+# STT Services
+
+- [Speechmatics](https://www.speechmatics.com/)
+- [Amazon Transcribe](https://aws.amazon.com/transcribe/)
+- [Gentle](https://github.com/lowerquality/gentle)
+
+## Planned
+
+- [Watson](https://www.ibm.com/watson/services/speech-to-text/)
+- [Google Speech](https://cloud.google.com/speech-to-text/)
+- [Mozilla's new open-source STT thing](https://github.com/mozilla/DeepSpeech)
+
+# Output Formats
+
+- [Universal Transcript](https://gist.github.com/zevaverbach/d2b7a19397607677878aa3268fda1002#example) (JSON)
+- [viraloverlay](https://github.com/zevaverbach/viraloverlay) (JSON)
+
+## Planned
+
+- Word (`.doc`, `.docx`)
+- text files
+- SRT (subtitles)
+
+# Installation
+
+    pip install tpro
+
+## Non-pip Requirement: Stanford NER JAR
+
 - download the .jar [here](https://nlp.stanford.edu/software/CRF-NER.shtml#Download)
 - put these files in /usr/local/bin/:
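A quick way to sanity-check the Stanford NER requirement described above once the files are in place. This is a sketch: the model path is the one hard-coded in tpro/helpers.py below, but the jar path is a guess (the diff truncates the second argument to StanfordNERTagger), and running it needs Java installed:

    from nltk.tag.stanford import StanfordNERTagger

    # Model path matches tpro/helpers.py; the jar path is assumed, following the
    # README's "put these files in /usr/local/bin/" instruction.
    st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
                           '/usr/local/bin/stanford-ner.jar')
    print(st.tag('Paris is lovely'.split()))  # e.g. [('Paris', 'LOCATION'), ...]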
setup.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+
+with open('README.md') as file:
+    long_description = file.read()
+
+setup(
+    name="tpro",
+    version="0.01",
+    url='https://github.com/zevaverbach/tpro',
+    install_requires=[
+        'Click',
+        'nltk',
+    ],
+    include_package_data=True,
+    packages=find_packages(),
+    description=(
+        'tpro processes transcripts from speech-to-text services and outputs '
+        'to various formats.'),
+    long_description_content_type='text/markdown',
+    long_description=long_description,
+    entry_points='''
+        [console_scripts]
+        tpro=tpro.tpro:cli
+    ''',
+)
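The `entry_points` block above is what makes a `tpro` shell command exist after `pip install`; the wrapper script setuptools generates is roughly equivalent to this sketch (not the literal generated file):

    import sys

    from tpro.tpro import cli

    if __name__ == '__main__':
        sys.exit(cli())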
tpro/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+name = 'tpro'
tpro/converter.py
@@ -1,30 +1,26 @@
 import abc
-import json
 from collections import namedtuple
 import os
 
-import helpers
-import converters
+from . import helpers
+from . import converters
 
 
-Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
+Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
 
 
 class TranscriptConverter:
 
     __metaclass__ = abc.ABCMeta
 
-    def __init__(self, path, output_target):
-        self.path = path
-        self.output_target = output_target
+    def __init__(self, json_data):
+        self.json_data = json_data
 
     def convert(self):
        tagged_words = None
 
-        with open(self.path) as f:
-            data = json.load(f)
-            word_objects = self.get_word_objects(data)
+        word_objects = self.get_word_objects(self.json_data)
        words = self.get_words(word_objects)
 
        tagged_words = helpers.tag_words(words)
@@ -71,16 +67,20 @@ class TranscriptConverter:
        pass
 
    @staticmethod
-    def check_if_proper_noun(index, tagged_words):
-        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
+    def check_if_always_capitalized(word, index, tagged_words):
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS
 
    def get_word_object(self, word_object, index, tagged_words, word_objects):
+        word = self.get_word_word(word_object)
        return Word(
            self.get_word_start(word_object),
            self.get_word_end(word_object),
            self.get_word_confidence(word_object),
-            self.get_word_word(word_object),
-            self.check_if_proper_noun(index, tagged_words),
+            word,
+            self.check_if_always_capitalized(word, index, tagged_words),
            self.get_next_word(word_objects, index)
        )
 
@@ -88,19 +88,13 @@ class TranscriptConverter:
        if index < len(word_objects) - 1:
            return word_objects[index + 1]
 
-    def interactive_transcript(self):
-        return json.dumps(self.converted_words, indent=4)
-
-    def viral_overlay(self):
-        return json.dumps(
-            [{'start': word['start'],
-              'stop': word['end'],
-              'text': word['word']}
-             for word in self.converted_words],
-            indent=4
-        )
-
-    def save(self, path):
+    def save(self, path, output_target):
        with open(path, 'w') as fout:
-            fout.write(getattr(self, self.output_target)())
+            fout.write(getattr(self, output_target)())
        return path
+
+
+from . import outputs
+for name, val in outputs.__dict__.items():
+    if callable(val):
+        setattr(TranscriptConverter, name, val)
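The new tail of converter.py grafts every callable in `outputs` onto `TranscriptConverter`, so each output format becomes an ordinary method. A self-contained sketch of that pattern, with invented `Greeter`/`shout` names standing in for the real classes:

    import types

    def shout(self):
        # a module-level function written with a `self` parameter on purpose,
        # like the functions in tpro/outputs.py
        return self.text.upper()

    fake_outputs = types.SimpleNamespace(shout=shout)

    class Greeter:
        def __init__(self, text):
            self.text = text

    for name, val in vars(fake_outputs).items():
        if callable(val):
            setattr(Greeter, name, val)  # a function set on a class binds as a method

    print(Greeter('hi').shout())  # -> HI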
tpro/converters/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+from .amazon import AmazonConverter
+from .speechmatics import SpeechmaticsConverter
+from .gentle import GentleConverter
+
+services = {
+    'amazon': AmazonConverter,
+    'gentle': GentleConverter,
+    'speechmatics': SpeechmaticsConverter,
+}
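The `services` dict gives the CLI a string-to-class dispatch table. A hedged sketch of the lookup; the payload shape is an invented minimal stand-in for real Amazon Transcribe output, and actually converting it would still need the NER jar:

    from tpro.converters import services

    payload = {'results': {'items': []}}  # invented; real payloads have more fields
    ConverterCls = services['amazon']     # string -> AmazonConverter
    converter = ConverterCls(payload)     # the new json_data-based constructor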
tpro/converters/amazon.py
@@ -1,14 +1,16 @@
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class AmazonConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'amazon'
+
+    def __init__(self, json_data):
+        super().__init__(json_data)
 
    def get_word_objects(self, json_data):
        return json_data['results']['items']
@@ -43,7 +45,6 @@ class AmazonConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
@@ -72,15 +73,14 @@ class AmazonConverter(TranscriptConverter):
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
            punc_after = False
 
        return converted_words
tpro/converters/gentle.py
@@ -1,11 +1,14 @@
-from converter import TranscriptConverter
+from ..converter import TranscriptConverter
 
 
 class GentleConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'gentle'
+
+    def __init__(self, path):
+        super().__init__(path)
 
    def get_word_objects(self, json_data):
        return json_data['words']
@@ -35,7 +38,6 @@ class GentleConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -45,15 +47,14 @@ class GentleConverter(TranscriptConverter):
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
-                'always_capitalized': (
-                    word_obj.is_proper_noun
-                    or word_obj.word == 'I'),
-                'index': index,
+                'always_capitalized': self.check_if_always_capitalized(
+                    word_obj.word,
+                    i,
+                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
            punc_after = False
 
        return converted_words
tpro/converters/speechmatics.py
@@ -1,15 +1,17 @@
 from collections import namedtuple
 import json
 
-from converter import TranscriptConverter
-import helpers
+from ..converter import TranscriptConverter
+from .. import helpers
 
 
 class SpeechmaticsConverter(TranscriptConverter):
 
-    def __init__(self, path, output_target):
-        super().__init__(path, output_target)
+    name = 'speechmatics'
+
+    def __init__(self, path):
+        super().__init__(path)
 
    def get_word_objects(self, json_data):
        return json_data['words']
@@ -40,7 +42,6 @@ class SpeechmaticsConverter(TranscriptConverter):
        punc_before = False
        punc_after = False
        num_words = len(words)
-        index = 0
 
        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
@@ -60,54 +61,10 @@ class SpeechmaticsConverter(TranscriptConverter):
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
-                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
 
-            index += 1
-            punc_after = False
-
-        return converted_words
-
-
-def speechmatics_converter(data):
-    data = json.load(data)
-    converted_words = []
-    words = data['words']
-    tagged_words = helpers.tag_words([w['name'] for w in words])
-    punc_before = False
-    punc_after = False
-    num_words = len(words)
-    index = 0
-
-    for i, w in enumerate(words):
-        word_start = float(w['time'])
-        word_end = word_start + float(w['duration'])
-        confidence = float(w['confidence'])
-        word = w['name']
-        if word == '.':
-            continue
-        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
-
-        next_word = None
-        if i < num_words - 1:
-            next_word = words[i + 1]['name']
-            if next_word == '.':
-                punc_after = '.'
-
-        converted_words.append({
-            'start': word_start,
-            'end': word_end,
-            'confidence': confidence,
-            'word': word,
-            'always_capitalized': is_proper_noun or word == 'I',
-            'index': index,
-            'punc_after': punc_after,
-            'punc_before': punc_before,
-        })
-
-        index += 1
            punc_after = False
 
        return converted_words
@@ -167,7 +124,10 @@ def speechmatics_aligned_text_converter(data):
            'end': word.end,
            'confidence': 1,
            'word': the_word,
-            'always_capitalized': is_proper_noun or word == 'I',
+            'always_capitalized': self.check_if_always_capitalized(
+                word.word,
+                i,
+                tagged_words),
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
tpro/helpers.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from nltk.tag.stanford import StanfordNERTagger
 
 st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
@@ -35,3 +37,7 @@ def get_punc_after(word):
            return punc
        if char in PUNCTUATION:
            punc.insert(0, char)
+
+
+def is_path(string):
+    return '/' in string and Path(string).exists()
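`is_path` is how the new CLI decides whether its first argument is a file path or inline JSON. Its two-line body is copied here so it can be tried standalone; the sample strings are illustrative:

    from pathlib import Path

    def is_path(string):
        return '/' in string and Path(string).exists()

    print(is_path('/tmp'))           # True on most Unix systems
    print(is_path('{"words": []}'))  # False, so the CLI json.loads() it instead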
tpro/outputs.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+import json
+
+def universal_transcript(self):
+    return json.dumps(self.converted_words, indent=4)
+
+def viral_overlay(self):
+    return json.dumps([{
+        'start': word['start'],
+        'stop': word['end'],
+        'text': word['word'].title() if word['always_capitalized'] else word['word']}
+
+        for word in self.converted_words], indent=4
+    )
+
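For concreteness, what `viral_overlay` emits for one word, using a hand-made `converted_words` entry in place of real converter output (all field values invented):

    import json

    converted_words = [{'start': 0.0, 'end': 0.42, 'confidence': 0.98,
                        'word': 'paris', 'always_capitalized': True,
                        'punc_before': False, 'punc_after': False}]

    overlay = [{'start': w['start'],
                'stop': w['end'],
                'text': w['word'].title() if w['always_capitalized'] else w['word']}
               for w in converted_words]

    print(json.dumps(overlay, indent=4))  # one entry: {"start": 0.0, "stop": 0.42, "text": "Paris"}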
tpro/tpro.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+import json
+
+import click
+
+from .converters import services
+from . import outputs
+from . import helpers
+
+output_choices = [k for k, v in
+                  outputs.__dict__.items()
+                  if callable(v)]
+
+@click.command()
+@click.option('-s', '--save', type=str, help='save to file')
+@click.argument('json_path_or_data', type=str)
+@click.argument('input_format', type=click.Choice(services.keys()))
+@click.argument('output_format', type=click.Choice(output_choices))
+def cli(save,
+        json_path_or_data,
+        input_format,
+        output_format):
+
+    if not helpers.is_path(json_path_or_data):
+        json_data = json.loads(json_path_or_data)
+    else:
+        with open(json_path_or_data) as fin:
+            json_data = json.load(fin)
+    service = services[input_format]
+    converter = service(json_data)
+    converter.convert()
+    if save:
+        path = save
+        converter.save(path, output_format)
+        click.echo(f'{path} saved.')
+    else:
+        output_formatter = getattr(converter, output_format)
+        click.echo(output_formatter())
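A way to exercise the new command without installing the console script, using click's test runner. The inline payload is a made-up minimal Speechmatics-style transcript, and a real conversion still needs the Stanford NER jar from the README:

    import json

    from click.testing import CliRunner

    from tpro.tpro import cli

    payload = json.dumps({'words': []})  # hypothetical minimal transcript
    result = CliRunner().invoke(
        cli, [payload, 'speechmatics', 'universal_transcript'])
    print(result.output)

From the shell, the equivalent would be `tpro '<json>' speechmatics universal_transcript`, with `-s out.json` to write to a file instead of stdout.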