Updated tests to the current converter format, updated the static/non-static decorators on the base class to match its children, added GoogleConverter, and moved one or two methods to the base class because they all work the same way.
This commit is contained in:
13
Pipfile
Normal file
13
Pipfile
Normal file
@@ -0,0 +1,13 @@
|
||||
[[source]]
|
||||
name = "pypi"
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
[packages]
|
||||
tpro = {editable = true,path = "."}
|
||||
|
||||
[requires]
|
||||
python_version = "3.7"
|
||||
104
Pipfile.lock
generated
Normal file
104
Pipfile.lock
generated
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "49961036ff9465d1da8edf8b981512812678348e4baaa0c51841df64e80533ad"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3.7"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"click": {
|
||||
"hashes": [
|
||||
"sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
|
||||
"sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
|
||||
],
|
||||
"version": "==7.0"
|
||||
},
|
||||
"nltk": {
|
||||
"hashes": [
|
||||
"sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
|
||||
],
|
||||
"version": "==3.4"
|
||||
},
|
||||
"singledispatch": {
|
||||
"hashes": [
|
||||
"sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
|
||||
"sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
|
||||
],
|
||||
"version": "==3.4.0.3"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
|
||||
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
|
||||
],
|
||||
"version": "==1.12.0"
|
||||
},
|
||||
"tpro": {
|
||||
"editable": true,
|
||||
"path": "."
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"atomicwrites": {
|
||||
"hashes": [
|
||||
"sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
|
||||
"sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
|
||||
],
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
|
||||
"sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
|
||||
],
|
||||
"version": "==19.1.0"
|
||||
},
|
||||
"more-itertools": {
|
||||
"hashes": [
|
||||
"sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
|
||||
"sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
|
||||
],
|
||||
"markers": "python_version > '2.7'",
|
||||
"version": "==6.0.0"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:19ecf9ce9db2fce065a7a0586e07cfb4ac8614fe96edf628a264b1c70116cf8f",
|
||||
"sha256:84d306a647cc805219916e62aab89caa97a33a1dd8c342e87a37f91073cd4746"
|
||||
],
|
||||
"version": "==0.9.0"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
|
||||
"sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
|
||||
],
|
||||
"version": "==1.8.0"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c",
|
||||
"sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.3.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
|
||||
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
|
||||
],
|
||||
"version": "==1.12.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
4
setup.py
4
setup.py
@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
|
||||
|
||||
setup(
|
||||
name="tpro",
|
||||
version="0.08",
|
||||
version="0.09",
|
||||
url='https://github.com/zevaverbach/tpro',
|
||||
install_requires=[
|
||||
'Click',
|
||||
@@ -21,6 +21,6 @@ setup(
|
||||
long_description=long_description,
|
||||
entry_points='''
|
||||
[console_scripts]
|
||||
tpro=tpro.tpro:cli
|
||||
tpro=transcript_processing.tpro:cli
|
||||
''',
|
||||
)
|
||||
|
||||
@@ -3,9 +3,10 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from converters.amazon import AmazonConverter
|
||||
from converters.speechmatics import SpeechmaticsConverter
|
||||
from converters.gentle import GentleConverter
|
||||
from transcript_processing.converters.amazon import AmazonConverter
|
||||
from transcript_processing.converters.speechmatics import SpeechmaticsConverter
|
||||
from transcript_processing.converters.gentle import GentleConverter
|
||||
from transcript_processing.converters.google import GoogleConverter
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -20,9 +21,10 @@ def test_json_transcript(json_transcript):
|
||||
|
||||
|
||||
def test_amazon():
|
||||
a = AmazonConverter(
|
||||
os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'),
|
||||
'interactive_transcript')
|
||||
with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), 'r') as fin:
|
||||
json_data = json.load(fin)
|
||||
|
||||
a = AmazonConverter(json_data)
|
||||
a.convert()
|
||||
assert a.converted_words[0] == {
|
||||
'start': 5.49,
|
||||
@@ -30,16 +32,17 @@ def test_amazon():
|
||||
'confidence': 1.0,
|
||||
'word': 'So',
|
||||
'always_capitalized': False,
|
||||
'index': 0,
|
||||
'punc_after': False,
|
||||
'punc_before': False
|
||||
}
|
||||
|
||||
|
||||
def test_speechmatics():
|
||||
a = SpeechmaticsConverter(
|
||||
os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'),
|
||||
'interactive_transcript')
|
||||
with open(os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), 'r') as fin:
|
||||
json_data = json.load(fin)
|
||||
|
||||
a = SpeechmaticsConverter(json_data)
|
||||
|
||||
a.convert()
|
||||
assert a.converted_words[0] == {
|
||||
'start': 5.98,
|
||||
@@ -47,16 +50,16 @@ def test_speechmatics():
|
||||
'confidence': 0.67,
|
||||
'word': 'For',
|
||||
'always_capitalized': False,
|
||||
'index': 0,
|
||||
'punc_after': False,
|
||||
'punc_before': False,
|
||||
}
|
||||
|
||||
|
||||
def test_gentle():
|
||||
a = GentleConverter(
|
||||
os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
|
||||
'interactive_transcript')
|
||||
with open(os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), 'r') as fin:
|
||||
json_data = json.load(fin)
|
||||
|
||||
a = GentleConverter(json_data)
|
||||
a.convert()
|
||||
assert a.converted_words[0] == {
|
||||
'start': 0.35,
|
||||
@@ -64,7 +67,25 @@ def test_gentle():
|
||||
'confidence': 1,
|
||||
'word': '[noise]',
|
||||
'always_capitalized': False,
|
||||
'index': 0,
|
||||
'punc_after': False,
|
||||
'punc_before': False
|
||||
}
|
||||
|
||||
|
||||
def test_google():
    """Convert the Google test transcript and check the first converted word.

    The transcript path is read from the GOOGLE_TRANSCRIPT_TEST_FILE
    environment variable.
    """
    expected_first_word = {
        'start': 4,
        'end': 5.5,
        'confidence': 0.88,
        'word': 'Testing',
        'always_capitalized': False,
        'punc_after': [','],
        'punc_before': False,
    }

    with open(os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE'), 'r') as transcript_file:
        raw_transcript = transcript_file.read()

    converter = GoogleConverter(raw_transcript)
    converter.convert()

    assert converter.converted_words[0] == expected_first_word
|
||||
|
||||
|
||||
32
tests/test_convert_google.py
Normal file
32
tests/test_convert_google.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from transcript_processing.converters.google import (
|
||||
make_json_friendly,
|
||||
GoogleConverter,
|
||||
)
|
||||
from transcript_processing.config import GOOGLE_TRANSCRIPT_TEST_FILE
|
||||
|
||||
|
||||
@pytest.fixture
def transcript():
    """Return the full text of the file named by GOOGLE_TRANSCRIPT_TEST_FILE."""
    with open(GOOGLE_TRANSCRIPT_TEST_FILE, 'r') as transcript_file:
        contents = transcript_file.read()
    return contents
|
||||
|
||||
|
||||
def test_make_json_friendly(transcript):
    """The converted transcript text must parse as non-empty JSON."""
    parsed = json.loads(make_json_friendly(transcript))
    assert parsed
|
||||
|
||||
|
||||
def test_pre_process(transcript):
    """Constructing a GoogleConverter must leave parsed, non-empty json_data.

    Uses the ``transcript`` fixture this test already requests instead of
    re-reading the same file through ``os.getenv``.
    """
    g = GoogleConverter(transcript)
    assert g.json_data
|
||||
|
||||
|
||||
|
||||
@@ -2,3 +2,4 @@ import os
|
||||
|
||||
|
||||
AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')
|
||||
GOOGLE_TRANSCRIPT_TEST_FILE = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')
|
||||
@@ -14,7 +14,7 @@ class TranscriptConverter:
|
||||
|
||||
__metaclass__ = abc.ABCMeta
|
||||
|
||||
def __init__(self, json_data):
|
||||
def __init__(self, json_data: dict):
|
||||
self.json_data = json_data
|
||||
|
||||
def convert(self):
|
||||
@@ -31,19 +31,16 @@ class TranscriptConverter:
|
||||
tagged_words
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def get_word_objects(json_data):
|
||||
def get_word_objects(self, json_data):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def get_words(word_objects):
|
||||
pass
|
||||
def get_words(self, word_objects):
|
||||
return [self.get_word_word(w)
|
||||
for w in word_objects]
|
||||
|
||||
@staticmethod
|
||||
@abc.abstractmethod
|
||||
def convert_words(word_objects, words, tagged_words=None):
|
||||
def convert_words(self, word_objects, words, tagged_words=None):
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@@ -1,9 +1,11 @@
|
||||
from .amazon import AmazonConverter
|
||||
from .speechmatics import SpeechmaticsConverter
|
||||
from .gentle import GentleConverter
|
||||
from .google import GoogleConverter
|
||||
|
||||
services = {
|
||||
'amazon': AmazonConverter,
|
||||
'gentle': GentleConverter,
|
||||
'speechmatics': SpeechmaticsConverter,
|
||||
'google': GoogleConverter,
|
||||
}
|
||||
@@ -15,10 +15,6 @@ class AmazonConverter(TranscriptConverter):
|
||||
def get_word_objects(self, json_data):
|
||||
return json_data['results']['items']
|
||||
|
||||
def get_words(self, word_objects):
|
||||
return [self.get_word_word(w)
|
||||
for w in word_objects]
|
||||
|
||||
@staticmethod
|
||||
def get_word_start(word_object):
|
||||
return float(word_object['start_time'])
|
||||
@@ -32,7 +28,7 @@ class AmazonConverter(TranscriptConverter):
|
||||
return float(word_object['alternatives'][0]['confidence'])
|
||||
|
||||
@staticmethod
|
||||
def get_word_word(word_object):
|
||||
def get_word_word(word_object) -> str:
|
||||
word_word = word_object['alternatives'][0]['content']
|
||||
if word_word == 'i':
|
||||
# weird Amazon quirk
|
||||
@@ -44,7 +40,6 @@ class AmazonConverter(TranscriptConverter):
|
||||
|
||||
punc_before = False
|
||||
punc_after = False
|
||||
num_words = len(words)
|
||||
|
||||
for i, w in enumerate(word_objects):
|
||||
if w['type'] == 'punctuation':
|
||||
@@ -7,16 +7,12 @@ class GentleConverter(TranscriptConverter):
|
||||
|
||||
name = 'gentle'
|
||||
|
||||
def __init__(self, path):
|
||||
super().__init__(path)
|
||||
def __init__(self, json_data):
|
||||
super().__init__(json_data)
|
||||
|
||||
def get_word_objects(self, json_data):
|
||||
return json_data['words']
|
||||
|
||||
def get_words(self, word_objects):
|
||||
return [self.get_word_word(w)
|
||||
for w in word_objects]
|
||||
|
||||
@staticmethod
|
||||
def get_word_start(word_object):
|
||||
return word_object['start']
|
||||
@@ -35,8 +31,6 @@ class GentleConverter(TranscriptConverter):
|
||||
|
||||
def convert_words(self, word_objects, words, tagged_words=None):
|
||||
converted_words = []
|
||||
punc_before = False
|
||||
punc_after = False
|
||||
num_words = len(words)
|
||||
|
||||
for i, w in enumerate(word_objects):
|
||||
@@ -51,8 +45,8 @@ class GentleConverter(TranscriptConverter):
|
||||
word_obj.word,
|
||||
i,
|
||||
tagged_words),
|
||||
'punc_after': punc_after,
|
||||
'punc_before': punc_before,
|
||||
'punc_after': False,
|
||||
'punc_before': False,
|
||||
})
|
||||
|
||||
punc_after = False
|
||||
145
transcript_processing/converters/google.py
Normal file
145
transcript_processing/converters/google.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
from ..converter import TranscriptConverter
|
||||
from .. import helpers
|
||||
|
||||
|
||||
|
||||
class GoogleConverter(TranscriptConverter):
    """Converter for Google speech-to-text transcripts.

    The transcript arrives as raw text rather than JSON; it is turned into a
    list of word dicts by ``make_json_friendly`` + ``json.loads`` at
    construction time.
    """

    def __init__(self, transcript_data: str):
        """Store the transcript, replacing the raw text with parsed JSON data.

        :param transcript_data: raw text of the Google transcript.
        """
        super().__init__(transcript_data)
        # Overwrite whatever the base initialiser stored with the parsed form.
        self.json_data = self.pre_process(transcript_data)

    def pre_process(self, transcript_data):
        """Return ``transcript_data`` parsed into Python objects via JSON."""
        friendly = make_json_friendly(transcript_data)
        return json.loads(friendly)

    def get_word_objects(self, json_data):
        """Return ``json_data`` unchanged; it is used directly as the word objects."""
        return json_data

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the list of converted word dicts.

        :param word_objects: iterable of raw word objects.
        :param words: not used by this implementation.
        :param tagged_words: passed through to ``get_word_object`` and
            ``check_if_always_capitalized``.
        :return: list of dicts with keys ``start``, ``end``, ``confidence``,
            ``word``, ``always_capitalized``, ``punc_after`` and
            ``punc_before``.
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            # NOTE(review): get_word_object and check_if_always_capitalized are
            # not defined in this class; presumably inherited -- confirm.
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            # NOTE(review): assumes the helpers return a sized collection of
            # punctuation (or a falsy value) -- confirm against helpers.
            punc_before = helpers.get_punc_before(word_obj.word) or False
            punc_after = helpers.get_punc_after(word_obj.word) or False

            # Strip the detected punctuation off both ends of the word.
            the_word = word_obj.word
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
                the_word = the_word[:-len(punc_after)]

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': the_word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words

    @classmethod
    def get_word_start(cls, word_object):
        """Return the word's start time in seconds."""
        return cls.get_seconds(word_object['start_time'])

    @classmethod
    def get_word_end(cls, word_object):
        """Return the word's end time in seconds."""
        return cls.get_seconds(word_object['end_time'])

    @staticmethod
    def get_seconds(time: dict) -> float:
        """Combine the optional ``seconds`` and ``nanos`` keys into seconds.

        Either key may be absent; a missing key contributes zero.
        """
        seconds = 0
        if 'seconds' in time:
            seconds = time['seconds']
        if 'nanos' in time:
            seconds += time['nanos'] / 1_000_000_000
        return seconds

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word object's ``confidence`` value."""
        return word_object['confidence']

    @staticmethod
    def get_word_word(word_object):
        """Return the word object's ``word`` text."""
        return word_object['word']
|
||||
|
||||
|
||||
|
||||
def make_json_friendly(json_string):
    """Rewrite Google's text transcript as a JSON array string of word objects.

    The input is scanned line by line for the repeating field sequence
    ``words {``, ``start_time {``, ``}``, ``end_time {``, ``}``, ``word: ``,
    ``confidence: ``. Lines outside a ``words {`` block are dropped. Keys are
    quoted, colons and commas are inserted, and backslashes are removed.

    :param json_string: transcript text whose lines are separated by a literal
        backslash followed by ``n`` (not a real newline character).
    :return: a JSON array string; ``'[]'`` when no ``words {`` block is found
        (the previous implementation returned the invalid ``'[}]'``).
    """
    lines = [line.strip() for line in json_string.split('\\n')]

    fields = [
        'words {',
        'start_time {',
        '}',
        'end_time {',
        '}',
        'word: ',
        'confidence: '
    ]
    last_field_index = len(fields) - 1

    current_field_index = 0
    pieces = []

    for line in lines:
        current_field = fields[current_field_index]

        if current_field in line:
            if current_field_index == last_field_index:
                # "confidence" closes a word object; look for the next one.
                current_field_index = 0
            else:
                current_field_index += 1
                if current_field_index == 1:
                    # "words" was found, don't want to append that
                    pieces.append('}, {')
                    continue
        elif current_field_index == 0:
            # haven't found the beginning of the next word object
            continue

        # add quotes around keys
        line = re.sub(r'^(?!")([0-9a-zA-Z_]+)', r'"\1"', line)

        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')

        # Keep the first 20 characters of ', "confidence": <value>', which
        # leaves four characters of the value (e.g. 0.88).
        if 'confidence' in current_field:
            line = (', ' + line)[:20]

        # While waiting for a closing brace, every emitted line is an entry of
        # (or the end of) a nested object and needs a trailing comma.
        if current_field == '}':
            line = line + ', '

        pieces.append(line)

    new_string = ''.join(pieces)

    if not new_string:
        # No word objects at all: emit an empty, valid JSON array.
        return '[]'

    # cleanup
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    new_string = new_string.replace(', }', '}').replace('\\', '')

    return new_string
|
||||
@@ -16,10 +16,6 @@ class SpeechmaticsConverter(TranscriptConverter):
|
||||
def get_word_objects(self, json_data):
|
||||
return json_data['words']
|
||||
|
||||
def get_words(self, word_objects):
|
||||
return [self.get_word_word(w)
|
||||
for w in word_objects]
|
||||
|
||||
@staticmethod
|
||||
def get_word_start(word_object):
|
||||
return float(word_object['time'])
|
||||
Reference in New Issue
Block a user