Updated tests to the current format of the converters, updated the static/non-static decorators on the base class to match its children, added GoogleConverter, and moved one or two methods to the base class because they all work the same way.

This commit is contained in:
2019-03-07 02:30:48 -05:00
parent 6333f55c87
commit d30ecad583
16 changed files with 346 additions and 46 deletions

13
Pipfile Normal file
View File

@@ -0,0 +1,13 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
pytest = "*"
[packages]
tpro = {editable = true,path = "."}
[requires]
python_version = "3.7"

104
Pipfile.lock generated Normal file
View File

@@ -0,0 +1,104 @@
{
"_meta": {
"hash": {
"sha256": "49961036ff9465d1da8edf8b981512812678348e4baaa0c51841df64e80533ad"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.7"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"click": {
"hashes": [
"sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
"sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
],
"version": "==7.0"
},
"nltk": {
"hashes": [
"sha256:286f6797204ffdb52525a1d21ec0a221ec68b8e3fa4f2d25f412ac8e63c70e8d"
],
"version": "==3.4"
},
"singledispatch": {
"hashes": [
"sha256:5b06af87df13818d14f08a028e42f566640aef80805c3b50c5056b086e3c2b9c",
"sha256:833b46966687b3de7f438c761ac475213e53b306740f1abfaa86e1d1aae56aa8"
],
"version": "==3.4.0.3"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"tpro": {
"editable": true,
"path": "."
}
},
"develop": {
"atomicwrites": {
"hashes": [
"sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
"sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
],
"version": "==1.3.0"
},
"attrs": {
"hashes": [
"sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
"sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
],
"version": "==19.1.0"
},
"more-itertools": {
"hashes": [
"sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
"sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
],
"markers": "python_version > '2.7'",
"version": "==6.0.0"
},
"pluggy": {
"hashes": [
"sha256:19ecf9ce9db2fce065a7a0586e07cfb4ac8614fe96edf628a264b1c70116cf8f",
"sha256:84d306a647cc805219916e62aab89caa97a33a1dd8c342e87a37f91073cd4746"
],
"version": "==0.9.0"
},
"py": {
"hashes": [
"sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
"sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
],
"version": "==1.8.0"
},
"pytest": {
"hashes": [
"sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c",
"sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4"
],
"index": "pypi",
"version": "==4.3.0"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
}
}
}

View File

@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
setup(
name="tpro",
version="0.08",
version="0.09",
url='https://github.com/zevaverbach/tpro',
install_requires=[
'Click',
@@ -21,6 +21,6 @@ setup(
long_description=long_description,
entry_points='''
[console_scripts]
tpro=tpro.tpro:cli
tpro=transcript_processing.tpro:cli
''',
)

View File

@@ -3,9 +3,10 @@ import os
import pytest
from converters.amazon import AmazonConverter
from converters.speechmatics import SpeechmaticsConverter
from converters.gentle import GentleConverter
from transcript_processing.converters.amazon import AmazonConverter
from transcript_processing.converters.speechmatics import SpeechmaticsConverter
from transcript_processing.converters.gentle import GentleConverter
from transcript_processing.converters.google import GoogleConverter
@pytest.fixture
@@ -20,9 +21,10 @@ def test_json_transcript(json_transcript):
def test_amazon():
a = AmazonConverter(
os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'),
'interactive_transcript')
with open(os.getenv('AMAZON_TRANSCRIPT_TEST_FILE'), 'r') as fin:
json_data = json.load(fin)
a = AmazonConverter(json_data)
a.convert()
assert a.converted_words[0] == {
'start': 5.49,
@@ -30,16 +32,17 @@ def test_amazon():
'confidence': 1.0,
'word': 'So',
'always_capitalized': False,
'index': 0,
'punc_after': False,
'punc_before': False
}
def test_speechmatics():
a = SpeechmaticsConverter(
os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'),
'interactive_transcript')
with open(os.getenv('SPEECHMATICS_TRANSCRIPT_TEST_FILE'), 'r') as fin:
json_data = json.load(fin)
a = SpeechmaticsConverter(json_data)
a.convert()
assert a.converted_words[0] == {
'start': 5.98,
@@ -47,16 +50,16 @@ def test_speechmatics():
'confidence': 0.67,
'word': 'For',
'always_capitalized': False,
'index': 0,
'punc_after': False,
'punc_before': False,
}
def test_gentle():
a = GentleConverter(
os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'),
'interactive_transcript')
with open(os.getenv('GENTLE_TRANSCRIPT_TEST_FILE'), 'r') as fin:
json_data = json.load(fin)
a = GentleConverter(json_data)
a.convert()
assert a.converted_words[0] == {
'start': 0.35,
@@ -64,7 +67,25 @@ def test_gentle():
'confidence': 1,
'word': '[noise]',
'always_capitalized': False,
'index': 0,
'punc_after': False,
'punc_before': False
}
def test_google():
    """Check the first word GoogleConverter produces from the test file."""
    transcript_path = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')
    with open(transcript_path, 'r') as fin:
        transcript_data = fin.read()

    converter = GoogleConverter(transcript_data)
    converter.convert()

    expected_first_word = {
        'start': 4,
        'end': 5.5,
        'confidence': 0.88,
        'word': 'Testing',
        'always_capitalized': False,
        'punc_after': [','],
        'punc_before': False,
    }
    assert converter.converted_words[0] == expected_first_word

View File

@@ -0,0 +1,32 @@
import json
import os
import pytest
from transcript_processing.converters.google import (
make_json_friendly,
GoogleConverter,
)
from transcript_processing.config import GOOGLE_TRANSCRIPT_TEST_FILE
@pytest.fixture
def transcript():
    """Provide the raw text of the Google transcript test file."""
    with open(GOOGLE_TRANSCRIPT_TEST_FILE, 'r') as source:
        contents = source.read()
    return contents
def test_make_json_friendly(transcript):
    """The rewritten transcript must parse as JSON and be non-empty."""
    parsed = json.loads(make_json_friendly(transcript))
    assert parsed
def test_pre_process(transcript):
    """GoogleConverter should hold non-empty parsed data after construction.

    Uses the ``transcript`` fixture rather than re-reading the test file, so
    the file is opened in exactly one place.
    """
    g = GoogleConverter(transcript)
    assert g.json_data

View File

@@ -2,3 +2,4 @@ import os
AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')
GOOGLE_TRANSCRIPT_TEST_FILE = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE')

View File

@@ -14,7 +14,7 @@ class TranscriptConverter:
__metaclass__ = abc.ABCMeta
def __init__(self, json_data):
def __init__(self, json_data: dict):
self.json_data = json_data
def convert(self):
@@ -31,19 +31,16 @@ class TranscriptConverter:
tagged_words
)
@staticmethod
@abc.abstractmethod
def get_word_objects(json_data):
def get_word_objects(self, json_data):
pass
@staticmethod
@abc.abstractmethod
def get_words(word_objects):
pass
def get_words(self, word_objects):
return [self.get_word_word(w)
for w in word_objects]
@staticmethod
@abc.abstractmethod
def convert_words(word_objects, words, tagged_words=None):
def convert_words(self, word_objects, words, tagged_words=None):
pass
@staticmethod

View File

@@ -1,9 +1,11 @@
from .amazon import AmazonConverter
from .speechmatics import SpeechmaticsConverter
from .gentle import GentleConverter
from .google import GoogleConverter
# Maps each supported service name to the converter class that handles it.
services = {
'amazon': AmazonConverter,
'gentle': GentleConverter,
'speechmatics': SpeechmaticsConverter,
'google': GoogleConverter,
}

View File

@@ -15,10 +15,6 @@ class AmazonConverter(TranscriptConverter):
def get_word_objects(self, json_data):
return json_data['results']['items']
def get_words(self, word_objects):
return [self.get_word_word(w)
for w in word_objects]
@staticmethod
def get_word_start(word_object):
return float(word_object['start_time'])
@@ -32,7 +28,7 @@ class AmazonConverter(TranscriptConverter):
return float(word_object['alternatives'][0]['confidence'])
@staticmethod
def get_word_word(word_object):
def get_word_word(word_object) -> str:
word_word = word_object['alternatives'][0]['content']
if word_word == 'i':
# weird Amazon quirk
@@ -44,7 +40,6 @@ class AmazonConverter(TranscriptConverter):
punc_before = False
punc_after = False
num_words = len(words)
for i, w in enumerate(word_objects):
if w['type'] == 'punctuation':

View File

@@ -7,16 +7,12 @@ class GentleConverter(TranscriptConverter):
name = 'gentle'
def __init__(self, path):
super().__init__(path)
def __init__(self, json_data):
super().__init__(json_data)
def get_word_objects(self, json_data):
return json_data['words']
def get_words(self, word_objects):
return [self.get_word_word(w)
for w in word_objects]
@staticmethod
def get_word_start(word_object):
return word_object['start']
@@ -35,8 +31,6 @@ class GentleConverter(TranscriptConverter):
def convert_words(self, word_objects, words, tagged_words=None):
converted_words = []
punc_before = False
punc_after = False
num_words = len(words)
for i, w in enumerate(word_objects):
@@ -51,8 +45,8 @@ class GentleConverter(TranscriptConverter):
word_obj.word,
i,
tagged_words),
'punc_after': punc_after,
'punc_before': punc_before,
'punc_after': False,
'punc_before': False,
})
punc_after = False

View File

@@ -0,0 +1,145 @@
import json
import re
from ..converter import TranscriptConverter
from .. import helpers
class GoogleConverter(TranscriptConverter):
    """Convert a Google transcript into the common converted-word format.

    The constructor takes the raw transcript text (not parsed JSON); it is
    rewritten by ``make_json_friendly`` and parsed before any conversion.
    """

    # Service identifier, mirroring the ``name`` attribute on GentleConverter.
    name = 'google'

    def __init__(self, transcript_data: str):
        """Parse ``transcript_data`` and keep the result in ``self.json_data``."""
        super().__init__(transcript_data)
        # The base initializer stored the raw text; replace it with parsed data.
        self.json_data = self.pre_process(transcript_data)

    def pre_process(self, transcript_data):
        """Return ``transcript_data`` rewritten as JSON and then parsed."""
        friendly = make_json_friendly(transcript_data)
        return json.loads(friendly)

    def get_word_objects(self, json_data):
        """Return the word objects, which is ``json_data`` itself."""
        return json_data

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build one converted-word dict per entry of ``word_objects``.

        Args:
            word_objects: the raw word objects to convert.
            words: accepted for interface compatibility; not used here.
            tagged_words: passed through to ``get_word_object`` and
                ``check_if_always_capitalized``.

        Returns:
            A list of dicts with the keys ``start``, ``end``, ``confidence``,
            ``word``, ``always_capitalized``, ``punc_after`` and
            ``punc_before``.
        """
        converted_words = []

        for i, w in enumerate(word_objects):
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            # Falsy helper results are normalised to False.
            punc_before = helpers.get_punc_before(word_obj.word) or False
            punc_after = helpers.get_punc_after(word_obj.word) or False

            # Trim as many characters from each end of the word as the
            # corresponding punctuation value is long.
            the_word = word_obj.word
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
                the_word = the_word[:-len(punc_after)]

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': the_word,
                'always_capitalized': self.check_if_always_capitalized(
                    word_obj.word,
                    i,
                    tagged_words),
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

        return converted_words

    @classmethod
    def get_word_start(cls, word_object):
        """Return the word's ``start_time`` in seconds."""
        return cls.get_seconds(word_object['start_time'])

    @classmethod
    def get_word_end(cls, word_object):
        """Return the word's ``end_time`` in seconds."""
        return cls.get_seconds(word_object['end_time'])

    @staticmethod
    def get_seconds(time: dict) -> float:
        """Combine the optional ``seconds`` and ``nanos`` keys into seconds.

        A missing key contributes nothing, so an empty dict gives 0.
        """
        seconds = 0
        if 'seconds' in time:
            seconds = time['seconds']
        if 'nanos' in time:
            seconds += time['nanos'] / 1_000_000_000
        return seconds

    @staticmethod
    def get_word_confidence(word_object):
        """Return the word object's ``confidence`` value."""
        return word_object['confidence']

    @staticmethod
    def get_word_word(word_object):
        """Return the word object's ``word`` value."""
        return word_object['word']
def make_json_friendly(json_string):
    """Rewrite a raw Google transcript dump as a JSON array of word objects.

    The input is split on the literal two-character sequence backslash + "n"
    (not on real newline characters). Each ``words {`` group is expected to
    list its fields in this order: ``start_time {``, ``}``, ``end_time {``,
    ``}``, ``word: ``, ``confidence: ``. Lines outside those groups are
    dropped.

    Args:
        json_string: the raw transcript text.

    Returns:
        A string holding a JSON array with one object per word group, or
        ``'[]'`` when the input contains no word group.
    """
    lines = [line.strip() for line in json_string.split('\\n')]

    # Markers expected, in this order, inside every word group.
    fields = [
        'words {',
        'start_time {',
        '}',
        'end_time {',
        '}',
        'word: ',
        'confidence: ',
    ]
    last_field_index = len(fields) - 1

    current_field_index = 0
    pieces = []

    for line in lines:
        current_field = fields[current_field_index]

        if current_field in line:
            if current_field_index == last_field_index:
                # the group is complete; look for the next "words {"
                current_field_index = 0
            else:
                current_field_index += 1
                if current_field_index == 1:
                    pieces.append('}, {')
                    # "words" was found, don't want to append that
                    continue
        elif current_field_index == 0:
            # haven't found the beginning of the next word object
            continue

        # add quotes around keys
        line = re.sub(r'^(?!")([0-9a-zA-Z_]+)', r'"\1"', line)

        # add colons after keys
        if line.endswith('{'):
            line = line.replace('" ', '": ')

        # use first two decimals of confidence
        if 'confidence' in current_field:
            line = ', ' + line
            line = line[:20]

        # entries inside and closing a time object are comma-terminated;
        # the surplus ", }" is removed during cleanup
        if current_field == '}':
            line = line + ', '

        pieces.append(line)

    new_string = ''.join(pieces)

    if not new_string:
        # No word group was found; '[' + '}]' would not be valid JSON.
        return '[]'

    # cleanup
    if new_string.startswith('}, '):
        new_string = new_string[3:]
    if not new_string.startswith('['):
        new_string = '[' + new_string
    if not new_string.endswith('}]'):
        new_string = new_string + '}]'
    new_string = new_string.replace(', }', '}').replace('\\', '')

    return new_string

View File

@@ -16,10 +16,6 @@ class SpeechmaticsConverter(TranscriptConverter):
def get_word_objects(self, json_data):
return json_data['words']
def get_words(self, word_objects):
return [self.get_word_word(w)
for w in word_objects]
@staticmethod
def get_word_start(word_object):
return float(word_object['time'])