Simplified to no longer produce pipe-able output; made Google and Amazon work with and without speaker IDs

This commit is contained in:
2019-03-08 14:26:16 -05:00
parent 0301b3be23
commit 8f63320be4
5 changed files with 80 additions and 42 deletions

View File

@@ -1,6 +1,6 @@
# tpro
Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
Transcript Processing! `tpro` takes transcripts produced by
various speech-to-text services and converts them to various standardized
formats.

View File

@@ -12,27 +12,46 @@ def transcript_data():
return json.load(fin)
@pytest.fixture
def transcript_data_no_speaker_id():
    """Load the transcript JSON for the no-speaker-ID test case.

    The file path comes from the AMAZON_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID
    environment variable; the file content is parsed with ``json.load``.
    """
    fixture_path = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID')
    with open(fixture_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed
@pytest.fixture
def converter(transcript_data):
    """Build an AmazonConverter from the ``transcript_data`` fixture."""
    amazon_converter = AmazonConverter(transcript_data)
    return amazon_converter
@pytest.fixture
def converter_no_speaker_id(transcript_data_no_speaker_id):
    """Build an AmazonConverter from the no-speaker-ID transcript fixture."""
    amazon_converter = AmazonConverter(transcript_data_no_speaker_id)
    return amazon_converter
def test_get_word_objects(converter):
def test_get_word_objects(converter, converter_no_speaker_id):
    """get_word_objects must return a truthy result for both converters."""
    # Same check for each converter, in the original order.
    for each in (converter, converter_no_speaker_id):
        extracted = each.get_word_objects(each.json_data)
        assert extracted
def test_get_speaker_segments(converter):
def test_get_speaker_segments(converter, converter_no_speaker_id):
    """Speaker segments are truthy with speaker IDs and None without."""
    segments_with_ids = converter.get_speaker_segments()
    assert segments_with_ids

    segments_without_ids = converter_no_speaker_id.get_speaker_segments()
    assert segments_without_ids is None
def test_get_speaker_id(converter):
    """Spot-check two keys of the speaker-segment mapping."""
    segments = converter.get_speaker_segments()
    # Key -> expected value, checked in the same order as before.
    expected = ((54.58, 0), (32.36, 1))
    for key, speaker in expected:
        assert segments[key] == speaker
def test_convert(converter):
def test_convert(converter, converter_no_speaker_id):
    """Run ``convert()`` on both converters and print each one's result.

    The test passes as long as neither conversion raises; the printed
    ``converted_words`` are there for inspection (visible with ``pytest -s``).
    """
    converter.convert()
    print(converter.converted_words)

    converter_no_speaker_id.convert()
    # Print this converter's own output rather than the first converter's,
    # so the no-speaker-ID result is actually the one shown.
    print(converter_no_speaker_id.converted_words)

View File

@@ -15,25 +15,42 @@ def transcript_data():
return fin.read()
@pytest.fixture
def transcript_data_no_speaker_id():
    """Return the raw text of the no-speaker-ID transcript file.

    The file path comes from the GOOGLE_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID
    environment variable; the content is returned unparsed.
    """
    fixture_path = os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID')
    with open(fixture_path, 'r') as handle:
        raw_text = handle.read()
    return raw_text
@pytest.fixture
def converter(transcript_data):
    """Build a GoogleConverter from the ``transcript_data`` fixture."""
    google_converter = GoogleConverter(transcript_data)
    return google_converter
def test_get_word_objects(converter):
@pytest.fixture
def converter_no_speaker_id(transcript_data_no_speaker_id):
    """Build a GoogleConverter from the no-speaker-ID transcript fixture."""
    google_converter = GoogleConverter(transcript_data_no_speaker_id)
    return google_converter
def test_get_word_objects(converter, converter_no_speaker_id):
    """get_word_objects must return a truthy result for both converters."""
    with_ids = converter.get_word_objects(converter.json_data)
    assert with_ids

    without_ids = converter_no_speaker_id.get_word_objects(
        converter_no_speaker_id.json_data)
    assert without_ids
def test_convert(converter, converter_no_speaker_id):
    """Smoke test: ``convert()`` must not raise for either converter."""
    for each in (converter, converter_no_speaker_id):
        each.convert()
def test_make_json_friendly(transcript_data):
    """The output of make_json_friendly must parse as non-empty JSON."""
    parsed = json.loads(make_json_friendly(transcript_data))
    assert parsed
# NOTE(review): this span comes from a diff view with indentation removed, so
# two revisions appear interleaved. The single-argument signature directly
# below is presumably the previous one -- confirm against the repository.
def test_pre_process(converter):
# Two-fixture signature: checks that each converter exposes truthy json_data.
def test_pre_process(converter, converter_no_speaker_id):
assert converter.json_data
# NOTE(review): the next three lines look like an earlier copy of
# test_convert; a two-fixture test_convert is defined earlier in this hunk,
# so this copy is presumably the removed one -- confirm.
def test_convert(converter):
converter.convert()
print(converter.converted_words)
# presumably the second assertion of test_pre_process: it uses
# converter_no_speaker_id, which only that signature provides -- confirm.
assert converter_no_speaker_id.json_data

View File

@@ -1,17 +1,22 @@
import json
def universal_transcript(self, pretty=False):
    """Serialize ``self.converted_words`` to a JSON string.

    When ``pretty`` is truthy the JSON is indented by four spaces; otherwise
    no indentation is applied.
    """
    if pretty:
        return json.dumps(self.converted_words, indent=4)
    return json.dumps(self.converted_words, indent=None)
def universal(self):
    """Return ``self.converted_words`` as JSON indented by four spaces."""
    words = self.converted_words
    serialized = json.dumps(words, indent=4)
    return serialized
def viral_overlay(self, pretty=False):
    """Serialize ``self.converted_words`` as a JSON list of overlay entries.

    Each entry has ``start``, ``stop`` (the word's ``end`` value) and
    ``text``. The text is title-cased when the word's ``always_capitalized``
    value is truthy and left unchanged otherwise. When ``pretty`` is truthy
    the JSON is indented by four spaces.
    """
    return json.dumps(
        [
            {
                'start': word['start'],
                'stop': word['end'],
                'text': (word['word'].title()
                         if word['always_capitalized'] else word['word']),
            }
            for word in self.converted_words
        ],
        indent=4 if pretty else None,
    )


def vo(self):
    """Return the overlay entries as JSON indented by four spaces.

    Each entry has ``start``, ``stop`` (the word's ``end`` value) and
    ``text``. The text is title-cased when the word's ``always_capitalized``
    value is truthy and left unchanged otherwise.
    """
    transcript = [
        {
            'start': word['start'],
            'stop': word['end'],
            # Both branches must yield the text: a word that is not always
            # capitalized keeps its original spelling.
            'text': (word['word'].title()
                     if word['always_capitalized'] else word['word']),
        }
        for word in self.converted_words
    ]
    return json.dumps(transcript, indent=4)

View File

@@ -11,30 +11,27 @@ output_choices = [k for k, v in
if callable(v)]
# Command-line entry point, registered as a click command.
# NOTE(review): this span comes from a diff view with indentation removed, so
# lines of the previous and the current revision appear interleaved. The notes
# below say which is which as far as can be told from the text -- confirm
# against the repository before relying on them.
@click.command()
# presumably previous revision: the --save and --pretty options.
@click.option('-s', '--save', type=str, help='save to JSON file')
@click.option('-p', '--pretty', is_flag=True,
# presumably current revision: --print-output takes over the -p short name.
# NOTE(review): a single-name flag declared with is_flag=True and default=True
# offers no way to turn it off from the command line; an on/off pair such as
# '--print-output/--no-print-output' would. The help text on the next line
# still describes pretty-printing rather than printing the saved file.
@click.option('-p', '--print-output', is_flag=True, default=True,
help='pretty print the transcript, breaks pipeability')
# presumably previous revision: one argument holding a path or inline JSON.
@click.argument('json_path_or_data', type=str)
# presumably current revision: an input file opened by click for reading and
# a writable, non-directory output path.
@click.argument('transcript_data_path', type=click.File('r'))
@click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
# input_format must be a key of `services`; output_format one of
# `output_choices`.
@click.argument('input_format', type=click.Choice(services.keys()))
@click.argument('output_format', type=click.Choice(output_choices))
# presumably previous signature (save, pretty, json_path_or_data, ...).
def cli(save,
pretty,
json_path_or_data,
# presumably current signature.
def cli(print_output,
transcript_data_path,
output_path,
input_format,
output_format):
# presumably previous revision: parse inline JSON, or load it from a path.
if not helpers.is_path(json_path_or_data):
json_data = json.loads(json_path_or_data)
else:
with open(json_path_or_data) as fin:
json_data = json.load(fin)
# presumably current revision: parse JSON from the file click already opened.
json_data = json.load(transcript_data_path)
# Look up the converter class for the chosen service and run the conversion.
service = services[input_format]
converter = service(json_data)
converter.convert()
# presumably previous revision: save only when --save was given, otherwise
# echo the formatted output.
if save:
path = save
converter.save(path, output_format)
click.echo(f'{path} saved.')
else:
output_formatter = getattr(converter, output_format)
click.echo(output_formatter(pretty))
# presumably current revision: always save, then optionally echo the saved
# file back.
converter.save(output_path, output_format)
if print_output:
with open(output_path) as fin:
click.echo(fin.read())
# NOTE(review): with indentation lost it is unclear whether this final message
# is inside the `if print_output:` branch; its wording refers to the transcript
# echoed just before it -- confirm.
click.echo(f'☝☝☝ There\'s your transcript, which was saved to {output_path}.')