diff --git a/README.md b/README.md index 4ea75ee..8f0bf5b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # tpro -Transcript Processing! `tpro` takes JSON-formatted transcripts produced by +Transcript Processing! `tpro` takes transcripts produced by various speech-to-text services and converts them to various standardized formats. diff --git a/tests/test_amazon.py b/tests/test_amazon.py index dbceca3..acca70b 100644 --- a/tests/test_amazon.py +++ b/tests/test_amazon.py @@ -12,27 +12,46 @@ def transcript_data(): return json.load(fin) +@pytest.fixture +def transcript_data_no_speaker_id(): + with open( + os.getenv('AMAZON_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin: + return json.load(fin) + + @pytest.fixture def converter(transcript_data): return AmazonConverter(transcript_data) +@pytest.fixture +def converter_no_speaker_id(transcript_data_no_speaker_id): + return AmazonConverter(transcript_data_no_speaker_id) -def test_get_word_objects(converter): + +def test_get_word_objects(converter, converter_no_speaker_id): word_objects = converter.get_word_objects(converter.json_data) assert word_objects + word_objects = converter_no_speaker_id.get_word_objects( + converter_no_speaker_id.json_data) + assert word_objects -def test_get_speaker_segments(converter): + +def test_get_speaker_segments(converter, converter_no_speaker_id): speaker_segments = converter.get_speaker_segments() assert speaker_segments + speaker_segments = converter_no_speaker_id.get_speaker_segments() + assert speaker_segments is None def test_get_speaker_id(converter): speaker_segments = converter.get_speaker_segments() assert speaker_segments[54.58] == 0 assert speaker_segments[32.36] == 1 - -def test_convert(converter): +def test_convert(converter, converter_no_speaker_id): converter.convert() print(converter.converted_words) + + converter_no_speaker_id.convert() + print(converter_no_speaker_id.converted_words) diff --git a/tests/test_google.py b/tests/test_google.py index 1dc8e41..014ef6d 100644 --- 
a/tests/test_google.py +++ b/tests/test_google.py @@ -15,25 +15,42 @@ def transcript_data(): return fin.read() +@pytest.fixture +def transcript_data_no_speaker_id(): + with open( + os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin: + return fin.read() + + @pytest.fixture def converter(transcript_data): return GoogleConverter(transcript_data) -def test_get_word_objects(converter): +@pytest.fixture +def converter_no_speaker_id(transcript_data_no_speaker_id): + return GoogleConverter(transcript_data_no_speaker_id) + + +def test_get_word_objects(converter, converter_no_speaker_id): word_objects = converter.get_word_objects(converter.json_data) assert word_objects + word_objects = converter_no_speaker_id.get_word_objects( + converter_no_speaker_id.json_data) + assert word_objects + + +def test_convert(converter, converter_no_speaker_id): + converter.convert() + converter_no_speaker_id.convert() + def test_make_json_friendly(transcript_data): friendly = make_json_friendly(transcript_data) assert json.loads(friendly) -def test_pre_process(converter): +def test_pre_process(converter, converter_no_speaker_id): assert converter.json_data - - -def test_convert(converter): - converter.convert() - print(converter.converted_words) + assert converter_no_speaker_id.json_data diff --git a/transcript_processing/outputs.py b/transcript_processing/outputs.py index f073e48..5c76ac0 100644 --- a/transcript_processing/outputs.py +++ b/transcript_processing/outputs.py @@ -1,17 +1,22 @@ import json -def universal_transcript(self, pretty=False): - return json.dumps(self.converted_words, indent=4 if pretty else None) +def universal(self): + return json.dumps(self.converted_words, indent=4) -def viral_overlay(self, pretty=False): - return json.dumps([ - {'start': word['start'], - 'stop': word['end'], - 'text': word['word'].title() - if word['always_capitalized'] else word['word'] - } +def vo(self): + transcript = [] - for word in self.converted_words] - , indent=4 if pretty 
else None - ) + for word in self.converted_words: + if word['always_capitalized']: + word_word = word['word'].title() + else: + word_word = word['word'] + + transcript.append({ + 'start': word['start'], + 'stop': word['end'], + 'text': word_word, + }) + + return json.dumps(transcript, indent=4) diff --git a/transcript_processing/tpro.py b/transcript_processing/tpro.py index f3b2726..59e1d11 100644 --- a/transcript_processing/tpro.py +++ b/transcript_processing/tpro.py @@ -11,30 +11,27 @@ output_choices = [k for k, v in if callable(v)] @click.command() -@click.option('-s', '--save', type=str, help='save to JSON file') -@click.option('-p', '--pretty', is_flag=True, +@click.option('-p', '--print-output/--no-print-output', is_flag=True, default=True, help='pretty print the transcript, breaks pipeability') -@click.argument('json_path_or_data', type=str) +@click.argument('transcript_data_path', type=click.File('r')) +@click.argument('output_path', type=click.Path(writable=True, dir_okay=False)) @click.argument('input_format', type=click.Choice(services.keys())) @click.argument('output_format', type=click.Choice(output_choices)) -def cli(save, - pretty, - json_path_or_data, +def cli(print_output, + transcript_data_path, + output_path, input_format, output_format): - if not helpers.is_path(json_path_or_data): - json_data = json.loads(json_path_or_data) - else: - with open(json_path_or_data) as fin: - json_data = json.load(fin) + json_data = json.load(transcript_data_path) service = services[input_format] + converter = service(json_data) converter.convert() - if save: - path = save - converter.save(path, output_format) - click.echo(f'{path} saved.') - else: - output_formatter = getattr(converter, output_format) - click.echo(output_formatter(pretty)) + converter.save(output_path, output_format) + + if print_output: + with open(output_path) as fin: + click.echo(fin.read()) + + click.echo(f'☝☝☝ There\'s your transcript, which was saved to {output_path}.')