Simplified the CLI so its output is no longer pipe-able; made the Google and Amazon converters work with and without speaker IDs

2019-03-08 14:26:16 -05:00
parent 0301b3be23
commit 8f63320be4
5 changed files with 80 additions and 42 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # tpro

-Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
+Transcript Processing! `tpro` takes transcripts produced by
 various speech-to-text services and converts them to various standardized
 formats.

--- a/tests/test_amazon.py
+++ b/tests/test_amazon.py
@@ -12,27 +12,46 @@ def transcript_data():
        return json.load(fin)


+@pytest.fixture
+def transcript_data_no_speaker_id():
+    with open(
+           os.getenv('AMAZON_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin:
+        return json.load(fin)
+
+
@pytest.fixture
 def converter(transcript_data):
    return AmazonConverter(transcript_data)

+@pytest.fixture
+def converter_no_speaker_id(transcript_data_no_speaker_id):
+    return AmazonConverter(transcript_data_no_speaker_id)

-def test_get_word_objects(converter):
+
+def test_get_word_objects(converter, converter_no_speaker_id):
    word_objects = converter.get_word_objects(converter.json_data)
    assert word_objects

+    word_objects = converter_no_speaker_id.get_word_objects(
+            converter_no_speaker_id.json_data)
+    assert word_objects

-def test_get_speaker_segments(converter):
+
+def test_get_speaker_segments(converter, converter_no_speaker_id):
    speaker_segments = converter.get_speaker_segments()
    assert speaker_segments

+    speaker_segments = converter_no_speaker_id.get_speaker_segments()
+    assert speaker_segments is None

 def test_get_speaker_id(converter):
    speaker_segments = converter.get_speaker_segments()
    assert speaker_segments[54.58] == 0
    assert speaker_segments[32.36] == 1

-
-def test_convert(converter):
+def test_convert(converter, converter_no_speaker_id):
    converter.convert()
    print(converter.converted_words)
+
+    converter_no_speaker_id.convert()
+    print(converter_no_speaker_id.converted_words)
--- a/tests/test_google.py
+++ b/tests/test_google.py
@@ -15,25 +15,42 @@ def transcript_data():
        return fin.read()


+@pytest.fixture
+def transcript_data_no_speaker_id():
+    with open(
+           os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin:
+        return fin.read()
+
+
@pytest.fixture
 def converter(transcript_data):
    return GoogleConverter(transcript_data)


-def test_get_word_objects(converter):
+@pytest.fixture
+def converter_no_speaker_id(transcript_data_no_speaker_id):
+    return GoogleConverter(transcript_data_no_speaker_id)
+
+
+def test_get_word_objects(converter, converter_no_speaker_id):
    word_objects = converter.get_word_objects(converter.json_data)
    assert word_objects

+    word_objects = converter_no_speaker_id.get_word_objects(
+            converter_no_speaker_id.json_data)
+    assert word_objects
+
+
+def test_convert(converter, converter_no_speaker_id):
+    converter.convert()
+    converter_no_speaker_id.convert()
+

 def test_make_json_friendly(transcript_data):
    friendly = make_json_friendly(transcript_data)
    assert json.loads(friendly)


-def test_pre_process(converter):
+def test_pre_process(converter, converter_no_speaker_id):
    assert converter.json_data
-
-
-def test_convert(converter):
-    converter.convert()
-    print(converter.converted_words)
+    assert converter_no_speaker_id.json_data
--- a/transcript_processing/outputs.py
+++ b/transcript_processing/outputs.py
@@ -1,17 +1,22 @@
 import json

-def universal_transcript(self, pretty=False):
-    return json.dumps(self.converted_words, indent=4 if pretty else None)
+def universal(self):
+    return json.dumps(self.converted_words, indent=4)

-def viral_overlay(self, pretty=False):
-    return json.dumps([
-        {'start': word['start'],
+def vo(self):
+    transcript = []
+
+    for word in self.converted_words:
+        if word['always_capitalized']:
+            word_word = word['word'].title()
+        else:
+            word_word = word['word']
+
+        transcript.append({
+            'start': word['start'],
            'stop': word['end'],
-         'text': word['word'].title() 
-             if word['always_capitalized'] else word['word']
-        }
+            'text': word_word,
+            })

-                       for word in self.converted_words]
-                       , indent=4 if pretty else None
-            )
+    return json.dumps(transcript, indent=4)

--- a/transcript_processing/tpro.py
+++ b/transcript_processing/tpro.py
@@ -11,30 +11,27 @@ output_choices =  [k for k, v in
                   if callable(v)]

@click.command()
-@click.option('-s', '--save', type=str, help='save to JSON file')
-@click.option('-p', '--pretty', is_flag=True,
+@click.option('-p', '--print-output', is_flag=True, default=True,
        help='pretty print the transcript, breaks pipeability')
-@click.argument('json_path_or_data', type=str)
+@click.argument('transcript_data_path', type=click.File('r'))
+@click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
@click.argument('input_format', type=click.Choice(services.keys()))
@click.argument('output_format', type=click.Choice(output_choices))
-def cli(save, 
-        pretty,
-        json_path_or_data,
+def cli(print_output,
+        transcript_data_path,
+        output_path,
        input_format,
        output_format):

-    if not helpers.is_path(json_path_or_data):
-        json_data = json.loads(json_path_or_data)
-    else:
-        with open(json_path_or_data) as fin:
-            json_data = json.load(fin)
+    json_data = json.load(transcript_data_path)
    service = services[input_format]
+
    converter = service(json_data)
    converter.convert()
-    if save:
-        path = save
-        converter.save(path, output_format)
-        click.echo(f'{path} saved.')
-    else:
-        output_formatter = getattr(converter, output_format)
-        click.echo(output_formatter(pretty))
+    converter.save(output_path, output_format)
+
+    if print_output:
+        with open(output_path) as fin:
+            click.echo(fin.read())
+
+    click.echo(f'☝☝☝ There\'s your transcript, which was saved to {output_path}.')