simplified to not make output that is pipe-able, made google and amazon work with and without speaker IDs
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# tpro
|
||||
|
||||
Transcript Processing! `tpro` takes JSON-formatted transcripts produced by
|
||||
Transcript Processing! `tpro` takes transcripts produced by
|
||||
various speech-to-text services and converts them to various standardized
|
||||
formats.
|
||||
|
||||
|
||||
@@ -12,27 +12,46 @@ def transcript_data():
|
||||
return json.load(fin)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def transcript_data_no_speaker_id():
|
||||
with open(
|
||||
os.getenv('AMAZON_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin:
|
||||
return json.load(fin)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def converter(transcript_data):
|
||||
return AmazonConverter(transcript_data)
|
||||
|
||||
@pytest.fixture
|
||||
def converter_no_speaker_id(transcript_data_no_speaker_id):
|
||||
return AmazonConverter(transcript_data_no_speaker_id)
|
||||
|
||||
def test_get_word_objects(converter):
|
||||
|
||||
def test_get_word_objects(converter, converter_no_speaker_id):
|
||||
word_objects = converter.get_word_objects(converter.json_data)
|
||||
assert word_objects
|
||||
|
||||
word_objects = converter_no_speaker_id.get_word_objects(
|
||||
converter_no_speaker_id.json_data)
|
||||
assert word_objects
|
||||
|
||||
def test_get_speaker_segments(converter):
|
||||
|
||||
def test_get_speaker_segments(converter, converter_no_speaker_id):
|
||||
speaker_segments = converter.get_speaker_segments()
|
||||
assert speaker_segments
|
||||
|
||||
speaker_segments = converter_no_speaker_id.get_speaker_segments()
|
||||
assert speaker_segments is None
|
||||
|
||||
def test_get_speaker_id(converter):
|
||||
speaker_segments = converter.get_speaker_segments()
|
||||
assert speaker_segments[54.58] == 0
|
||||
assert speaker_segments[32.36] == 1
|
||||
|
||||
|
||||
def test_convert(converter):
|
||||
def test_convert(converter, converter_no_speaker_id):
|
||||
converter.convert()
|
||||
print(converter.converted_words)
|
||||
|
||||
converter_no_speaker_id.convert()
|
||||
print(converter_no_speaker_id.converted_words)
|
||||
|
||||
@@ -15,25 +15,42 @@ def transcript_data():
|
||||
return fin.read()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def transcript_data_no_speaker_id():
|
||||
with open(
|
||||
os.getenv('GOOGLE_TRANSCRIPT_TEST_FILE_NO_SPEAKER_ID'), 'r') as fin:
|
||||
return fin.read()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def converter(transcript_data):
|
||||
return GoogleConverter(transcript_data)
|
||||
|
||||
|
||||
def test_get_word_objects(converter):
|
||||
@pytest.fixture
|
||||
def converter_no_speaker_id(transcript_data_no_speaker_id):
|
||||
return GoogleConverter(transcript_data_no_speaker_id)
|
||||
|
||||
|
||||
def test_get_word_objects(converter, converter_no_speaker_id):
|
||||
word_objects = converter.get_word_objects(converter.json_data)
|
||||
assert word_objects
|
||||
|
||||
word_objects = converter_no_speaker_id.get_word_objects(
|
||||
converter_no_speaker_id.json_data)
|
||||
assert word_objects
|
||||
|
||||
|
||||
def test_convert(converter, converter_no_speaker_id):
|
||||
converter.convert()
|
||||
converter_no_speaker_id.convert()
|
||||
|
||||
|
||||
def test_make_json_friendly(transcript_data):
|
||||
friendly = make_json_friendly(transcript_data)
|
||||
assert json.loads(friendly)
|
||||
|
||||
|
||||
def test_pre_process(converter):
|
||||
def test_pre_process(converter, converter_no_speaker_id):
|
||||
assert converter.json_data
|
||||
|
||||
|
||||
def test_convert(converter):
|
||||
converter.convert()
|
||||
print(converter.converted_words)
|
||||
assert converter_no_speaker_id.json_data
|
||||
|
||||
@@ -1,17 +1,22 @@
|
||||
import json
|
||||
|
||||
def universal_transcript(self, pretty=False):
|
||||
return json.dumps(self.converted_words, indent=4 if pretty else None)
|
||||
def universal(self):
|
||||
return json.dumps(self.converted_words, indent=4)
|
||||
|
||||
def viral_overlay(self, pretty=False):
|
||||
return json.dumps([
|
||||
{'start': word['start'],
|
||||
'stop': word['end'],
|
||||
'text': word['word'].title()
|
||||
if word['always_capitalized'] else word['word']
|
||||
}
|
||||
def vo(self):
|
||||
transcript = []
|
||||
|
||||
for word in self.converted_words]
|
||||
, indent=4 if pretty else None
|
||||
)
|
||||
for word in self.converted_words:
|
||||
if word['always_capitalized']:
|
||||
word_word = word['word'].title()
|
||||
else:
|
||||
word_word = word['word']
|
||||
|
||||
transcript.append({
|
||||
'start': word['start'],
|
||||
'stop': word['end'],
|
||||
'text': word_word,
|
||||
})
|
||||
|
||||
return json.dumps(transcript, indent=4)
|
||||
|
||||
|
||||
@@ -11,30 +11,27 @@ output_choices = [k for k, v in
|
||||
if callable(v)]
|
||||
|
||||
@click.command()
|
||||
@click.option('-s', '--save', type=str, help='save to JSON file')
|
||||
@click.option('-p', '--pretty', is_flag=True,
|
||||
@click.option('-p', '--print-output', is_flag=True, default=True,
|
||||
help='print the saved transcript to stdout')
|
||||
@click.argument('json_path_or_data', type=str)
|
||||
@click.argument('transcript_data_path', type=click.File('r'))
|
||||
@click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
|
||||
@click.argument('input_format', type=click.Choice(services.keys()))
|
||||
@click.argument('output_format', type=click.Choice(output_choices))
|
||||
def cli(save,
|
||||
pretty,
|
||||
json_path_or_data,
|
||||
def cli(print_output,
|
||||
transcript_data_path,
|
||||
output_path,
|
||||
input_format,
|
||||
output_format):
|
||||
|
||||
if not helpers.is_path(json_path_or_data):
|
||||
json_data = json.loads(json_path_or_data)
|
||||
else:
|
||||
with open(json_path_or_data) as fin:
|
||||
json_data = json.load(fin)
|
||||
json_data = json.load(transcript_data_path)
|
||||
service = services[input_format]
|
||||
|
||||
converter = service(json_data)
|
||||
converter.convert()
|
||||
if save:
|
||||
path = save
|
||||
converter.save(path, output_format)
|
||||
click.echo(f'{path} saved.')
|
||||
else:
|
||||
output_formatter = getattr(converter, output_format)
|
||||
click.echo(output_formatter(pretty))
|
||||
converter.save(output_path, output_format)
|
||||
|
||||
if print_output:
|
||||
with open(output_path) as fin:
|
||||
click.echo(fin.read())
|
||||
|
||||
click.echo(f'☝☝☝ There\'s your transcript, which was saved to {output_path}.')
|
||||
|
||||
Reference in New Issue
Block a user