diff --git a/saying_things_stuff_4.mp3.txt b/saying_things_stuff_4.mp3.txt new file mode 100644 index 0000000..45bdb69 --- /dev/null +++ b/saying_things_stuff_4.mp3.txt @@ -0,0 +1,202 @@ +alternatives { + transcript: "testing" + confidence: 0.9585370421409607 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " This is everybody saying things." + confidence: 0.8421146273612976 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004213452339172 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.7137818932533264 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628734588623047 + speaker_tag: 2 + } +} +channel_tag: 1 +language_code: "en-us" + +alternatives { + transcript: " 2019" + confidence: 0.9188541173934937 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004213452339172 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.7137818932533264 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628734588623047 + speaker_tag: 2 + } + words { + start_time { + seconds: 10 + nanos: 300000000 + } + end_time { + seconds: 11 + nanos: 600000000 + } + word: "2019" + confidence: 0.901819109916687 + speaker_tag: 2 + } +} +channel_tag: 1 +language_code: "en-us" + diff --git a/saying_things_stuff_4_copy.mp3.txt b/saying_things_stuff_4_copy.mp3.txt new file mode 100644 index 0000000..0c25ebf --- /dev/null +++ b/saying_things_stuff_4_copy.mp3.txt @@ -0,0 +1,202 @@ +alternatives { + transcript: "testing" + confidence: 0.9585370421409607 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " This is everybody saying things." + confidence: 0.8421170711517334 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004211068153381 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.7137818336486816 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628880620002747 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " 2019" + confidence: 0.9188556671142578 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004211068153381 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.7137818336486816 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628880620002747 + speaker_tag: 2 + } + words { + start_time { + seconds: 10 + nanos: 300000000 + } + end_time { + seconds: 11 + nanos: 600000000 + } + word: "2019" + confidence: 0.901819109916687 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + diff --git a/saying_things_stuff_5.mp3.txt b/saying_things_stuff_5.mp3.txt new file mode 100644 index 0000000..a489c98 --- /dev/null +++ b/saying_things_stuff_5.mp3.txt @@ -0,0 +1,202 @@ +alternatives { + transcript: "testing" + confidence: 0.9585370421409607 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " This is everybody saying things." + confidence: 0.8421146273612976 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004211068153381 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.713782012462616 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628734588623047 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " 2019" + confidence: 0.9188556671142578 + words { + start_time { + seconds: 4 + nanos: 700000000 + } + end_time { + seconds: 5 + nanos: 300000000 + } + word: "testing" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 200000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "This" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 800000000 + } + word: "is" + confidence: 0.9585370421409607 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 800000000 + } + end_time { + seconds: 7 + nanos: 800000000 + } + word: "everybody" + confidence: 0.5004211068153381 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 800000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.713782012462616 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 200000000 + } + word: "things." + confidence: 0.9628734588623047 + speaker_tag: 2 + } + words { + start_time { + seconds: 10 + nanos: 300000000 + } + end_time { + seconds: 11 + nanos: 600000000 + } + word: "2019" + confidence: 0.901819109916687 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + diff --git a/saying_things_stuff_6.mp3.txt b/saying_things_stuff_6.mp3.txt new file mode 100644 index 0000000..8c71ce4 --- /dev/null +++ b/saying_things_stuff_6.mp3.txt @@ -0,0 +1,233 @@ +alternatives { + transcript: "Testing, this is Zev, Ivory box saying things." + confidence: 0.800268292427063 + words { + start_time { + seconds: 4 + } + end_time { + seconds: 5 + nanos: 500000000 + } + word: "Testing," + confidence: 0.8863372206687927 + speaker_tag: 2 + } + words { + start_time { + seconds: 5 + nanos: 500000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "this" + confidence: 0.8322268724441528 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 900000000 + } + word: "is" + confidence: 0.7659580111503601 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 900000000 + } + end_time { + seconds: 7 + nanos: 300000000 + } + word: "Zev," + confidence: 0.9128385782241821 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 300000000 + } + end_time { + seconds: 7 + nanos: 700000000 + } + word: "Ivory" + confidence: 0.7265068292617798 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 700000000 + } + end_time { + seconds: 7 + nanos: 900000000 + } + word: "box" + confidence: 0.7768470644950867 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 900000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.8872998952865601 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 400000000 + } + word: "things." + confidence: 0.9128385782241821 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + +alternatives { + transcript: " 2019" + confidence: 0.7211146354675293 + words { + start_time { + seconds: 4 + } + end_time { + seconds: 5 + nanos: 500000000 + } + word: "Testing," + confidence: 0.8863372206687927 + speaker_tag: 2 + } + words { + start_time { + seconds: 5 + nanos: 500000000 + } + end_time { + seconds: 6 + nanos: 600000000 + } + word: "this" + confidence: 0.8322268724441528 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 600000000 + } + end_time { + seconds: 6 + nanos: 900000000 + } + word: "is" + confidence: 0.7659580111503601 + speaker_tag: 2 + } + words { + start_time { + seconds: 6 + nanos: 900000000 + } + end_time { + seconds: 7 + nanos: 300000000 + } + word: "Zev," + confidence: 0.9128385782241821 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 300000000 + } + end_time { + seconds: 7 + nanos: 700000000 + } + word: "Ivory" + confidence: 0.7265068292617798 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 700000000 + } + end_time { + seconds: 7 + nanos: 900000000 + } + word: "box" + confidence: 0.7768470644950867 + speaker_tag: 2 + } + words { + start_time { + seconds: 7 + nanos: 900000000 + } + end_time { + seconds: 8 + nanos: 700000000 + } + word: "saying" + confidence: 0.8872998952865601 + speaker_tag: 2 + } + words { + start_time { + seconds: 8 + nanos: 700000000 + } + end_time { + seconds: 9 + nanos: 400000000 + } + word: "things." + confidence: 0.9128385782241821 + speaker_tag: 2 + } + words { + start_time { + seconds: 10 + nanos: 300000000 + } + end_time { + seconds: 11 + nanos: 500000000 + } + word: "2019" + confidence: 0.7581849098205566 + speaker_tag: 2 + } +} +channel_tag: 2 +language_code: "en-us" + diff --git a/tatt/config.py b/tatt/config.py index 5909b4d..f9a9738 100644 --- a/tatt/config.py +++ b/tatt/config.py @@ -7,6 +7,15 @@ BUCKET_NAME_FMTR_MEDIA = 'tatt-media-{}' BUCKET_NAME_FMTR_TRANSCRIPT = 'tatt-transcript-{}' BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE = 'tatt_transcript_{}' + +def GOOGLE_SPEECH_USE_ENHANCED(): + enhanced = os.getenv('GOOGLE_SPEECH_USE_ENHANCED') + if enhanced and enhanced == 'true': + return True + else: + return False + + if os.getenv('AWS_CONFIG_FILEPATH'): AWS_CONFIG_FILEPATH = Path(os.getenv('AWS_CONFIG_FILEPATH')) else: diff --git a/tatt/exceptions.py b/tatt/exceptions.py index b9c675d..4456d7c 100644 --- a/tatt/exceptions.py +++ b/tatt/exceptions.py @@ -17,3 +17,7 @@ class NotAvailable(Exception): class DependencyRequired(Exception): pass + + +class FormatError(Exception): + pass diff --git a/tatt/helpers.py b/tatt/helpers.py index 4d18076..cb2377a 100644 --- a/tatt/helpers.py +++ b/tatt/helpers.py @@ -6,6 +6,7 @@ from typing import Dict, List import audioread from tatt import config, exceptions, vendors +from tatt.vendors.vendor import TranscriberBaseClass def make_string_all_services(free_only=False): @@ -40,13 +41,14 @@ def get_job(job_name): return job -def get_transcript(job_name): +def get_transcript(job_name) -> tuple: job = get_job(job_name) service = get_service(job['service_name']) - return service.retrieve_transcript(job_name) + transcript = service.retrieve_transcript(job_name) + return transcript, service -def get_service(service_name): +def get_service(service_name) -> TranscriberBaseClass: module = vendors.SERVICES[service_name] return getattr(module, config.SERVICE_CLASS_NAME) diff --git a/tatt/transcribe.py b/tatt/transcribe.py index 1de45ef..ad05e12 100644 --- a/tatt/transcribe.py +++ b/tatt/transcribe.py @@ -22,16 +22,20 @@ def cli(): def get(name, save, pretty): """Downloads and/or saves completed transcript.""" try: - transcript = json.dumps(helpers.get_transcript(name), - indent=4 if pretty else None) + transcript, service = helpers.get_transcript(name) except exceptions.DoesntExistError: raise click.ClickException(f'no such transcript {name}') except exceptions.NotAvailable as e: raise click.ClickException(str(e)) file = None - if save: + if service.transcript_type == dict: + transcript = json.dumps(transcript, indent=4 if pretty else None) filepath = f'{name}.json' + else: + filepath = f'{name}.txt' + + if save: file = open(filepath, 'w') click.echo(transcript, file=file) diff --git a/tatt/vendors/amazon.py b/tatt/vendors/amazon.py index 5924494..59286a5 100644 --- a/tatt/vendors/amazon.py +++ b/tatt/vendors/amazon.py @@ -14,6 +14,7 @@ from .vendor import TranscriberBaseClass NAME = 'amazon' BUCKET_NAME_MEDIA = config.BUCKET_NAME_FMTR_MEDIA.format(NAME) BUCKET_NAME_TRANSCRIPT = config.BUCKET_NAME_FMTR_TRANSCRIPT.format(NAME) +TRANSCRIPT_TYPE = dict def _check_for_config() -> bool: @@ -23,6 +24,7 @@ def _check_for_config() -> bool: ) + class Transcriber(TranscriberBaseClass): cost_per_15_seconds = .024 / 4 @@ -30,6 +32,7 @@ class Transcriber(TranscriberBaseClass): 'transcript': BUCKET_NAME_TRANSCRIPT} no_config_error_message = 'please run "aws configure" first' + transcript_type = TRANSCRIPT_TYPE if _check_for_config(): tr = boto3.client('transcribe') @@ -120,7 +123,8 @@ class Transcriber(TranscriberBaseClass): return jobs @classmethod - def retrieve_transcript(cls, transcription_job_name: str) -> dict: + def retrieve_transcript(cls, transcription_job_name: str + ) -> TRANSCRIPT_TYPE: job = cls.tr.get_transcription_job( TranscriptionJobName=transcription_job_name )['TranscriptionJob'] diff --git a/tatt/vendors/google.py b/tatt/vendors/google.py index f1d6c8c..43da19b 100644 --- a/tatt/vendors/google.py +++ b/tatt/vendors/google.py @@ -14,12 +14,13 @@ from google.cloud import ( exceptions as gc_exceptions, ) -from tatt import exceptions, helpers, config +from tatt import exceptions, helpers, config as config_mod from .vendor import TranscriberBaseClass NAME = 'google' -BUCKET_NAME_TRANSCRIPT = config.BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE.format( +BUCKET_NAME_TRANSCRIPT = config_mod.BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE.format( 'goog') +TRANSCRIPT_TYPE = str def _check_for_config(): @@ -35,6 +36,7 @@ class Transcriber(TranscriberBaseClass): 'and put the path to your credentials in an ' 'environment variable "GOOGLE_APPLICATION_CREDENTIALS"' ) + transcript_type = TRANSCRIPT_TYPE if _check_for_config(): speech_client = speech.SpeechClient() @@ -74,10 +76,6 @@ class Transcriber(TranscriberBaseClass): def file_format(self): return pathlib.Path(self.filepath).suffix[1:].lower() - @property - def transcript_name(self): - return self.basename + '.txt' - @staticmethod def check_for_config() -> bool: return _check_for_config() @@ -89,13 +87,13 @@ class Transcriber(TranscriberBaseClass): def _check_if_transcript_exists(self, transcript_name=None): return storage.Blob( bucket=self.transcript_bucket, - name=transcript_name or self.transcript_name + name=transcript_name or self.basename ).exists(self.storage_client) def _request_transcription( self, language_code='en-US', - model='video', + # model='video', ) -> str: """Returns the job_name""" if self._check_if_transcript_exists(): @@ -103,6 +101,9 @@ class Transcriber(TranscriberBaseClass): f'{self.basename} already exists on {NAME}') num_audio_channels = helpers.get_num_audio_channels(self.filepath) + use_enhanced = config_mod.GOOGLE_SPEECH_USE_ENHANCED() + print(use_enhanced) + with io.open(self.filepath, 'rb') as audio_file: content = audio_file.read() audio = speech.types.RecognitionAudio(content=content) @@ -116,7 +117,12 @@ class Transcriber(TranscriberBaseClass): enable_word_time_offsets=True, language_code=language_code, enable_automatic_punctuation=True, - model=model, + enable_speaker_diarization=True, + # not clear whether this has to be 'phone_call' in order to + # use_enhanced + model='video', + use_enhanced=use_enhanced, + # model=model, ) self.operation = self.speech_client.long_running_recognize(config, @@ -143,7 +149,8 @@ class Transcriber(TranscriberBaseClass): return self.basename @classmethod - def retrieve_transcript(cls, transcription_job_name: str) -> dict: + def retrieve_transcript(cls, transcription_job_name: str + ) -> TRANSCRIPT_TYPE: """Get transcript from BUCKET_NAME_TRANSCRIPT""" if not cls._check_if_transcript_exists( cls, @@ -161,7 +168,7 @@ class Transcriber(TranscriberBaseClass): return transcript_text def upload_file(self, bucket_name, path): - blob = self.transcript_bucket.blob(self.transcript_name) + blob = self.transcript_bucket.blob(self.basename) blob.upload_from_filename(path) @classmethod diff --git a/tatt/vendors/vendor.py b/tatt/vendors/vendor.py index 7745cb1..82c392b 100644 --- a/tatt/vendors/vendor.py +++ b/tatt/vendors/vendor.py @@ -1,7 +1,7 @@ import abc import os from pathlib import PurePath -from typing import List +from typing import List, Union from tatt import exceptions @@ -12,6 +12,8 @@ class TranscriberBaseClass: def __init__(self, filepath): self._setup() + if ' ' in filepath: + raise exceptions.FormatError('Please don\'t put any spaces in the filename.') self.filepath = PurePath(filepath) self.basename = str(os.path.basename(self.filepath)) @@ -24,6 +26,11 @@ class TranscriberBaseClass: """ pass + @property + @abc.abstractmethod + def transcript_type(self): + pass + @property @abc.abstractmethod def cost_per_15_seconds(self): @@ -56,7 +63,7 @@ class TranscriberBaseClass: @classmethod @abc.abstractmethod - def retrieve_transcript(transcription_job_name: str) -> dict: + def retrieve_transcript(transcription_job_name: str) -> Union[str, dict]: pass @classmethod