fixed #42, fixed #43, got sample rate programmatically

This commit is contained in:
2019-03-07 17:58:27 -05:00
parent 2c93d4724f
commit 5b7a05379f
7 changed files with 18706 additions and 26 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -8,14 +8,6 @@ BUCKET_NAME_FMTR_TRANSCRIPT = 'tatt-transcript-{}'
BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE = 'tatt_transcript_{}' BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE = 'tatt_transcript_{}'
def GOOGLE_SPEECH_USE_ENHANCED():
enhanced = os.getenv('GOOGLE_SPEECH_USE_ENHANCED')
if enhanced and enhanced == 'true':
return True
else:
return False
if os.getenv('AWS_CONFIG_FILEPATH'): if os.getenv('AWS_CONFIG_FILEPATH'):
AWS_CONFIG_FILEPATH = Path(os.getenv('AWS_CONFIG_FILEPATH')) AWS_CONFIG_FILEPATH = Path(os.getenv('AWS_CONFIG_FILEPATH'))
else: else:

View File

@@ -124,10 +124,18 @@ def get_transcription_jobs_dict():
def get_num_audio_channels(filepath): def get_num_audio_channels(filepath):
return get_media_info(filepath).channels
def get_sample_rate(filepath):
return get_media_info(filepath).samplerate
def get_media_info(filepath):
if isinstance(filepath, pathlib.PurePosixPath): if isinstance(filepath, pathlib.PurePosixPath):
filepath = str(filepath) filepath = str(filepath)
with audioread.audio_open(filepath) as f: with audioread.audio_open(filepath) as f:
return f.channels return f
def shell_call(command): def shell_call(command):

View File

@@ -88,15 +88,33 @@ def status(job_name):
@cli.command() @cli.command()
@click.option('--punctuation', is_flag=True, default=True,
help='only for Google Speech, defaults to True')
@click.option('--speaker-id', is_flag=True, default=True,
help='only for Google Speech, defaults to True')
@click.option('--model', default='phone_call',
help='only for Google Speech, defaults to "phone_call"')
@click.option('--use-enhanced', is_flag=True, default=True,
help='only for Google Speech, defaults to True')
@click.argument('media_filepath', type=str) @click.argument('media_filepath', type=str)
@click.argument('service_name', type=str) @click.argument('service_name', type=str)
def this(media_filepath, service_name): def this(media_filepath, service_name, punctuation, speaker_id, model,
use_enhanced):
"""Sends a media file to be transcribed.""" """Sends a media file to be transcribed."""
if service_name == 'google':
transcribe_kwargs = dict(
enable_automatic_punctuation=punctuation,
enable_speaker_diarization=speaker_id,
model=model,
use_enhanced=use_enhanced,
)
else:
transcribe_kwargs = {}
try: try:
service = get_service(service_name) service = get_service(service_name)
except KeyError as e: except KeyError as e:
raise click.ClickException( raise click.ClickException(
f'No such service! {print_all_services(print_=False)}') f'No such service! {helpers.make_string_all_services()}')
try: try:
s = service(media_filepath) s = service(media_filepath)
@@ -107,7 +125,7 @@ def this(media_filepath, service_name):
f'Okay, transcribing {media_filepath} using {service_name}...') f'Okay, transcribing {media_filepath} using {service_name}...')
try: try:
job_num = s.transcribe() job_num = s.transcribe(**transcribe_kwargs)
except exceptions.AlreadyExistsError as e: except exceptions.AlreadyExistsError as e:
raise click.ClickException(str(e)) raise click.ClickException(str(e))
click.echo(f'Okay, job {job_num} is being transcribed. Use "get" ' click.echo(f'Okay, job {job_num} is being transcribed. Use "get" '

View File

@@ -30,7 +30,7 @@ def _check_for_config():
class Transcriber(TranscriberBaseClass): class Transcriber(TranscriberBaseClass):
SUPPORTED_FORMATS = ['flac'] SUPPORTED_FORMATS = ['flac']
cost_per_15_seconds = .009 cost_per_15_seconds = [.004, .006, .009]
no_config_error_message = ( no_config_error_message = (
'Please sign up for the Google Speech-to-Text API ' 'Please sign up for the Google Speech-to-Text API '
'and put the path to your credentials in an ' 'and put the path to your credentials in an '
@@ -80,9 +80,14 @@ class Transcriber(TranscriberBaseClass):
def check_for_config() -> bool: def check_for_config() -> bool:
return _check_for_config() return _check_for_config()
def transcribe(self) -> str: def upload_file_if_too_big(self):
"""10MB limit as of Mar 7, 2019"""
pass
def transcribe(self, **kwargs) -> str:
self.convert_file_format_if_needed() self.convert_file_format_if_needed()
self._request_transcription() self.upload_file_if_too_big()
self._request_transcription(**kwargs)
def _check_if_transcript_exists(self, transcript_name=None): def _check_if_transcript_exists(self, transcript_name=None):
return storage.Blob( return storage.Blob(
@@ -93,16 +98,17 @@ class Transcriber(TranscriberBaseClass):
def _request_transcription( def _request_transcription(
self, self,
language_code='en-US', language_code='en-US',
# model='video', enable_automatic_punctuation=True,
enable_speaker_diarization=True,
model='phone_call',
use_enhanced=True,
) -> str: ) -> str:
"""Returns the job_name""" """Returns the job_name"""
if self._check_if_transcript_exists(): if self._check_if_transcript_exists():
raise exceptions.AlreadyExistsError( raise exceptions.AlreadyExistsError(
f'{self.basename} already exists on {NAME}') f'{self.basename} already exists on {NAME}')
num_audio_channels = helpers.get_num_audio_channels(self.filepath) num_audio_channels = helpers.get_num_audio_channels(self.filepath)
sample_rate = helpers.get_sample_rate(self.filepath)
use_enhanced = config_mod.GOOGLE_SPEECH_USE_ENHANCED()
print(use_enhanced)
with io.open(self.filepath, 'rb') as audio_file: with io.open(self.filepath, 'rb') as audio_file:
content = audio_file.read() content = audio_file.read()
@@ -110,19 +116,16 @@ class Transcriber(TranscriberBaseClass):
config = speech.types.RecognitionConfig( config = speech.types.RecognitionConfig(
encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC, encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
sample_rate_hertz=44100, sample_rate_hertz=sample_rate,
audio_channel_count=num_audio_channels, audio_channel_count=num_audio_channels,
enable_separate_recognition_per_channel=True, enable_separate_recognition_per_channel=True,
enable_word_confidence=True, enable_word_confidence=True,
enable_word_time_offsets=True, enable_word_time_offsets=True,
language_code=language_code, language_code=language_code,
enable_automatic_punctuation=True, enable_automatic_punctuation=enable_automatic_punctuation,
enable_speaker_diarization=True, enable_speaker_diarization=enable_speaker_diarization,
# not clear whether this has to be 'phone_call' in order to model=model,
# use_enhanced
model='video',
use_enhanced=use_enhanced, use_enhanced=use_enhanced,
# model=model,
) )
self.operation = self.speech_client.long_running_recognize(config, self.operation = self.speech_client.long_running_recognize(config,