fixed #32, fixed #35, use_enhanced in Google STT by default, as well as the best ('video') model

This commit is contained in:
2019-03-07 16:59:29 -05:00
parent ab6e381ebf
commit 0a2f46f971
11 changed files with 896 additions and 20 deletions

View File

@@ -0,0 +1,202 @@
alternatives {
transcript: "testing"
confidence: 0.9585370421409607
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " This is everybody saying things."
confidence: 0.8421146273612976
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004213452339172
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.7137818932533264
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628734588623047
speaker_tag: 2
}
}
channel_tag: 1
language_code: "en-us"
alternatives {
transcript: " 2019"
confidence: 0.9188541173934937
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004213452339172
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.7137818932533264
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628734588623047
speaker_tag: 2
}
words {
start_time {
seconds: 10
nanos: 300000000
}
end_time {
seconds: 11
nanos: 600000000
}
word: "2019"
confidence: 0.901819109916687
speaker_tag: 2
}
}
channel_tag: 1
language_code: "en-us"

View File

@@ -0,0 +1,202 @@
alternatives {
transcript: "testing"
confidence: 0.9585370421409607
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " This is everybody saying things."
confidence: 0.8421170711517334
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004211068153381
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.7137818336486816
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628880620002747
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " 2019"
confidence: 0.9188556671142578
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004211068153381
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.7137818336486816
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628880620002747
speaker_tag: 2
}
words {
start_time {
seconds: 10
nanos: 300000000
}
end_time {
seconds: 11
nanos: 600000000
}
word: "2019"
confidence: 0.901819109916687
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"

View File

@@ -0,0 +1,202 @@
alternatives {
transcript: "testing"
confidence: 0.9585370421409607
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " This is everybody saying things."
confidence: 0.8421146273612976
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004211068153381
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.713782012462616
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628734588623047
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " 2019"
confidence: 0.9188556671142578
words {
start_time {
seconds: 4
nanos: 700000000
}
end_time {
seconds: 5
nanos: 300000000
}
word: "testing"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 200000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "This"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 800000000
}
word: "is"
confidence: 0.9585370421409607
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 800000000
}
end_time {
seconds: 7
nanos: 800000000
}
word: "everybody"
confidence: 0.5004211068153381
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 800000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.713782012462616
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 200000000
}
word: "things."
confidence: 0.9628734588623047
speaker_tag: 2
}
words {
start_time {
seconds: 10
nanos: 300000000
}
end_time {
seconds: 11
nanos: 600000000
}
word: "2019"
confidence: 0.901819109916687
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"

View File

@@ -0,0 +1,233 @@
alternatives {
transcript: "Testing, this is Zev, Ivory box saying things."
confidence: 0.800268292427063
words {
start_time {
seconds: 4
}
end_time {
seconds: 5
nanos: 500000000
}
word: "Testing,"
confidence: 0.8863372206687927
speaker_tag: 2
}
words {
start_time {
seconds: 5
nanos: 500000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "this"
confidence: 0.8322268724441528
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 900000000
}
word: "is"
confidence: 0.7659580111503601
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 900000000
}
end_time {
seconds: 7
nanos: 300000000
}
word: "Zev,"
confidence: 0.9128385782241821
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 300000000
}
end_time {
seconds: 7
nanos: 700000000
}
word: "Ivory"
confidence: 0.7265068292617798
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 700000000
}
end_time {
seconds: 7
nanos: 900000000
}
word: "box"
confidence: 0.7768470644950867
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 900000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.8872998952865601
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 400000000
}
word: "things."
confidence: 0.9128385782241821
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"
alternatives {
transcript: " 2019"
confidence: 0.7211146354675293
words {
start_time {
seconds: 4
}
end_time {
seconds: 5
nanos: 500000000
}
word: "Testing,"
confidence: 0.8863372206687927
speaker_tag: 2
}
words {
start_time {
seconds: 5
nanos: 500000000
}
end_time {
seconds: 6
nanos: 600000000
}
word: "this"
confidence: 0.8322268724441528
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 600000000
}
end_time {
seconds: 6
nanos: 900000000
}
word: "is"
confidence: 0.7659580111503601
speaker_tag: 2
}
words {
start_time {
seconds: 6
nanos: 900000000
}
end_time {
seconds: 7
nanos: 300000000
}
word: "Zev,"
confidence: 0.9128385782241821
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 300000000
}
end_time {
seconds: 7
nanos: 700000000
}
word: "Ivory"
confidence: 0.7265068292617798
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 700000000
}
end_time {
seconds: 7
nanos: 900000000
}
word: "box"
confidence: 0.7768470644950867
speaker_tag: 2
}
words {
start_time {
seconds: 7
nanos: 900000000
}
end_time {
seconds: 8
nanos: 700000000
}
word: "saying"
confidence: 0.8872998952865601
speaker_tag: 2
}
words {
start_time {
seconds: 8
nanos: 700000000
}
end_time {
seconds: 9
nanos: 400000000
}
word: "things."
confidence: 0.9128385782241821
speaker_tag: 2
}
words {
start_time {
seconds: 10
nanos: 300000000
}
end_time {
seconds: 11
nanos: 500000000
}
word: "2019"
confidence: 0.7581849098205566
speaker_tag: 2
}
}
channel_tag: 2
language_code: "en-us"

View File

@@ -7,6 +7,15 @@ BUCKET_NAME_FMTR_MEDIA = 'tatt-media-{}'
BUCKET_NAME_FMTR_TRANSCRIPT = 'tatt-transcript-{}'
BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE = 'tatt_transcript_{}'
def GOOGLE_SPEECH_USE_ENHANCED():
    """Return whether Google Speech enhanced models were requested.

    The ``GOOGLE_SPEECH_USE_ENHANCED`` environment variable is read on
    every call rather than once at import time, so changes made to the
    environment while the process is running are picked up.

    Returns:
        bool: True when the variable is set to "true" (compared
        case-insensitively, ignoring surrounding whitespace); False when
        it is unset, empty, or holds any other value.
    """
    # Default to '' so an unset variable takes the same path as an empty one.
    enhanced = os.getenv('GOOGLE_SPEECH_USE_ENHANCED', '')
    return enhanced.strip().lower() == 'true'
if os.getenv('AWS_CONFIG_FILEPATH'):
AWS_CONFIG_FILEPATH = Path(os.getenv('AWS_CONFIG_FILEPATH'))
else:

View File

@@ -17,3 +17,7 @@ class NotAvailable(Exception):
class DependencyRequired(Exception):
pass
class FormatError(Exception):
    """Raised when user-supplied input has an unacceptable format.

    In the visible code it is raised by the transcriber base class when
    the media file path contains a space.
    """
    pass

View File

@@ -6,6 +6,7 @@ from typing import Dict, List
import audioread
from tatt import config, exceptions, vendors
from tatt.vendors.vendor import TranscriberBaseClass
def make_string_all_services(free_only=False):
@@ -40,13 +41,14 @@ def get_job(job_name):
return job
def get_transcript(job_name):
def get_transcript(job_name) -> tuple:
job = get_job(job_name)
service = get_service(job['service_name'])
return service.retrieve_transcript(job_name)
transcript = service.retrieve_transcript(job_name)
return transcript, service
def get_service(service_name):
def get_service(service_name) -> TranscriberBaseClass:
module = vendors.SERVICES[service_name]
return getattr(module, config.SERVICE_CLASS_NAME)

View File

@@ -22,16 +22,20 @@ def cli():
def get(name, save, pretty):
"""Downloads and/or saves completed transcript."""
try:
transcript = json.dumps(helpers.get_transcript(name),
indent=4 if pretty else None)
transcript, service = helpers.get_transcript(name)
except exceptions.DoesntExistError:
raise click.ClickException(f'no such transcript {name}')
except exceptions.NotAvailable as e:
raise click.ClickException(str(e))
file = None
if save:
if service.transcript_type == dict:
transcript = json.dumps(transcript, indent=4 if pretty else None)
filepath = f'{name}.json'
else:
filepath = f'{name}.txt'
if save:
file = open(filepath, 'w')
click.echo(transcript, file=file)

View File

@@ -14,6 +14,7 @@ from .vendor import TranscriberBaseClass
NAME = 'amazon'
BUCKET_NAME_MEDIA = config.BUCKET_NAME_FMTR_MEDIA.format(NAME)
BUCKET_NAME_TRANSCRIPT = config.BUCKET_NAME_FMTR_TRANSCRIPT.format(NAME)
TRANSCRIPT_TYPE = dict
def _check_for_config() -> bool:
@@ -23,6 +24,7 @@ def _check_for_config() -> bool:
)
class Transcriber(TranscriberBaseClass):
cost_per_15_seconds = .024 / 4
@@ -30,6 +32,7 @@ class Transcriber(TranscriberBaseClass):
'transcript': BUCKET_NAME_TRANSCRIPT}
no_config_error_message = 'please run "aws configure" first'
transcript_type = TRANSCRIPT_TYPE
if _check_for_config():
tr = boto3.client('transcribe')
@@ -120,7 +123,8 @@ class Transcriber(TranscriberBaseClass):
return jobs
@classmethod
def retrieve_transcript(cls, transcription_job_name: str) -> dict:
def retrieve_transcript(cls, transcription_job_name: str
) -> TRANSCRIPT_TYPE:
job = cls.tr.get_transcription_job(
TranscriptionJobName=transcription_job_name
)['TranscriptionJob']

View File

@@ -14,12 +14,13 @@ from google.cloud import (
exceptions as gc_exceptions,
)
from tatt import exceptions, helpers, config
from tatt import exceptions, helpers, config as config_mod
from .vendor import TranscriberBaseClass
NAME = 'google'
BUCKET_NAME_TRANSCRIPT = config.BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE.format(
BUCKET_NAME_TRANSCRIPT = config_mod.BUCKET_NAME_FMTR_TRANSCRIPT_GOOGLE.format(
'goog')
TRANSCRIPT_TYPE = str
def _check_for_config():
@@ -35,6 +36,7 @@ class Transcriber(TranscriberBaseClass):
'and put the path to your credentials in an '
'environment variable "GOOGLE_APPLICATION_CREDENTIALS"'
)
transcript_type = TRANSCRIPT_TYPE
if _check_for_config():
speech_client = speech.SpeechClient()
@@ -74,10 +76,6 @@ class Transcriber(TranscriberBaseClass):
def file_format(self):
return pathlib.Path(self.filepath).suffix[1:].lower()
@property
def transcript_name(self):
return self.basename + '.txt'
@staticmethod
def check_for_config() -> bool:
return _check_for_config()
@@ -89,13 +87,13 @@ class Transcriber(TranscriberBaseClass):
def _check_if_transcript_exists(self, transcript_name=None):
return storage.Blob(
bucket=self.transcript_bucket,
name=transcript_name or self.transcript_name
name=transcript_name or self.basename
).exists(self.storage_client)
def _request_transcription(
self,
language_code='en-US',
model='video',
# model='video',
) -> str:
"""Returns the job_name"""
if self._check_if_transcript_exists():
@@ -103,6 +101,9 @@ class Transcriber(TranscriberBaseClass):
f'{self.basename} already exists on {NAME}')
num_audio_channels = helpers.get_num_audio_channels(self.filepath)
use_enhanced = config_mod.GOOGLE_SPEECH_USE_ENHANCED()
print(use_enhanced)
with io.open(self.filepath, 'rb') as audio_file:
content = audio_file.read()
audio = speech.types.RecognitionAudio(content=content)
@@ -116,7 +117,12 @@ class Transcriber(TranscriberBaseClass):
enable_word_time_offsets=True,
language_code=language_code,
enable_automatic_punctuation=True,
model=model,
enable_speaker_diarization=True,
# not clear whether this has to be 'phone_call' in order to
# use_enhanced
model='video',
use_enhanced=use_enhanced,
# model=model,
)
self.operation = self.speech_client.long_running_recognize(config,
@@ -143,7 +149,8 @@ class Transcriber(TranscriberBaseClass):
return self.basename
@classmethod
def retrieve_transcript(cls, transcription_job_name: str) -> dict:
def retrieve_transcript(cls, transcription_job_name: str
) -> TRANSCRIPT_TYPE:
"""Get transcript from BUCKET_NAME_TRANSCRIPT"""
if not cls._check_if_transcript_exists(
cls,
@@ -161,7 +168,7 @@ class Transcriber(TranscriberBaseClass):
return transcript_text
def upload_file(self, bucket_name, path):
blob = self.transcript_bucket.blob(self.transcript_name)
blob = self.transcript_bucket.blob(self.basename)
blob.upload_from_filename(path)
@classmethod

View File

@@ -1,7 +1,7 @@
import abc
import os
from pathlib import PurePath
from typing import List
from typing import List, Union
from tatt import exceptions
@@ -12,6 +12,8 @@ class TranscriberBaseClass:
def __init__(self, filepath):
self._setup()
if ' ' in filepath:
raise exceptions.FormatError('Please don\'t put any spaces in the filename.')
self.filepath = PurePath(filepath)
self.basename = str(os.path.basename(self.filepath))
@@ -24,6 +26,11 @@ class TranscriberBaseClass:
"""
pass
@property
@abc.abstractmethod
def transcript_type(self):
pass
@property
@abc.abstractmethod
def cost_per_15_seconds(self):
@@ -56,7 +63,7 @@ class TranscriberBaseClass:
@classmethod
@abc.abstractmethod
def retrieve_transcript(transcription_job_name: str) -> dict:
def retrieve_transcript(transcription_job_name: str) -> Union[str, dict]:
pass
@classmethod