Add support for other languages.

Add a `--language-code` CLI option for specifying the language, as well as a `languages` command for getting a list of supported languages. Add an abstract `_language_list` property and a `language_list` classmethod (which returns the sorted list) to vendor.py. Manually add the list of language codes (`_language_list` and `language_list`) to Amazon. Manually add the list of language codes to Google. Add a `name` attribute to each Transcriber.
2019-03-25 07:50:39 +01:00
parent c97ec79ece
commit d8f90cd98a
4 changed files with 57 additions and 4 deletions
--- a/tatt/transcribe.py
+++ b/tatt/transcribe.py
@@ -87,6 +87,15 @@ def status(job_name):
        break


+@cli.command()
+@click.argument('service_name', type=click.Choice(vendors.SERVICES))
+def languages(service_name):
+    service = get_service(service_name)
+    languages_string = "\n" + "\n".join(service.language_list())
+    click.echo(
+            f'{service.name} supports {languages_string}')
+
+
@cli.command()
@click.option('--punctuation', is_flag=True, default=True, 
              help='only for Google Speech, defaults to True')
@@ -98,6 +107,8 @@ def status(job_name):
              help='only for Google Speech, defaults to "phone_call"')
@click.option('--use-enhanced', is_flag=True, default=True,
              help='only for Google Speech, defaults to True')
+@click.option('--language-code', default='en-US',
+              help='only for google and amazon, defaults to en-US')
@click.argument('media_filepath', type=str)
@click.argument('service_name', type=str)
 def this(media_filepath, 
@@ -106,7 +117,9 @@ def this(media_filepath,
         speaker_id,
         num_speakers,
         model,
-         use_enhanced):
+         use_enhanced,
+         language_code,
+         ):
    """Sends a media file to be transcribed."""
    if service_name == 'google':
        transcribe_kwargs = dict(
@@ -115,14 +128,17 @@ def this(media_filepath,
            model=model,
            use_enhanced=use_enhanced,
            num_speakers=num_speakers,
+            language_code=language_code,
            )
    elif service_name == 'amazon':
        transcribe_kwargs = dict(
            enable_speaker_diarization=speaker_id,
            num_speakers=num_speakers,
+            language_code=language_code,
                )
    else:
        transcribe_kwargs = {}
+
    try:
        service = get_service(service_name)
    except KeyError as e:
--- a/tatt/vendors/amazon.py
+++ b/tatt/vendors/amazon.py
@@ -27,12 +27,16 @@ def _check_for_config() -> bool:

 class Transcriber(TranscriberBaseClass):

+    name = NAME
    cost_per_15_seconds = .024 / 4
    bucket_names = {'media': BUCKET_NAME_MEDIA,
                    'transcript': BUCKET_NAME_TRANSCRIPT}

    no_config_error_message = 'please run "aws configure" first'
    transcript_type = TRANSCRIPT_TYPE
+    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/transcribe.html
+    _language_list = ['en-US', 'es-US', 'en-AU', 'fr-CA', 'en-GB', 'de-DE', 
+                     'pt-BR', 'fr-FR', 'it-IT', 'ko-KR']

    if _check_for_config():
        tr = boto3.client('transcribe')
--- a/tatt/vendors/google.py
+++ b/tatt/vendors/google.py
@@ -29,6 +29,7 @@ def _check_for_config():

 class Transcriber(TranscriberBaseClass):

+    name = NAME
    SUPPORTED_FORMATS = ['flac']
    cost_per_15_seconds = [.004, .006, .009]
    no_config_error_message = (
@@ -37,6 +38,25 @@ class Transcriber(TranscriberBaseClass):
            'environment variable "GOOGLE_APPLICATION_CREDENTIALS"'
            )
    transcript_type = TRANSCRIPT_TYPE
+    # https://cloud.google.com/speech-to-text/docs/languages
+    # Array.from(document.querySelector('.devsite-table-wrapper').querySelectorAll('table tr')).slice(1).map(row => row.children[1].innerText)
+    _language_list = [
+        'af-ZA', 'am-ET', 'hy-AM', 'az-AZ', 'id-ID', 'ms-MY',
+        'bn-BD', 'bn-IN', 'ca-ES', 'cs-CZ', 'da-DK', 'de-DE', 'en-AU', 'en-CA',
+        'en-GH', 'en-GB', 'en-IN', 'en-IE', 'en-KE', 'en-NZ', 'en-NG', 'en-PH',
+        'en-SG', 'en-ZA', 'en-TZ', 'en-US', 'es-AR', 'es-BO', 'es-CL', 'es-CO',
+        'es-CR', 'es-EC', 'es-SV', 'es-ES', 'es-US', 'es-GT', 'es-HN', 'es-MX',
+        'es-NI', 'es-PA', 'es-PY', 'es-PE', 'es-PR', 'es-DO', 'es-UY', 'es-VE',
+        'eu-ES', 'fil-PH', 'fr-CA', 'fr-FR', 'gl-ES', 'ka-GE', 'gu-IN', 'hr-HR',
+        'zu-ZA', 'is-IS', 'it-IT', 'jv-ID', 'kn-IN', 'km-KH', 'lo-LA', 'lv-LV',
+        'lt-LT', 'hu-HU', 'ml-IN', 'mr-IN', 'nl-NL', 'ne-NP', 'nb-NO', 'pl-PL',
+        'pt-BR', 'pt-PT', 'ro-RO', 'si-LK', 'sk-SK', 'sl-SI', 'su-ID', 'sw-TZ',
+        'sw-KE', 'fi-FI', 'sv-SE', 'ta-IN', 'ta-SG', 'ta-LK', 'ta-MY', 'te-IN',
+        'vi-VN', 'tr-TR', 'ur-PK', 'ur-IN', 'el-GR', 'bg-BG', 'ru-RU', 'sr-RS',
+        'uk-UA', 'he-IL', 'ar-IL', 'ar-JO', 'ar-AE', 'ar-BH', 'ar-DZ', 'ar-SA',
+        'ar-IQ', 'ar-KW', 'ar-MA', 'ar-TN', 'ar-OM', 'ar-PS', 'ar-QA', 'ar-LB',
+        'ar-EG', 'fa-IR', 'hi-IN', 'th-TH', 'ko-KR', 'zh-TW', 'yue-Hant-HK',
+        'ja-JP', 'zh-HK', 'zh']

    if _check_for_config():
        speech_client = speech.SpeechClient()
@@ -115,6 +135,9 @@ class Transcriber(TranscriberBaseClass):
            content = audio_file.read()
            audio = speech.types.RecognitionAudio(content=content)

+        if language_code != 'en-US':
+            model = None
+
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=sample_rate,
--- a/tatt/vendors/vendor.py
+++ b/tatt/vendors/vendor.py
@@ -31,6 +31,11 @@ class TranscriberBaseClass:
    def transcript_type(self):
        pass

+    @property
+    @abc.abstractmethod
+    def _language_list(self):
+        pass
+
    @property
    @abc.abstractmethod
    def cost_per_15_seconds(self):
@@ -42,9 +47,9 @@ class TranscriberBaseClass:
        if not cls.check_for_config():
            raise exceptions.ConfigError(cls.no_config_error_message)

-    @staticmethod
+    @classmethod
    @abc.abstractmethod
-    def check_for_config() -> bool:
+    def check_for_config(cls) -> bool:
        pass

    @abc.abstractmethod
@@ -61,9 +66,14 @@ class TranscriberBaseClass:
        """Returns the job_name"""
        pass

+    @classmethod
+    def language_list(cls) -> List[str]:
+        return sorted(cls._language_list)
+
    @classmethod
    @abc.abstractmethod
-    def retrieve_transcript(transcription_job_name: str) -> Union[str, dict]:
+    def retrieve_transcript(cls, transcription_job_name: str
+            ) -> Union[str, dict]:
        pass

    @classmethod