From 27254c14c4790924422f4d7937ebaf5621392fcf Mon Sep 17 00:00:00 2001 From: zevav Date: Mon, 11 Feb 2019 22:46:46 -0500 Subject: [PATCH] CLI is working for submitting jobs to Amazon. started retrieval and listing functionality --- Pipfile | 5 +- Pipfile.lock | 168 ++++++++++++++++++++++++++++++++++++++--- config.py | 18 +++-- helpers.py | 23 ++++++ tatt/vendors/amazon.py | 116 ++++++++++++++++++++++------ tests/test_amazon.py | 19 +++++ transcribe.py | 69 +++++++++-------- 7 files changed, 346 insertions(+), 72 deletions(-) create mode 100644 helpers.py create mode 100644 tests/test_amazon.py diff --git a/Pipfile b/Pipfile index d3d79b9..c2dc162 100644 --- a/Pipfile +++ b/Pipfile @@ -6,11 +6,12 @@ verify_ssl = true [dev-packages] [packages] -requests = "*" -awscli = "*" boto3 = "*" +awscli = "*" +pytest = "*" click = "*" tatt = {editable = true,path = "."} +ipython = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 8c288cc..ffafc63 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "80cec15bc1119ce4635c01c8595743c2dd3c78c667fe051cc55e5420e7ee83f4" + "sha256": "006d8177b930549d4028114a64abd22e8f5ba739d3d61751813138e3f0922854" }, "pipfile-spec": 6, "requires": { @@ -16,28 +16,57 @@ ] }, "default": { + "appnope": { + "hashes": [ + "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", + "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" + ], + "markers": "sys_platform == 'darwin'", + "version": "==0.1.0" + }, + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", + "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + ], + "version": "==18.2.0" + }, "awscli": { "hashes": [ - "sha256:92d8637f1c65252d586f6e88a521f1b809d2e6895b92a072a95fb9ccf32d22a3", - "sha256:a9dd44db98f70c449bdd2ba27098e9c8023bdfdf93bba1183294be52c6156a69" + "sha256:165ebffb2ff10d0a40fdc985f08bc7a93e08ef7a8f8f68d6f76211935806d43f", + "sha256:28d457973c97bbe154574ba8902e1c47335a80cfb74836bd49d1e3632104fbda" ], "index": "pypi", - "version": "==1.16.99" + "version": "==1.16.102" + }, + "backcall": { + "hashes": [ + "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", + "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" + ], + "version": "==0.1.0" }, "boto3": { "hashes": [ - "sha256:465b4da5d292373f9ec5bb8834f26251a5f464f2ce9da1756988c16bb5e49cff", - "sha256:6ca40ef1893eacb37a3696bb2a5739a9b33a7d978658b451f4d87729cb5ec576" + "sha256:2bcda6aa7cbc51a30fc49f9129500c4df8b92fee3b4a44562c9d595bf32c4dcd", + "sha256:609900ca26f379123911b51ced68e437322ff3c347deaac7d84a53710d612c2c" ], "index": "pypi", - "version": "==1.9.89" + "version": "==1.9.92" }, "botocore": { "hashes": [ - "sha256:2257dc1c012f535ef364b6b60fc9fdc822605fafd6765c3095385528669260aa", - "sha256:b0b9f204cbba3ad7a523f7b274e2d0ca252384e0c114fdfe94c00eb205fb2537" + "sha256:19a48491bb0f22ea95f26ed3bd9ca9e0cd35aadf04027774995817d6403abec9", + "sha256:97a43a70876dae5ebe4334db8ea846181467b80adc45f681720c9bb859491bf5" ], - "version": "==1.12.89" + "version": "==1.12.92" }, "certifi": { "hashes": [ @@ -68,6 +97,13 @@ ], "version": "==0.3.9" }, + "decorator": { + "hashes": [ + "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e", + "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b" + ], + "version": "==4.3.2" + }, "docutils": { "hashes": [ "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", @@ -83,6 +119,28 @@ ], "version": "==2.8" }, + "ipython": { + "hashes": [ + "sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12", + "sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742" + ], + "index": "pypi", + "version": "==7.2.0" + }, + "ipython-genutils": { + "hashes": [ + "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", + "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + ], + "version": "==0.2.0" + }, + "jedi": { + "hashes": [ + "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd", + "sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191" + ], + "version": "==0.13.2" + }, "jmespath": { "hashes": [ "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64", @@ -90,6 +148,64 @@ ], "version": "==0.9.3" }, + "more-itertools": { + "hashes": [ + "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40", + "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1" + ], + "version": "==6.0.0" + }, + "parso": { + "hashes": [ + "sha256:6ecf7244be8e7283ec9009c72d074830e7e0e611c974f813d76db0390a4e0dd6", + "sha256:8162be7570ffb34ec0b8d215d7f3b6c5fab24f51eb3886d6dee362de96b6db94" + ], + "version": "==0.3.3" + }, + "pexpect": { + "hashes": [ + "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba", + "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.6.0" + }, + "pickleshare": { + "hashes": [ + "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", + "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" + ], + "version": "==0.7.5" + }, + "pluggy": { + "hashes": [ + "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616", + "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a" + ], + "version": "==0.8.1" + }, + "prompt-toolkit": { + "hashes": [ + "sha256:88002cc618cacfda8760c4539e76c3b3f148ecdb7035a3d422c7ecdc90c2a3ba", + "sha256:c6655a12e9b08edb8cf5aeab4815fd1e1bdea4ad73d3bbf269cf2e0c4eb75d5e", + "sha256:df5835fb8f417aa55e5cafadbaeb0cf630a1e824aad16989f9f0493e679ec010" + ], + "version": "==2.0.8" + }, + "ptyprocess": { + "hashes": [ + "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", + "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" + ], + "version": "==0.6.0" + }, + "py": { + "hashes": [ + "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694", + "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6" + ], + "version": "==1.7.0" + }, "pyasn1": { "hashes": [ "sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7", @@ -97,6 +213,21 @@ ], "version": "==0.4.5" }, + "pygments": { + "hashes": [ + "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", + "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" + ], + "version": "==2.3.1" + }, + "pytest": { + "hashes": [ + "sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07", + "sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d" + ], + "index": "pypi", + "version": "==4.2.0" + }, "python-dateutil": { "hashes": [ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", @@ -119,14 +250,13 @@ "sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537", "sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531" ], - "version": ">=4.2b1" + "version": "==3.13" }, "requests": { "hashes": [ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" ], - "index": "pypi", "version": "==2.21.0" }, "rsa": { @@ -154,6 +284,13 @@ "editable": true, "path": "." }, + "traitlets": { + "hashes": [ + "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", + "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" + ], + "version": "==4.3.2" + }, "urllib3": { "hashes": [ "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", @@ -161,6 +298,13 @@ ], "markers": "python_version >= '3.4'", "version": "==1.24.1" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" } }, "develop": {} diff --git a/config.py b/config.py index 94082a6..5fc8a65 100644 --- a/config.py +++ b/config.py @@ -1,18 +1,22 @@ import os - -from tatt.vendors import ( - amazon, - ) +from pathlib import Path +import sqlite3 STT_SERVICES = { 'amazon': { 'cost_per_minute': .024, 'free': '60_minutes_per_month_for_the_first_12_months', - 'function': amazon.transcribe, }, } -DEFAULT_BUCKET_NAME_FORMATTER = 'tatt_{}' -AWS_CREDENTIALS_FILEPATH = os.getenv('AWS_CREDENTIALS_FILEPATH') or '~/.aws/credentials' +AWS_BUCKET_NAME_FMTR_MEDIA = 'tatt-media-{}' +AWS_BUCKET_NAME_FMTR_TRANSCRIPT = 'tatt-transcript-{}' +AWS_CREDENTIALS_FILEPATH = ( + os.getenv('AWS_CREDENTIALS_FILEPATH') + or Path.home() / '.aws/credentials' +) +AWS_REGION = 'us-east-1' + +SERVICE_CLASS_NAME = 'transcribe' diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000..70af447 --- /dev/null +++ b/helpers.py @@ -0,0 +1,23 @@ +import config + + +def print_all_services(free_only=False, print_=True): + # TODO: make a jinja template for this + all_services_string = ( + '\n\nHere are all the available ' + + f'{"free " if free_only else ""}speech-to-text services:' + + '\n\n' + + '\n'.join(['{}{}{}{}'.format('\t', service_name, '\t\t', + + f'({info["free"].replace("_", " ")})' + if isinstance(info["free"], str) else "" + + ) + + for service_name, info in + config.STT_SERVICES.items()]) + + '\n' + ) + if print_: + print(all_services_string) + return all_services_string diff --git a/tatt/vendors/amazon.py b/tatt/vendors/amazon.py index 6ba1a91..5e8f061 100644 --- a/tatt/vendors/amazon.py +++ b/tatt/vendors/amazon.py @@ -1,56 +1,130 @@ +import json import os from pathlib import PurePath from subprocess import check_output +import uuid import boto3 import config NAME = 'amazon' -BUCKET_NAME = config.DEFAULT_BUCKET_NAME_FORMATTER.format(NAME)): - +BUCKET_NAME_MEDIA = config.AWS_BUCKET_NAME_FMTR_MEDIA.format(NAME) +BUCKET_NAME_TRANSCRIPT = config.AWS_BUCKET_NAME_FMTR_TRANSCRIPT.format(NAME) +tr = boto3.client('transcribe') +s3 = boto3.resource('s3') class ConfigError(Exception): pass -class Transcribe: +class transcribe: - bucket_name = BUCKET_NAME + bucket_names = {'media': BUCKET_NAME_MEDIA, + 'transcript': BUCKET_NAME_TRANSCRIPT} def __init__(self, filepath): self._setup() - self.s3 = boto3.resource('s3') self.filepath = PurePath(filepath) + self.basename = str(os.path.basename(self.filepath)) + self.media_file_uri = ( + f"https://s3-{config.AWS_REGION}.amazonaws.com/" + f"{self.bucket_names['media']}/{self.basename}") - def setup(self): + def _setup(self): if not check_for_credentials(): - make_credentials() and check_for_credentials() or raise ConfigError - if not self.check_for_bucket(): - self.make_bucket() + make_credentials() + if not check_for_credentials(): + raise ConfigError + for bucket_name in self.bucket_names.values(): + if not self.check_for_bucket(bucket_name): + self.make_bucket(bucket_name) - def check_for_bucket(self): - return bool(self.s3.Bucket(self.bucket_name).creation_date) + def check_for_bucket(self, bucket_name): + return bool(s3.Bucket(bucket_name).creation_date) - def make_bucket(self): - s3.create_bucket(Bucket=self.bucket_name) + def make_bucket(self, bucket_name): + s3.create_bucket(Bucket=bucket_name) def transcribe(self): - upload_file(self.filepath) - self.request_transcription() + self._upload_file() + return self._request_transcription() - def upload_file(self): - basename = os.path.basename(filepath) - s3.Bucket(bucket_name).upload_file(filepath, basename) - return basename + def _upload_file(self): + s3.Bucket(self.bucket_names['media']).upload_file( + str(self.filepath), + self.basename) - def request_transcription(self): + def _request_transcription(self, language_code='en-US'): + job_name = str(uuid.uuid4()) + tr.start_transcription_job( + TranscriptionJobName=job_name, + LanguageCode=language_code, + MediaFormat=self.basename.split('.')[-1].lower(), + Media={ + 'MediaFileUri': self.media_file_uri + }, + OutputBucketName=self.bucket_names['transcript'] + ) + return job_name + @staticmethod + def get_completed_jobs(): + return transcribe.get_transcription_jobs(status='completed') + + @staticmethod + def get_pending_jobs(): + return transcribe.get_transcription_jobs(status='in_progress') + + @staticmethod + def get_all_jobs(): + return transcribe.get_transcription_jobs() + + @staticmethod + def get_transcription_jobs(status=None): + kwargs = {'MaxResults': 100} + if status is not None: + kwargs['Status'] = status.upper() + jobs_data = tr.list_transcription_jobs(**kwargs) + jobs = homogenize_transcription_job_data(jobs_data['TranscriptionJobSummaries']) + while jobs_data.get('NextToken'): + jobs_data = tr.list_transcription_jobs(NextToken=jobs_data['NextToken']) + jobs += homogenize_transcription_job_data( + jobs_data['TranscriptionJobSummaries']) + return jobs + + +def homogenize_transcription_job_data(transcription_job_data): + return [{ + 'created': jd['CreationTime'], + 'name': jd['TranscriptionJobName'], + 'status': jd['TranscriptionJobStatus'] + } + for jd in transcription_job_data] + + +def retrieve_transcript(transcription_job_name): + job = tr.get_transcription_job( + TranscriptionJobName=transcription_job_name + )['TranscriptionJob'] + + if not job['TranscriptionJobStatus'] == 'COMPLETED': + return + + transcript_file_uri = job['Transcript']['TranscriptFileUri'] + transcript_path = transcript_file_uri.split("amazonaws.com/", 1)[1] + + transcript_bucket = transcript_path.split('/', 1)[0] + transcript_key = transcript_path.split('/', 1)[1] + + s3_object = s3.Object(transcript_bucket, transcript_key).get() + transcript_json = s3_object['Body'].read().decode('utf-8') + return json.loads(transcript_json) def check_for_credentials(): - os.path.exists(config.AWS_CREDENTIALS_FILEPATH) + return config.AWS_CREDENTIALS_FILEPATH.exists() def make_credentials(): diff --git a/tests/test_amazon.py b/tests/test_amazon.py new file mode 100644 index 0000000..86d3b74 --- /dev/null +++ b/tests/test_amazon.py @@ -0,0 +1,19 @@ +from tatt.vendors.amazon import transcribe, retrieve_transcript + + + +def test_transcribe_instantiate(): + filepath = '/Users/zev/tester.mp3' + t = transcribe(filepath) + assert str(t.filepath) == filepath + assert t.basename == 'tester.mp3' + assert t.media_file_uri == ( + f'https://s3-us-east-1.amazonaws.com/tatt-media-amazon/tester.mp3' + ) + + +def test_retrieve(): + filepath = '/Users/zev/tester.mp3' + t = retrieve_transcript('4db6808e-a7e8-4d8d-a1b7-753ab97094dc') + print(t) + assert t is not None diff --git a/transcribe.py b/transcribe.py index 48bc85c..9f2b8ca 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,8 +1,12 @@ +from pprint import pprint +import sqlite3 import sys import click -from config import STT_SERVICES +import config +import helpers +from tatt import vendors @click.group() @@ -11,50 +15,55 @@ def cli(): @cli.command() -@click.option('-f', '--free-only', is_flag=True) -def services(free_only): - """Lists available speech-to-text services.""" - print_all_services(free_only) +@click.argument('uid', required=False) +def retrieve(name=None, service=None): + pending_jobs = [get_service(service_name).get_pending_jobs(name) + for service_name, data in config.STT_SERVICES + if service is None + or service == service_name] + if not pending_jobs: + click.ClickException('no pending jobs currently!') + for job in pending_jobs: + print(dict(job)) @cli.command() -@click.option('-d', '--dry-run', default=False, help=( +@click.option('-f', '--free-only', is_flag=True) +def services(free_only): + """Lists available speech-to-text services.""" + helpers.print_all_services(free_only) + + +@cli.command() +@click.option('-d', '--dry-run', is_flag=True, help=( 'Do a dry run without actually submitting the media file for transcription')) -@click.argument('media_filepath', type=click.File('r')) +@click.argument('media_filepath', type=str) @click.argument('service_name', type=str) def this(dry_run, media_filepath, service_name): """Transcribe All The Things!™""" - if service_name not in STT_SERVICES: + if service_name not in config.STT_SERVICES: print() raise click.ClickException( f'No such service! {print_all_services(print_=False)}') + + service = get_service(service_name) + s = service(media_filepath) + if dry_run: print('If this weren\'t a dry run, I would transcribe ' - f'{media_filepath.name} using {service_name}') - print(STT_SERVICES[service_name]) + f'{media_filepath} using {service_name}') + pprint(vars(s)) else: print( - f'Okay, transcribing {media_filepath.name} using {service_name}...') - print(STT_SERVICES[service_name]['function']) + f'Okay, transcribing {media_filepath} using {service_name}...') + + job_num = s.transcribe() + db.create_pending_job(job_num, s.basename, service_name) + print(f'Okay, job {job_num} is being transcribed. Use "retrieve" ' + 'command to download it.') -def print_all_services(free_only=False, print_=True): - # TODO: make a jinja template for this - all_services_string = ( - '\n\nHere are all the available ' + - f'{"free " if free_only else ""}speech-to-text services:' + - '\n\n' + - '\n'.join(['{}{}{}{}'.format('\t', service_name, '\t\t', +def get_service(service_name): + return getattr(getattr(vendors, service_name), config.SERVICE_CLASS_NAME) - f'({info["free"].replace("_", " ")})' - if isinstance(info["free"], str) else "" - - ) - for service_name, info in - STT_SERVICES.items()]) - + '\n' - ) - if print_: - print(all_services_string) - return all_services_string