CLI is working for submitting jobs to Amazon. Started retrieval and listing functionality.

This commit is contained in:
2019-02-11 22:46:46 -05:00
parent b29e7acac9
commit 27254c14c4
7 changed files with 346 additions and 72 deletions

View File

@@ -6,11 +6,12 @@ verify_ssl = true
[dev-packages] [dev-packages]
[packages] [packages]
requests = "*"
awscli = "*"
boto3 = "*" boto3 = "*"
awscli = "*"
pytest = "*"
click = "*" click = "*"
tatt = {editable = true,path = "."} tatt = {editable = true,path = "."}
ipython = "*"
[requires] [requires]
python_version = "3.7" python_version = "3.7"

168
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "80cec15bc1119ce4635c01c8595743c2dd3c78c667fe051cc55e5420e7ee83f4" "sha256": "006d8177b930549d4028114a64abd22e8f5ba739d3d61751813138e3f0922854"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -16,28 +16,57 @@
] ]
}, },
"default": { "default": {
"appnope": {
"hashes": [
"sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
"sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
],
"markers": "sys_platform == 'darwin'",
"version": "==0.1.0"
},
"atomicwrites": {
"hashes": [
"sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
"sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
],
"version": "==1.3.0"
},
"attrs": {
"hashes": [
"sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
"sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
],
"version": "==18.2.0"
},
"awscli": { "awscli": {
"hashes": [ "hashes": [
"sha256:92d8637f1c65252d586f6e88a521f1b809d2e6895b92a072a95fb9ccf32d22a3", "sha256:165ebffb2ff10d0a40fdc985f08bc7a93e08ef7a8f8f68d6f76211935806d43f",
"sha256:a9dd44db98f70c449bdd2ba27098e9c8023bdfdf93bba1183294be52c6156a69" "sha256:28d457973c97bbe154574ba8902e1c47335a80cfb74836bd49d1e3632104fbda"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.16.99" "version": "==1.16.102"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
}, },
"boto3": { "boto3": {
"hashes": [ "hashes": [
"sha256:465b4da5d292373f9ec5bb8834f26251a5f464f2ce9da1756988c16bb5e49cff", "sha256:2bcda6aa7cbc51a30fc49f9129500c4df8b92fee3b4a44562c9d595bf32c4dcd",
"sha256:6ca40ef1893eacb37a3696bb2a5739a9b33a7d978658b451f4d87729cb5ec576" "sha256:609900ca26f379123911b51ced68e437322ff3c347deaac7d84a53710d612c2c"
], ],
"index": "pypi", "index": "pypi",
"version": "==1.9.89" "version": "==1.9.92"
}, },
"botocore": { "botocore": {
"hashes": [ "hashes": [
"sha256:2257dc1c012f535ef364b6b60fc9fdc822605fafd6765c3095385528669260aa", "sha256:19a48491bb0f22ea95f26ed3bd9ca9e0cd35aadf04027774995817d6403abec9",
"sha256:b0b9f204cbba3ad7a523f7b274e2d0ca252384e0c114fdfe94c00eb205fb2537" "sha256:97a43a70876dae5ebe4334db8ea846181467b80adc45f681720c9bb859491bf5"
], ],
"version": "==1.12.89" "version": "==1.12.92"
}, },
"certifi": { "certifi": {
"hashes": [ "hashes": [
@@ -68,6 +97,13 @@
], ],
"version": "==0.3.9" "version": "==0.3.9"
}, },
"decorator": {
"hashes": [
"sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e",
"sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b"
],
"version": "==4.3.2"
},
"docutils": { "docutils": {
"hashes": [ "hashes": [
"sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
@@ -83,6 +119,28 @@
], ],
"version": "==2.8" "version": "==2.8"
}, },
"ipython": {
"hashes": [
"sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12",
"sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742"
],
"index": "pypi",
"version": "==7.2.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"jedi": {
"hashes": [
"sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd",
"sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191"
],
"version": "==0.13.2"
},
"jmespath": { "jmespath": {
"hashes": [ "hashes": [
"sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64", "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64",
@@ -90,6 +148,64 @@
], ],
"version": "==0.9.3" "version": "==0.9.3"
}, },
"more-itertools": {
"hashes": [
"sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
"sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
],
"version": "==6.0.0"
},
"parso": {
"hashes": [
"sha256:6ecf7244be8e7283ec9009c72d074830e7e0e611c974f813d76db0390a4e0dd6",
"sha256:8162be7570ffb34ec0b8d215d7f3b6c5fab24f51eb3886d6dee362de96b6db94"
],
"version": "==0.3.3"
},
"pexpect": {
"hashes": [
"sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
"sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
],
"markers": "sys_platform != 'win32'",
"version": "==4.6.0"
},
"pickleshare": {
"hashes": [
"sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
"sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
],
"version": "==0.7.5"
},
"pluggy": {
"hashes": [
"sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
"sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
],
"version": "==0.8.1"
},
"prompt-toolkit": {
"hashes": [
"sha256:88002cc618cacfda8760c4539e76c3b3f148ecdb7035a3d422c7ecdc90c2a3ba",
"sha256:c6655a12e9b08edb8cf5aeab4815fd1e1bdea4ad73d3bbf269cf2e0c4eb75d5e",
"sha256:df5835fb8f417aa55e5cafadbaeb0cf630a1e824aad16989f9f0493e679ec010"
],
"version": "==2.0.8"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"version": "==0.6.0"
},
"py": {
"hashes": [
"sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
"sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
],
"version": "==1.7.0"
},
"pyasn1": { "pyasn1": {
"hashes": [ "hashes": [
"sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7", "sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7",
@@ -97,6 +213,21 @@
], ],
"version": "==0.4.5" "version": "==0.4.5"
}, },
"pygments": {
"hashes": [
"sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
"sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
],
"version": "==2.3.1"
},
"pytest": {
"hashes": [
"sha256:65aeaa77ae87c7fc95de56285282546cfa9c886dc8e5dc78313db1c25e21bc07",
"sha256:6ac6d467d9f053e95aaacd79f831dbecfe730f419c6c7022cb316b365cd9199d"
],
"index": "pypi",
"version": "==4.2.0"
},
"python-dateutil": { "python-dateutil": {
"hashes": [ "hashes": [
"sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
@@ -119,14 +250,13 @@
"sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537", "sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537",
"sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531" "sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531"
], ],
"version": ">=4.2b1" "version": "==3.13"
}, },
"requests": { "requests": {
"hashes": [ "hashes": [
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
"sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
], ],
"index": "pypi",
"version": "==2.21.0" "version": "==2.21.0"
}, },
"rsa": { "rsa": {
@@ -154,6 +284,13 @@
"editable": true, "editable": true,
"path": "." "path": "."
}, },
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"urllib3": { "urllib3": {
"hashes": [ "hashes": [
"sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39",
@@ -161,6 +298,13 @@
], ],
"markers": "python_version >= '3.4'", "markers": "python_version >= '3.4'",
"version": "==1.24.1" "version": "==1.24.1"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
} }
}, },
"develop": {} "develop": {}

View File

@@ -1,18 +1,22 @@
import os import os
from pathlib import Path
from tatt.vendors import ( import sqlite3
amazon,
)
STT_SERVICES = { STT_SERVICES = {
'amazon': { 'amazon': {
'cost_per_minute': .024, 'cost_per_minute': .024,
'free': '60_minutes_per_month_for_the_first_12_months', 'free': '60_minutes_per_month_for_the_first_12_months',
'function': amazon.transcribe,
}, },
} }
DEFAULT_BUCKET_NAME_FORMATTER = 'tatt_{}' AWS_BUCKET_NAME_FMTR_MEDIA = 'tatt-media-{}'
AWS_CREDENTIALS_FILEPATH = os.getenv('AWS_CREDENTIALS_FILEPATH') or '~/.aws/credentials' AWS_BUCKET_NAME_FMTR_TRANSCRIPT = 'tatt-transcript-{}'
AWS_CREDENTIALS_FILEPATH = (
os.getenv('AWS_CREDENTIALS_FILEPATH')
or Path.home() / '.aws/credentials'
)
AWS_REGION = 'us-east-1'
SERVICE_CLASS_NAME = 'transcribe'

23
helpers.py Normal file
View File

@@ -0,0 +1,23 @@
import config
def print_all_services(free_only=False, print_=True):
    """Build (and optionally print) the listing of speech-to-text services.

    Args:
        free_only: when true, the heading says "free" and only services whose
            ``free`` entry in ``config.STT_SERVICES`` is truthy are listed.
        print_: when true, the listing is also written to stdout.

    Returns:
        The formatted listing as a single string.
    """
    # TODO: make a jinja template for this
    service_lines = []
    for service_name, info in config.STT_SERVICES.items():
        free_tier = info.get('free')
        # Previously ``free_only`` only changed the heading; actually filter.
        if free_only and not free_tier:
            continue
        # Only a string description is shown; underscores stand in for spaces.
        free_note = (f'({free_tier.replace("_", " ")})'
                     if isinstance(free_tier, str) else '')
        service_lines.append(f'\t{service_name}\t\t{free_note}')

    all_services_string = (
        '\n\nHere are all the available '
        f'{"free " if free_only else ""}speech-to-text services:'
        '\n\n'
        + '\n'.join(service_lines)
        + '\n'
    )
    if print_:
        print(all_services_string)
    return all_services_string

116
tatt/vendors/amazon.py vendored
View File

@@ -1,56 +1,130 @@
import json
import os import os
from pathlib import PurePath from pathlib import PurePath
from subprocess import check_output from subprocess import check_output
import uuid
import boto3 import boto3
import config import config
NAME = 'amazon' NAME = 'amazon'
BUCKET_NAME = config.DEFAULT_BUCKET_NAME_FORMATTER.format(NAME)): BUCKET_NAME_MEDIA = config.AWS_BUCKET_NAME_FMTR_MEDIA.format(NAME)
BUCKET_NAME_TRANSCRIPT = config.AWS_BUCKET_NAME_FMTR_TRANSCRIPT.format(NAME)
tr = boto3.client('transcribe')
s3 = boto3.resource('s3')
class ConfigError(Exception): class ConfigError(Exception):
pass pass
class Transcribe: class transcribe:
bucket_name = BUCKET_NAME bucket_names = {'media': BUCKET_NAME_MEDIA,
'transcript': BUCKET_NAME_TRANSCRIPT}
def __init__(self, filepath): def __init__(self, filepath):
self._setup() self._setup()
self.s3 = boto3.resource('s3')
self.filepath = PurePath(filepath) self.filepath = PurePath(filepath)
self.basename = str(os.path.basename(self.filepath))
self.media_file_uri = (
f"https://s3-{config.AWS_REGION}.amazonaws.com/"
f"{self.bucket_names['media']}/{self.basename}")
def setup(self): def _setup(self):
if not check_for_credentials(): if not check_for_credentials():
make_credentials() and check_for_credentials() or raise ConfigError make_credentials()
if not self.check_for_bucket(): if not check_for_credentials():
self.make_bucket() raise ConfigError
for bucket_name in self.bucket_names.values():
if not self.check_for_bucket(bucket_name):
self.make_bucket(bucket_name)
def check_for_bucket(self): def check_for_bucket(self, bucket_name):
return bool(self.s3.Bucket(self.bucket_name).creation_date) return bool(s3.Bucket(bucket_name).creation_date)
def make_bucket(self): def make_bucket(self, bucket_name):
s3.create_bucket(Bucket=self.bucket_name) s3.create_bucket(Bucket=bucket_name)
def transcribe(self): def transcribe(self):
upload_file(self.filepath) self._upload_file()
self.request_transcription() return self._request_transcription()
def upload_file(self): def _upload_file(self):
basename = os.path.basename(filepath) s3.Bucket(self.bucket_names['media']).upload_file(
s3.Bucket(bucket_name).upload_file(filepath, basename) str(self.filepath),
return basename self.basename)
def request_transcription(self): def _request_transcription(self, language_code='en-US'):
job_name = str(uuid.uuid4())
tr.start_transcription_job(
TranscriptionJobName=job_name,
LanguageCode=language_code,
MediaFormat=self.basename.split('.')[-1].lower(),
Media={
'MediaFileUri': self.media_file_uri
},
OutputBucketName=self.bucket_names['transcript']
)
return job_name
@staticmethod
def get_completed_jobs():
    """Return summaries of the transcription jobs listed with status 'completed'."""
    return transcribe.get_transcription_jobs(status='completed')
@staticmethod
def get_pending_jobs():
    """Return summaries of the transcription jobs listed with status 'in_progress'."""
    return transcribe.get_transcription_jobs(status='in_progress')
@staticmethod
def get_all_jobs():
    """Return summaries of every transcription job, with no status filter."""
    return transcribe.get_transcription_jobs()
@staticmethod
def get_transcription_jobs(status=None):
    """List transcription jobs, following pagination until exhausted.

    Args:
        status: optional status name (any case); it is upper-cased and sent
            as the ``Status`` filter. ``None`` lists jobs of every status.

    Returns:
        A list of dicts as produced by ``homogenize_transcription_job_data``.
    """
    kwargs = {'MaxResults': 100}
    if status is not None:
        kwargs['Status'] = status.upper()

    jobs_data = tr.list_transcription_jobs(**kwargs)
    jobs = homogenize_transcription_job_data(
        jobs_data['TranscriptionJobSummaries'])
    while jobs_data.get('NextToken'):
        # Re-send the original filter and page size with every follow-up
        # request; passing only NextToken dropped the Status filter.
        jobs_data = tr.list_transcription_jobs(
            NextToken=jobs_data['NextToken'], **kwargs)
        jobs += homogenize_transcription_job_data(
            jobs_data['TranscriptionJobSummaries'])
    return jobs
def homogenize_transcription_job_data(transcription_job_data):
    """Reduce job summaries to a common three-key shape.

    Args:
        transcription_job_data: iterable of mappings that each contain
            ``CreationTime``, ``TranscriptionJobName`` and
            ``TranscriptionJobStatus``.

    Returns:
        A list of dicts with the keys ``created``, ``name`` and ``status``;
        any other keys on the input mappings are dropped.

    Raises:
        KeyError: if a summary lacks one of the three required keys.
    """
    return [
        {
            'created': jd['CreationTime'],
            'name': jd['TranscriptionJobName'],
            'status': jd['TranscriptionJobStatus'],
        }
        for jd in transcription_job_data
    ]
def retrieve_transcript(transcription_job_name):
    """Fetch and parse the transcript JSON of a finished transcription job.

    Args:
        transcription_job_name: name of the job to look up.

    Returns:
        The decoded transcript JSON, or ``None`` when the job's status is
        anything other than ``COMPLETED``.
    """
    job = tr.get_transcription_job(
        TranscriptionJobName=transcription_job_name
    )['TranscriptionJob']
    if job['TranscriptionJobStatus'] != 'COMPLETED':
        return None

    transcript_file_uri = job['Transcript']['TranscriptFileUri']
    # Everything after "amazonaws.com/" is treated as "<bucket>/<key>".
    # NOTE(review): assumes a path-style S3 URL — confirm against real URIs.
    transcript_path = transcript_file_uri.split("amazonaws.com/", 1)[1]
    path_parts = transcript_path.split('/', 1)
    transcript_bucket = path_parts[0]
    transcript_key = path_parts[1]

    s3_object = s3.Object(transcript_bucket, transcript_key).get()
    transcript_json = s3_object['Body'].read().decode('utf-8')
    return json.loads(transcript_json)
def check_for_credentials(): def check_for_credentials():
os.path.exists(config.AWS_CREDENTIALS_FILEPATH) return config.AWS_CREDENTIALS_FILEPATH.exists()
def make_credentials(): def make_credentials():

19
tests/test_amazon.py Normal file
View File

@@ -0,0 +1,19 @@
from tatt.vendors.amazon import transcribe, retrieve_transcript
def test_transcribe_instantiate():
    """Check the path-derived attributes set when ``transcribe`` is constructed."""
    # NOTE(review): the constructor calls _setup(), which checks credentials
    # and buckets, so this test appears to need AWS access — confirm.
    filepath = '/Users/zev/tester.mp3'
    t = transcribe(filepath)
    assert str(t.filepath) == filepath
    assert t.basename == 'tester.mp3'
    # Plain string: the literal has no placeholders, so no f-prefix is needed.
    assert t.media_file_uri == (
        'https://s3-us-east-1.amazonaws.com/tatt-media-amazon/tester.mp3'
    )
def test_retrieve():
    """Check that a transcript is returned for a hard-coded job name."""
    # NOTE(review): depends on this specific job existing and being completed
    # in the AWS account the test runs against — confirm.
    t = retrieve_transcript('4db6808e-a7e8-4d8d-a1b7-753ab97094dc')
    print(t)
    assert t is not None

View File

@@ -1,8 +1,12 @@
from pprint import pprint
import sqlite3
import sys import sys
import click import click
from config import STT_SERVICES import config
import helpers
from tatt import vendors
@click.group() @click.group()
@@ -11,50 +15,55 @@ def cli():
@cli.command() @cli.command()
@click.option('-f', '--free-only', is_flag=True) @click.argument('uid', required=False)
def services(free_only): def retrieve(name=None, service=None):
"""Lists available speech-to-text services.""" pending_jobs = [get_service(service_name).get_pending_jobs(name)
print_all_services(free_only) for service_name, data in config.STT_SERVICES
if service is None
or service == service_name]
if not pending_jobs:
click.ClickException('no pending jobs currently!')
for job in pending_jobs:
print(dict(job))
@cli.command() @cli.command()
@click.option('-d', '--dry-run', default=False, help=( @click.option('-f', '--free-only', is_flag=True)
def services(free_only):
"""Lists available speech-to-text services."""
helpers.print_all_services(free_only)
@cli.command()
@click.option('-d', '--dry-run', is_flag=True, help=(
'Do a dry run without actually submitting the media file for transcription')) 'Do a dry run without actually submitting the media file for transcription'))
@click.argument('media_filepath', type=click.File('r')) @click.argument('media_filepath', type=str)
@click.argument('service_name', type=str) @click.argument('service_name', type=str)
def this(dry_run, media_filepath, service_name): def this(dry_run, media_filepath, service_name):
"""Transcribe All The Things!™""" """Transcribe All The Things!™"""
if service_name not in STT_SERVICES: if service_name not in config.STT_SERVICES:
print() print()
raise click.ClickException( raise click.ClickException(
f'No such service! {print_all_services(print_=False)}') f'No such service! {print_all_services(print_=False)}')
service = get_service(service_name)
s = service(media_filepath)
if dry_run: if dry_run:
print('If this weren\'t a dry run, I would transcribe ' print('If this weren\'t a dry run, I would transcribe '
f'{media_filepath.name} using {service_name}') f'{media_filepath} using {service_name}')
print(STT_SERVICES[service_name]) pprint(vars(s))
else: else:
print( print(
f'Okay, transcribing {media_filepath.name} using {service_name}...') f'Okay, transcribing {media_filepath} using {service_name}...')
print(STT_SERVICES[service_name]['function'])
job_num = s.transcribe()
db.create_pending_job(job_num, s.basename, service_name)
print(f'Okay, job {job_num} is being transcribed. Use "retrieve" '
'command to download it.')
def print_all_services(free_only=False, print_=True): def get_service(service_name):
# TODO: make a jinja template for this return getattr(getattr(vendors, service_name), config.SERVICE_CLASS_NAME)
all_services_string = (
'\n\nHere are all the available ' +
f'{"free " if free_only else ""}speech-to-text services:' +
'\n\n' +
'\n'.join(['{}{}{}{}'.format('\t', service_name, '\t\t',
f'({info["free"].replace("_", " ")})'
if isinstance(info["free"], str) else ""
)
for service_name, info in
STT_SERVICES.items()])
+ '\n'
)
if print_:
print(all_services_string)
return all_services_string