Brought over changes from when transcript_processing was nested inside transcribely's back_end package; started refactoring converters into OOP.

This commit is contained in:
2019-02-06 20:57:21 -05:00
parent c9c4cbe550
commit 84fe4d2fd4
19 changed files with 277 additions and 135802 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -1,3 +0,0 @@
{
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1
__init__.py Normal file
View File

@@ -0,0 +1 @@
from transcript_processing.converter import TranscriptConverter

4
config.py Normal file
View File

@@ -0,0 +1,4 @@
import os
# Path of a sample Amazon transcript JSON, read from the environment;
# None when AMAZON_TRANSCRIPT_TEST_FILE is unset.
# NOTE(review): the name suggests it feeds the test suite -- confirm against tests/.
AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')

98
converter.py Normal file
View File

@@ -0,0 +1,98 @@
import abc
import json
from collections import namedtuple
import os
import helpers
from transcript_processing.converters import converters
Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
class TranscriptConverter(abc.ABC):
    """Abstract base class for vendor-specific transcript converters.

    Subclasses supply the per-vendor accessors (get_word_objects,
    get_word_start, ...) and convert_words(); convert() drives the shared
    pipeline: load JSON -> extract word objects -> optionally POS-tag ->
    build self.converted_words.

    Fix: the original declared ``__metaclass__ = abc.ABCMeta``, which is
    Python 2 syntax and is silently ignored by Python 3 (the file uses
    f-strings elsewhere, so it targets Python 3) -- the @abc.abstractmethod
    markers were never enforced.  Inheriting from abc.ABC restores the
    enforcement.
    """

    def __init__(self, path, output_target):
        # path: transcript JSON file to read.
        # output_target: output flavour; only 'interactive_transcript'
        # triggers POS tagging of the words (see convert()).
        self.path = path
        self.output_target = output_target

    def convert(self):
        """Run the conversion pipeline; stores the result on
        self.converted_words (nothing is returned)."""
        tagged_words = None
        with open(self.path) as f:
            data = json.load(f)
        word_objects = self.get_word_objects(data)
        words = self.get_words(word_objects)
        if self.output_target == 'interactive_transcript':
            tagged_words = helpers.tag_words(words)
        self.converted_words = self.convert_words(
            word_objects,
            words,
            tagged_words
        )

    @staticmethod
    @abc.abstractmethod
    def get_word_objects(json_data):
        """Return the vendor's raw word-object sequence from the parsed JSON."""

    @staticmethod
    @abc.abstractmethod
    def get_words(word_objects):
        """Return the plain word strings for *word_objects*."""

    @staticmethod
    @abc.abstractmethod
    def convert_words(word_objects, words, tagged_words=None):
        """Return the list of normalised word dicts."""

    @staticmethod
    @abc.abstractmethod
    def get_word_start(word_object):
        """Return the word's start time."""

    @staticmethod
    @abc.abstractmethod
    def get_word_end(word_object):
        """Return the word's end time."""

    @staticmethod
    @abc.abstractmethod
    def get_word_confidence(word_object):
        """Return the recogniser's confidence for the word."""

    @staticmethod
    @abc.abstractmethod
    def get_word_word(word_object):
        """Return the word's text."""

    @staticmethod
    def check_if_proper_noun(index, tagged_words):
        # Robustness fix: tagged_words is None unless the output target is
        # 'interactive_transcript' (see convert()); the original raised
        # TypeError on `None[index]` in that case.
        if not tagged_words:
            return False
        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS

    def get_word_object(self, word_object, index, tagged_words, word_objects):
        """Bundle one raw vendor word object into a Word namedtuple."""
        return Word(
            self.get_word_start(word_object),
            self.get_word_end(word_object),
            self.get_word_confidence(word_object),
            self.get_word_word(word_object),
            self.check_if_proper_noun(index, tagged_words),
            self.get_next_word(word_objects, index)
        )

    def get_next_word(self, word_objects, index):
        """Return the raw object following *index*, or None at the end."""
        if index < len(word_objects) - 1:
            return word_objects[index + 1]
        return None

    def to_json(self):
        """Serialise the converted words as pretty-printed JSON."""
        return json.dumps(self.converted_words, indent=4)

    def save(self, path):
        """Write the JSON to *path*; returns *path*."""
        with open(path, 'w') as fout:
            fout.write(self.to_json())
        return path

24
converters/__init__.py Normal file
View File

@@ -0,0 +1,24 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
from transcript_processing.converters.amazon import amazon_converter
from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
# Registry mapping a format name to its converter callable; each value
# takes an open transcript file object and returns the converted words
# (looked up as converters[format_name] by TranscriptConverter).
converters = {
    'speechmatics': speechmatics_converter,
    'speechmatics_align': speechmatics_aligned_text_converter,
    'amazon': amazon_converter,
}

146
converters/amazon.py Normal file
View File

@@ -0,0 +1,146 @@
import json
from transcript_processing import helpers
class AmazonConverter(TranscriptConverter):
    """Converter for AWS Transcribe JSON output (results.items)."""

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        # Fix: original returned data['results']['items'] -- `data` is
        # undefined here; the parameter is json_data.
        return json_data['results']['items']

    def get_words(self, word_objects):
        # Fix: original had an unbalanced ')' (syntax error).
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        # Fix: original read undefined name `w` instead of word_object.
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the normalised word dicts from Amazon's items.

        Fixes relative to the original:
        - iterate *word_objects* (dicts), not *words* (plain strings):
          the original did `w['type']` on a string, raising TypeError;
        - `next_word_punc_after` is initialised once before the loop --
          the original reset it to None at the top of every iteration,
          so the comma queued after a 'you know' could never be applied.
        """
        converted_words = []
        punc_before = False
        punc_after = False
        index = 0
        next_word_punc_after = None
        for i, w in enumerate(word_objects):
            # Punctuation items are folded into neighbouring words,
            # never emitted on their own.
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            next_word = None
            next_word_type = None
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
            if next_word in ('.', ','):
                punc_after = next_word
            elif next_word_punc_after:
                punc_after = next_word_punc_after
                next_word_punc_after = None
            if word_obj.word.lower() == 'you' and next_word == 'know':
                # Heuristic: surround "you know" with commas when not
                # already adjacent to punctuation.
                prev_word = word_objects[i - 1]
                if prev_word['type'] != 'punctuation':
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
                    next_word_punc_after = ','
            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
            index += 1
            punc_after = False
        return converted_words
def amazon_converter(data):
    """Convert an Amazon Transcribe JSON document into normalised word dicts.

    data: an open, readable file object containing the Transcribe JSON.
    (The original annotated it as ``dict`` but immediately called
    json.load() on it, so the annotation was wrong and is dropped.)

    Returns a list of dicts with keys: start, end, confidence, word,
    always_capitalized, index, punc_after, punc_before.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
    index = 0
    # Fix: must persist across iterations -- the original reset this to
    # None at the top of every pass, so the comma queued after a
    # 'you know' could never be attached to the following word.
    next_word_punc_after = None
    for i, w in enumerate(words):
        # Punctuation items are folded into neighbouring words.
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        next_word = None
        next_word_type = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
        if next_word in ('.', ','):
            punc_after = next_word
        elif next_word_punc_after:
            punc_after = next_word_punc_after
            next_word_punc_after = None
        if word == 'i':
            # weird Amazon quirk
            word = 'I'
        if word.lower() == 'you' and next_word == 'know':
            # Heuristic: surround "you know" with commas when not
            # already adjacent to punctuation.
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                next_word_punc_after = ','
        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })
        index += 1
        punc_after = False
    return converted_words

View File

@@ -1,88 +1,13 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
 from collections import namedtuple
-from decimal import Decimal
 import json
-from typing import Dict, Union, List
-import helpers
+from transcript_processing import helpers
-def amazon_converter(data: dict):
+Word = namedtuple('Word', 'start end word')
data = json.load(data)
converted_words = []
words = data['results']['items']
tagged_words = helpers.tag_words(
[w['alternatives'][0]['content'] for w in words])
punc_before = False
punc_after = False
num_words = len(words)
index = 0
for i, w in enumerate(words):
if w['type'] == 'punctuation':
continue
next_word_punc_after = None
word_start = float(w['start_time'])
word_end = float(w['end_time'])
confidence = float(w['alternatives'][0]['confidence'])
word = w['alternatives'][0]['content']
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
next_word = None
if i < num_words - 1:
next_word = words[i + 1]['alternatives'][0]['content']
next_word_type = words[i + 1]['type']
if next_word == '.':
punc_after = '.'
elif next_word == ',':
punc_after = ','
elif next_word_punc_after:
punc_after = next_word_punc_after
next_word_punc_after = None
if word == 'i':
# weird Amazon quirk
word = 'I'
if word.lower() == 'you' and next_word == 'know':
prev_word = words[i - 1]
if prev_word['type'] != 'punctuation':
converted_words[-1]['punc_after'] = ','
if next_word_type != 'punctuation':
next_word_punc_after = ','
converted_words.append({
'start': word_start,
'end': word_end,
'confidence': confidence,
'word': word,
'always_capitalized': is_proper_noun or word == 'I',
'index': index,
'punc_after': punc_after,
'punc_before': punc_before,
})
index += 1
punc_after = False
return converted_words
-def speechmatics_converter(data: dict):
+def speechmatics_converter(data):
 data = json.load(data)
 converted_words = []
 words = data['words']
@@ -126,7 +51,6 @@ def speechmatics_converter(data: dict):
 def speechmatics_aligned_text_converter(data):
 data = data.readlines()[0]
-Word = namedtuple('Word', 'start end word')
 class Exhausted(Exception):
 pass
@@ -186,8 +110,4 @@ def speechmatics_aligned_text_converter(data):
 return converted_words
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
+def gentle_converter

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,20 +0,0 @@
import json
import os
from converters import converters
class TranscriptConverter:
    """Parse a transcript file with a registered converter and dump it as JSON."""

    def __init__(self, path, format_name):
        # Parse eagerly on construction; `converters` maps a format name
        # to its converter callable, which receives the open file.
        self.path = path
        with open(path) as source:
            self.words = converters[format_name](source)

    def to_json(self):
        """Return the converted words as pretty-printed JSON text."""
        return json.dumps(self.words, indent=4)

    def save(self):
        """Write the JSON to '<basename>_processed.json' in the CWD."""
        base = os.path.basename(self.path).split('.json')[0]
        with open(f'{base}_processed.json', 'w') as out_file:
            out_file.write(self.to_json())

0
tests/__init__.py Normal file
View File

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff