Brought over changes from when transcript_processing was nested inside transcribely's back_end package. Started refactoring the converters into an OOP design.
This commit is contained in:
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
@@ -1,3 +0,0 @@
|
||||
{
|
||||
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
|
||||
}
|
||||
4915
Lelandmp3.json
4915
Lelandmp3.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1
__init__.py
Normal file
1
__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from transcript_processing.converter import TranscriptConverter
|
||||
4
config.py
Normal file
4
config.py
Normal file
@@ -0,0 +1,4 @@
|
||||
import os
|
||||
|
||||
|
||||
AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')
|
||||
98
converter.py
Normal file
98
converter.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import abc
|
||||
import json
|
||||
from collections import namedtuple
|
||||
import os
|
||||
|
||||
import helpers
|
||||
from transcript_processing.converters import converters
|
||||
|
||||
|
||||
|
||||
# A single recognized word: timing, confidence, text, a proper-noun flag and a
# reference to the next raw word object (None for the final word).
Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')


class TranscriptConverter(abc.ABC):
    """Abstract base class for service-specific transcript converters.

    Subclasses supply the per-service accessors (``get_word_objects``,
    ``get_words``, ``get_word_start`` ...) plus ``convert_words``; this class
    drives the load -> tag -> convert pipeline and provides JSON output.

    NOTE: the original ``__metaclass__ = abc.ABCMeta`` is Python 2 syntax and
    a silent no-op on Python 3, so the abstract methods were never enforced;
    inheriting from ``abc.ABC`` restores that enforcement.
    """

    def __init__(self, path, output_target):
        # path: JSON transcript file emitted by the transcription service.
        # output_target: 'interactive_transcript' enables part-of-speech
        # tagging so proper nouns can be marked always-capitalized.
        self.path = path
        self.output_target = output_target

    def convert(self):
        """Read ``self.path`` and populate ``self.converted_words``."""
        tagged_words = None

        with open(self.path) as f:
            data = json.load(f)
            word_objects = self.get_word_objects(data)
            words = self.get_words(word_objects)

            if self.output_target == 'interactive_transcript':
                tagged_words = helpers.tag_words(words)

            self.converted_words = self.convert_words(
                word_objects,
                words,
                tagged_words
            )

    @staticmethod
    @abc.abstractmethod
    def get_word_objects(json_data):
        """Return the raw per-word objects from the service's JSON payload."""

    @staticmethod
    @abc.abstractmethod
    def get_words(word_objects):
        """Return the plain word strings for the given raw word objects."""

    @staticmethod
    @abc.abstractmethod
    def convert_words(word_objects, words, tagged_words=None):
        """Return the list of converted word dicts."""

    @staticmethod
    @abc.abstractmethod
    def get_word_start(word_object):
        """Return the word's start time in seconds."""

    @staticmethod
    @abc.abstractmethod
    def get_word_end(word_object):
        """Return the word's end time in seconds."""

    @staticmethod
    @abc.abstractmethod
    def get_word_confidence(word_object):
        """Return the recognizer's confidence score for the word."""

    @staticmethod
    @abc.abstractmethod
    def get_word_word(word_object):
        """Return the word's text."""

    @staticmethod
    def check_if_proper_noun(index, tagged_words):
        # BUG FIX: convert() passes tagged_words=None on the
        # non-'interactive_transcript' path, which previously raised
        # TypeError here; treat "no tagging" as "not a proper noun".
        if not tagged_words:
            return False
        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS

    def get_word_object(self, word_object, index, tagged_words, word_objects):
        """Build a ``Word`` namedtuple from one raw service word object."""
        return Word(
            self.get_word_start(word_object),
            self.get_word_end(word_object),
            self.get_word_confidence(word_object),
            self.get_word_word(word_object),
            self.check_if_proper_noun(index, tagged_words),
            self.get_next_word(word_objects, index)
        )

    def get_next_word(self, word_objects, index):
        """Return the raw word object after ``index``, or None at the end."""
        if index < len(word_objects) - 1:
            return word_objects[index + 1]

    def to_json(self):
        """Serialize the converted words as pretty-printed JSON."""
        return json.dumps(self.converted_words, indent=4)

    def save(self, path):
        """Write the JSON serialization to ``path`` and return ``path``."""
        with open(path, 'w') as fout:
            fout.write(self.to_json())
        return path
||||
24
converters/__init__.py
Normal file
24
converters/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""
|
||||
|
||||
fields for converted transcript:
|
||||
|
||||
start
|
||||
end
|
||||
word
|
||||
confidence
|
||||
index
|
||||
always_capitalized
|
||||
punc_before
|
||||
punc_after
|
||||
|
||||
"""
|
||||
|
||||
from transcript_processing.converters.amazon import amazon_converter
|
||||
from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
|
||||
|
||||
|
||||
converters = {
|
||||
'speechmatics': speechmatics_converter,
|
||||
'speechmatics_align': speechmatics_aligned_text_converter,
|
||||
'amazon': amazon_converter,
|
||||
}
|
||||
146
converters/amazon.py
Normal file
146
converters/amazon.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import json

from transcript_processing import helpers
# Required: AmazonConverter subclasses TranscriptConverter, which this module
# previously referenced without importing (NameError at import time).
from transcript_processing.converter import TranscriptConverter
|
||||
|
||||
|
||||
|
||||
class AmazonConverter(TranscriptConverter):
    """Converter for Amazon Transcribe JSON output.

    Amazon interleaves punctuation items (``'type': 'punctuation'``) with
    pronunciation items in ``results.items``; conversion skips the
    punctuation items and folds them into the preceding word's
    ``punc_after`` field.
    """

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        # BUG FIX: originally read ``data['results']['items']`` where
        # ``data`` was an undefined name.
        return json_data['results']['items']

    def get_words(self, word_objects):
        # BUG FIX: the original line ended with an unbalanced ')' —
        # a SyntaxError.
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        """Return the start time in seconds (Amazon stores it as a string)."""
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        """Return the end time in seconds (Amazon stores it as a string)."""
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        """Return the top alternative's confidence as a float."""
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        """Return the top alternative's text, normalizing bare 'i' to 'I'."""
        # BUG FIX: originally read ``w[...]`` where ``w`` was undefined.
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the converted word dicts from the raw Amazon items.

        BUG FIX: the original iterated ``words`` (plain strings from
        ``get_words``) yet subscripted each element as a dict
        (``w['type']``), which raises TypeError; the loop must walk the raw
        ``word_objects``. Indices still line up with ``tagged_words``
        because ``get_words`` maps over the same sequence.
        """
        converted_words = []

        punc_before = False
        punc_after = False
        # BUG FIX: this flag must survive across iterations; it was reset to
        # None at the top of every pass, so the comma queued after
        # "you know" could never reach the following word.
        next_word_punc_after = None
        index = 0

        for i, w in enumerate(word_objects):
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)

            next_word = None
            next_word_type = None
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
                if next_word in ['.', ',']:
                    punc_after = next_word
                elif next_word_punc_after:
                    punc_after = next_word_punc_after
                    next_word_punc_after = None

            # Heuristic: insert commas around the filler phrase "you know"
            # when Amazon did not emit punctuation there itself.
            if word_obj.word.lower() == 'you' and next_word == 'know':
                prev_word = word_objects[i - 1]
                if prev_word['type'] != 'punctuation':
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
                    next_word_punc_after = ','

            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })

            index += 1
            punc_after = False

        return converted_words
|
||||
|
||||
|
||||
def amazon_converter(data):
    """Convert an open Amazon Transcribe JSON file to converted-word dicts.

    ``data`` is a readable file-like object (it is handed straight to
    ``json.load``), not a dict as the original annotation claimed.
    Punctuation items are skipped and folded into the preceding word's
    ``punc_after`` field. Returns a list of dicts with the fields
    start/end/confidence/word/always_capitalized/index/punc_before/punc_after.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    # BUG FIX: this flag must persist across iterations; it was re-set to
    # None at the top of every loop pass, so the comma queued after
    # "you know" never reached the following word.
    next_word_punc_after = None
    num_words = len(words)
    index = 0

    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
            if next_word == '.':
                punc_after = '.'
            elif next_word == ',':
                punc_after = ','
            elif next_word_punc_after:
                punc_after = next_word_punc_after
                next_word_punc_after = None

        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        # Heuristic: insert commas around the filler phrase "you know" when
        # Amazon did not emit punctuation there itself.
        if word.lower() == 'you' and next_word == 'know':
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                next_word_punc_after = ','

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })

        index += 1
        punc_after = False

    return converted_words
|
||||
@@ -1,88 +1,13 @@
|
||||
"""
|
||||
|
||||
fields for converted transcript:
|
||||
|
||||
start
|
||||
end
|
||||
word
|
||||
confidence
|
||||
index
|
||||
always_capitalized
|
||||
punc_before
|
||||
punc_after
|
||||
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
from decimal import Decimal
|
||||
import json
|
||||
from typing import Dict, Union, List
|
||||
|
||||
import helpers
|
||||
from transcript_processing import helpers
|
||||
|
||||
|
||||
def amazon_converter(data):
    """Convert an open Amazon Transcribe JSON file to converted-word dicts.

    ``data`` is a readable file-like object (it is handed straight to
    ``json.load``), not a dict as the original annotation claimed.
    Punctuation items are skipped and folded into the preceding word's
    ``punc_after`` field.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    # BUG FIX: this flag must persist across iterations; it was re-set to
    # None inside the loop, so the comma queued after "you know" never
    # reached the following word.
    next_word_punc_after = None
    num_words = len(words)
    index = 0

    for i, w in enumerate(words):
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
            if next_word == '.':
                punc_after = '.'
            elif next_word == ',':
                punc_after = ','
            elif next_word_punc_after:
                punc_after = next_word_punc_after
                next_word_punc_after = None

        if word == 'i':
            # weird Amazon quirk
            word = 'I'

        # Heuristic: insert commas around the filler phrase "you know" when
        # Amazon did not emit punctuation there itself.
        if word.lower() == 'you' and next_word == 'know':
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                next_word_punc_after = ','

        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })

        index += 1
        punc_after = False

    return converted_words
|
||||
Word = namedtuple('Word', 'start end word')
|
||||
|
||||
|
||||
def speechmatics_converter(data: dict):
|
||||
def speechmatics_converter(data):
|
||||
data = json.load(data)
|
||||
converted_words = []
|
||||
words = data['words']
|
||||
@@ -126,7 +51,6 @@ def speechmatics_converter(data: dict):
|
||||
|
||||
def speechmatics_aligned_text_converter(data):
|
||||
data = data.readlines()[0]
|
||||
Word = namedtuple('Word', 'start end word')
|
||||
|
||||
class Exhausted(Exception):
|
||||
pass
|
||||
@@ -186,8 +110,4 @@ def speechmatics_aligned_text_converter(data):
|
||||
return converted_words
|
||||
|
||||
|
||||
converters = {
|
||||
'speechmatics': speechmatics_converter,
|
||||
'speechmatics_align': speechmatics_aligned_text_converter,
|
||||
'amazon': amazon_converter,
|
||||
}
|
||||
def gentle_converter
|
||||
43845
fifty_min.json
43845
fifty_min.json
File diff suppressed because it is too large
Load Diff
68542
fifty_min_processed.json
68542
fifty_min_processed.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
20
models.py
20
models.py
@@ -1,20 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from converters import converters
|
||||
|
||||
|
||||
class TranscriptConverter:
    """Loads a transcript file and converts it with a registered converter.

    ``format_name`` selects a callable from the module-level ``converters``
    mapping; the converted word list is kept on ``self.words``.
    """

    def __init__(self, path, format_name):
        self.path = path
        convert = converters[format_name]
        with open(path) as source:
            self.words = convert(source)

    def to_json(self):
        """Return the converted words as pretty-printed JSON."""
        return json.dumps(self.words, indent=4)

    def save(self):
        """Write the JSON to ``<basename>_processed.json`` in the cwd."""
        base = os.path.basename(self.path).split('.json')[0]
        name = f"{base}_processed.json"
        with open(name, 'w') as fout:
            fout.write(self.to_json())
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
2643
transcript.json
2643
transcript.json
File diff suppressed because it is too large
Load Diff
2115
two_min.json
2115
two_min.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user