Brought over changes from when transcript_processing was nested inside transcribely's back_end package; started refactoring converters into OOP.

This commit is contained in:
2019-02-06 20:57:21 -05:00
parent c9c4cbe550
commit 84fe4d2fd4
19 changed files with 277 additions and 135802 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -1,3 +0,0 @@
{
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1
__init__.py Normal file
View File

@@ -0,0 +1 @@
from transcript_processing.converter import TranscriptConverter

4
config.py Normal file
View File

@@ -0,0 +1,4 @@
import os
# Path of a sample Amazon transcript JSON, read from the environment;
# None when AMAZON_TRANSCRIPT_TEST_FILE is unset.
# NOTE(review): the name suggests it feeds the test suite -- confirm against tests/.
AMAZON_TRANSCRIPT_TEST_FILE = os.getenv('AMAZON_TRANSCRIPT_TEST_FILE')

98
converter.py Normal file
View File

@@ -0,0 +1,98 @@
import abc
import json
from collections import namedtuple
import os
import helpers
from transcript_processing.converters import converters
Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
class TranscriptConverter(abc.ABC):
    """Abstract base class for vendor-specific transcript converters.

    Subclasses supply the per-vendor accessors (get_word_objects,
    get_word_start, ...) and convert_words(); convert() drives the shared
    pipeline: load JSON -> extract word objects -> optionally POS-tag ->
    build self.converted_words.

    Fix: the original declared ``__metaclass__ = abc.ABCMeta``, which is
    Python 2 syntax and is silently ignored by Python 3 (the file uses
    f-strings elsewhere, so it targets Python 3) -- the @abc.abstractmethod
    markers were never enforced.  Inheriting from abc.ABC restores the
    enforcement.
    """

    def __init__(self, path, output_target):
        # path: transcript JSON file to read.
        # output_target: output flavour; only 'interactive_transcript'
        # triggers POS tagging of the words (see convert()).
        self.path = path
        self.output_target = output_target

    def convert(self):
        """Run the conversion pipeline; stores the result on
        self.converted_words (nothing is returned)."""
        tagged_words = None
        with open(self.path) as f:
            data = json.load(f)
        word_objects = self.get_word_objects(data)
        words = self.get_words(word_objects)
        if self.output_target == 'interactive_transcript':
            tagged_words = helpers.tag_words(words)
        self.converted_words = self.convert_words(
            word_objects,
            words,
            tagged_words
        )

    @staticmethod
    @abc.abstractmethod
    def get_word_objects(json_data):
        """Return the vendor's raw word-object sequence from the parsed JSON."""

    @staticmethod
    @abc.abstractmethod
    def get_words(word_objects):
        """Return the plain word strings for *word_objects*."""

    @staticmethod
    @abc.abstractmethod
    def convert_words(word_objects, words, tagged_words=None):
        """Return the list of normalised word dicts."""

    @staticmethod
    @abc.abstractmethod
    def get_word_start(word_object):
        """Return the word's start time."""

    @staticmethod
    @abc.abstractmethod
    def get_word_end(word_object):
        """Return the word's end time."""

    @staticmethod
    @abc.abstractmethod
    def get_word_confidence(word_object):
        """Return the recogniser's confidence for the word."""

    @staticmethod
    @abc.abstractmethod
    def get_word_word(word_object):
        """Return the word's text."""

    @staticmethod
    def check_if_proper_noun(index, tagged_words):
        # Robustness fix: tagged_words is None unless the output target is
        # 'interactive_transcript' (see convert()); the original raised
        # TypeError on `None[index]` in that case.
        if not tagged_words:
            return False
        return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS

    def get_word_object(self, word_object, index, tagged_words, word_objects):
        """Bundle one raw vendor word object into a Word namedtuple."""
        return Word(
            self.get_word_start(word_object),
            self.get_word_end(word_object),
            self.get_word_confidence(word_object),
            self.get_word_word(word_object),
            self.check_if_proper_noun(index, tagged_words),
            self.get_next_word(word_objects, index)
        )

    def get_next_word(self, word_objects, index):
        """Return the raw object following *index*, or None at the end."""
        if index < len(word_objects) - 1:
            return word_objects[index + 1]
        return None

    def to_json(self):
        """Serialise the converted words as pretty-printed JSON."""
        return json.dumps(self.converted_words, indent=4)

    def save(self, path):
        """Write the JSON to *path*; returns *path*."""
        with open(path, 'w') as fout:
            fout.write(self.to_json())
        return path

24
converters/__init__.py Normal file
View File

@@ -0,0 +1,24 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
from transcript_processing.converters.amazon import amazon_converter
from transcript_processing.converters.speechmatics import speechmatics_aligned_text_converter, speechmatics_converter
# Registry mapping a format name to its converter callable; each value
# takes an open transcript file object and returns the converted words
# (looked up as converters[format_name] by TranscriptConverter).
converters = {
    'speechmatics': speechmatics_converter,
    'speechmatics_align': speechmatics_aligned_text_converter,
    'amazon': amazon_converter,
}

146
converters/amazon.py Normal file
View File

@@ -0,0 +1,146 @@
import json
from transcript_processing import helpers
class AmazonConverter(TranscriptConverter):
    """Converter for AWS Transcribe JSON output (results.items)."""

    def __init__(self, path, output_target):
        super().__init__(path, output_target)

    def get_word_objects(self, json_data):
        # Fix: original returned data['results']['items'] -- `data` is
        # undefined here; the parameter is json_data.
        return json_data['results']['items']

    def get_words(self, word_objects):
        # Fix: original had an unbalanced ')' (syntax error).
        return [self.get_word_word(w) for w in word_objects]

    @staticmethod
    def get_word_start(word_object):
        return float(word_object['start_time'])

    @staticmethod
    def get_word_end(word_object):
        return float(word_object['end_time'])

    @staticmethod
    def get_word_confidence(word_object):
        return float(word_object['alternatives'][0]['confidence'])

    @staticmethod
    def get_word_word(word_object):
        # Fix: original read undefined name `w` instead of word_object.
        word_word = word_object['alternatives'][0]['content']
        if word_word == 'i':
            # weird Amazon quirk
            word_word = 'I'
        return word_word

    def convert_words(self, word_objects, words, tagged_words=None):
        """Build the normalised word dicts from Amazon's items.

        Fixes relative to the original:
        - iterate *word_objects* (dicts), not *words* (plain strings):
          the original did `w['type']` on a string, raising TypeError;
        - `next_word_punc_after` is initialised once before the loop --
          the original reset it to None at the top of every iteration,
          so the comma queued after a 'you know' could never be applied.
        """
        converted_words = []
        punc_before = False
        punc_after = False
        index = 0
        next_word_punc_after = None
        for i, w in enumerate(word_objects):
            # Punctuation items are folded into neighbouring words,
            # never emitted on their own.
            if w['type'] == 'punctuation':
                continue
            word_obj = self.get_word_object(w, i, tagged_words, word_objects)
            next_word = None
            next_word_type = None
            if word_obj.next_word:
                next_word = self.get_word_word(word_obj.next_word)
                next_word_type = word_obj.next_word['type']
            if next_word in ('.', ','):
                punc_after = next_word
            elif next_word_punc_after:
                punc_after = next_word_punc_after
                next_word_punc_after = None
            if word_obj.word.lower() == 'you' and next_word == 'know':
                # Heuristic: surround "you know" with commas when not
                # already adjacent to punctuation.
                prev_word = word_objects[i - 1]
                if prev_word['type'] != 'punctuation':
                    converted_words[-1]['punc_after'] = ','
                if next_word_type != 'punctuation':
                    next_word_punc_after = ','
            converted_words.append({
                'start': word_obj.start,
                'end': word_obj.end,
                'confidence': word_obj.confidence,
                'word': word_obj.word,
                'always_capitalized': (
                    word_obj.is_proper_noun
                    or word_obj.word == 'I'),
                'index': index,
                'punc_after': punc_after,
                'punc_before': punc_before,
            })
            index += 1
            punc_after = False
        return converted_words
def amazon_converter(data):
    """Convert an Amazon Transcribe JSON document into normalised word dicts.

    data: an open, readable file object containing the Transcribe JSON.
    (The original annotated it as ``dict`` but immediately called
    json.load() on it, so the annotation was wrong and is dropped.)

    Returns a list of dicts with keys: start, end, confidence, word,
    always_capitalized, index, punc_after, punc_before.
    """
    data = json.load(data)
    converted_words = []
    words = data['results']['items']
    tagged_words = helpers.tag_words(
        [w['alternatives'][0]['content'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
    index = 0
    # Fix: must persist across iterations -- the original reset this to
    # None at the top of every pass, so the comma queued after a
    # 'you know' could never be attached to the following word.
    next_word_punc_after = None
    for i, w in enumerate(words):
        # Punctuation items are folded into neighbouring words.
        if w['type'] == 'punctuation':
            continue
        word_start = float(w['start_time'])
        word_end = float(w['end_time'])
        confidence = float(w['alternatives'][0]['confidence'])
        word = w['alternatives'][0]['content']
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        next_word = None
        next_word_type = None
        if i < num_words - 1:
            next_word = words[i + 1]['alternatives'][0]['content']
            next_word_type = words[i + 1]['type']
        if next_word in ('.', ','):
            punc_after = next_word
        elif next_word_punc_after:
            punc_after = next_word_punc_after
            next_word_punc_after = None
        if word == 'i':
            # weird Amazon quirk
            word = 'I'
        if word.lower() == 'you' and next_word == 'know':
            # Heuristic: surround "you know" with commas when not
            # already adjacent to punctuation.
            prev_word = words[i - 1]
            if prev_word['type'] != 'punctuation':
                converted_words[-1]['punc_after'] = ','
            if next_word_type != 'punctuation':
                next_word_punc_after = ','
        converted_words.append({
            'start': word_start,
            'end': word_end,
            'confidence': confidence,
            'word': word,
            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
            'punc_after': punc_after,
            'punc_before': punc_before,
        })
        index += 1
        punc_after = False
    return converted_words

View File

@@ -1,88 +1,13 @@
"""
fields for converted transcript:
start
end
word
confidence
index
always_capitalized
punc_before
punc_after
"""
 from collections import namedtuple
-from decimal import Decimal
 import json
-from typing import Dict, Union, List
-import helpers
+from transcript_processing import helpers
-def amazon_converter(data: dict):
+Word = namedtuple('Word', 'start end word')
data = json.load(data)
converted_words = []
words = data['results']['items']
tagged_words = helpers.tag_words(
[w['alternatives'][0]['content'] for w in words])
punc_before = False
punc_after = False
num_words = len(words)
index = 0
for i, w in enumerate(words):
if w['type'] == 'punctuation':
continue
next_word_punc_after = None
word_start = float(w['start_time'])
word_end = float(w['end_time'])
confidence = float(w['alternatives'][0]['confidence'])
word = w['alternatives'][0]['content']
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
next_word = None
if i < num_words - 1:
next_word = words[i + 1]['alternatives'][0]['content']
next_word_type = words[i + 1]['type']
if next_word == '.':
punc_after = '.'
elif next_word == ',':
punc_after = ','
elif next_word_punc_after:
punc_after = next_word_punc_after
next_word_punc_after = None
if word == 'i':
# weird Amazon quirk
word = 'I'
if word.lower() == 'you' and next_word == 'know':
prev_word = words[i - 1]
if prev_word['type'] != 'punctuation':
converted_words[-1]['punc_after'] = ','
if next_word_type != 'punctuation':
next_word_punc_after = ','
converted_words.append({
'start': word_start,
'end': word_end,
'confidence': confidence,
'word': word,
'always_capitalized': is_proper_noun or word == 'I',
'index': index,
'punc_after': punc_after,
'punc_before': punc_before,
})
index += 1
punc_after = False
return converted_words
-def speechmatics_converter(data: dict):
+def speechmatics_converter(data):
 data = json.load(data)
 converted_words = []
 words = data['words']
@@ -126,7 +51,6 @@ def speechmatics_converter(data: dict):
 def speechmatics_aligned_text_converter(data):
 data = data.readlines()[0]
-Word = namedtuple('Word', 'start end word')
 class Exhausted(Exception):
 pass
@@ -186,8 +110,4 @@ def speechmatics_aligned_text_converter(data):
 return converted_words
-converters = {
-    'speechmatics': speechmatics_converter,
-    'speechmatics_align': speechmatics_aligned_text_converter,
-    'amazon': amazon_converter,
-}
+def gentle_converter

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,20 +0,0 @@
import json
import os
from converters import converters
class TranscriptConverter:
    """Parse a transcript file with a registered converter and dump it as JSON."""

    def __init__(self, path, format_name):
        # Parse eagerly on construction; `converters` maps a format name
        # to its converter callable, which receives the open file.
        self.path = path
        with open(path) as source:
            self.words = converters[format_name](source)

    def to_json(self):
        """Return the converted words as pretty-printed JSON text."""
        return json.dumps(self.words, indent=4)

    def save(self):
        """Write the JSON to '<basename>_processed.json' in the CWD."""
        base = os.path.basename(self.path).split('.json')[0]
        with open(f'{base}_processed.json', 'w') as out_file:
            out_file.write(self.to_json())

0
tests/__init__.py Normal file
View File

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff