Created a converter for Speechmatics word timing
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,27 +2,30 @@
|
|||||||
|
|
||||||
fields for converted transcript:
|
fields for converted transcript:
|
||||||
|
|
||||||
wordStart
|
start
|
||||||
wordEnd
|
end
|
||||||
word
|
word
|
||||||
confidence
|
confidence
|
||||||
index
|
index
|
||||||
alwaysCapitalized
|
always_capitalized
|
||||||
puncBefore
|
punc_before
|
||||||
puncAfter
|
punc_after
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
|
import json
|
||||||
from typing import Dict, Union, List
|
from typing import Dict, Union, List
|
||||||
|
|
||||||
from helpers import tag_words, PROPER_NOUN_TAGS
|
import helpers
|
||||||
|
|
||||||
|
|
||||||
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
|
def speechmatics_converter(data: dict):
|
||||||
|
data = json.load(data)
|
||||||
converted_words = []
|
converted_words = []
|
||||||
words = data['words']
|
words = data['words']
|
||||||
tagged_words = tag_words([w['name'] for w in words])
|
tagged_words = helpers.tag_words([w['name'] for w in words])
|
||||||
punc_before = False
|
punc_before = False
|
||||||
punc_after = False
|
punc_after = False
|
||||||
num_words = len(words)
|
num_words = len(words)
|
||||||
@@ -35,7 +38,7 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
word = w['name']
|
word = w['name']
|
||||||
if word == '.':
|
if word == '.':
|
||||||
continue
|
continue
|
||||||
is_proper_noun = tagged_words[i][1] in PROPER_NOUN_TAGS
|
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
|
||||||
|
|
||||||
next_word = None
|
next_word = None
|
||||||
if i < num_words - 1:
|
if i < num_words - 1:
|
||||||
@@ -44,14 +47,14 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
punc_after = '.'
|
punc_after = '.'
|
||||||
|
|
||||||
converted_words.append({
|
converted_words.append({
|
||||||
'wordStart': word_start,
|
'start': word_start,
|
||||||
'wordEnd': word_end,
|
'end': word_end,
|
||||||
'confidence': confidence,
|
'confidence': confidence,
|
||||||
'word': word,
|
'word': word,
|
||||||
'alwaysCapitalized': is_proper_noun or word == 'I',
|
'always_capitalized': is_proper_noun or word == 'I',
|
||||||
'index': index,
|
'index': index,
|
||||||
'puncAfter': punc_after,
|
'punc_after': punc_after,
|
||||||
'puncBefore': punc_before,
|
'punc_before': punc_before,
|
||||||
})
|
})
|
||||||
|
|
||||||
index += 1
|
index += 1
|
||||||
@@ -60,6 +63,69 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
return converted_words
|
return converted_words
|
||||||
|
|
||||||
|
|
||||||
|
def speechmatics_aligned_text_converter(data):
|
||||||
|
data = data.readlines()[0]
|
||||||
|
Word = namedtuple('Word', 'start end word')
|
||||||
|
|
||||||
|
class Exhausted(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_time(transcript, index):
|
||||||
|
time_index = transcript.find('time=', index)
|
||||||
|
if time_index == -1:
|
||||||
|
raise Exhausted
|
||||||
|
close_index = transcript.find('>', time_index)
|
||||||
|
return float(transcript[time_index + 5: close_index]), close_index
|
||||||
|
|
||||||
|
def find_next_word(transcript, start_index):
|
||||||
|
start, end_of_start_index = get_time(transcript, start_index)
|
||||||
|
|
||||||
|
word_start_index = end_of_start_index + 1
|
||||||
|
word_end_index = transcript.find('<', word_start_index)
|
||||||
|
word = transcript[word_start_index: word_end_index]
|
||||||
|
|
||||||
|
end, close_index = get_time(transcript, word_end_index)
|
||||||
|
|
||||||
|
return Word(start, end, word), close_index
|
||||||
|
|
||||||
|
words = []
|
||||||
|
next_index = 0
|
||||||
|
word = None
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
word, next_index = find_next_word(data, next_index)
|
||||||
|
except Exhausted:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
words.append(word)
|
||||||
|
|
||||||
|
tagged_words = helpers.tag_words([w.word for w in words])
|
||||||
|
converted_words = []
|
||||||
|
|
||||||
|
for i, word in enumerate(words):
|
||||||
|
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
|
||||||
|
punc_before = helpers.get_punc_before(word.word)
|
||||||
|
punc_after = helpers.get_punc_after(word.word)
|
||||||
|
the_word = word.word
|
||||||
|
if punc_before or punc_after:
|
||||||
|
for p in helpers.PUNCTUATION:
|
||||||
|
the_word = the_word.replace(p, '')
|
||||||
|
converted_words.append({
|
||||||
|
'start': word.start,
|
||||||
|
'end': word.end,
|
||||||
|
'confidence': 1,
|
||||||
|
'word': the_word,
|
||||||
|
'always_capitalized': is_proper_noun or word == 'I',
|
||||||
|
'index': i,
|
||||||
|
'punc_before': punc_before,
|
||||||
|
'punc_after': punc_after,
|
||||||
|
})
|
||||||
|
|
||||||
|
return converted_words
|
||||||
|
|
||||||
|
|
||||||
converters = {
|
converters = {
|
||||||
'speechmatics': speechmatics_converter,
|
'speechmatics': speechmatics_converter,
|
||||||
|
'speechmatics_align': speechmatics_aligned_text_converter,
|
||||||
}
|
}
|
||||||
|
|||||||
43845
fifty_min.json
Normal file
43845
fifty_min.json
Normal file
File diff suppressed because it is too large
Load Diff
68542
fifty_min_processed.json
Normal file
68542
fifty_min_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
20
helpers.py
20
helpers.py
@@ -6,6 +6,8 @@ st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
|
|||||||
|
|
||||||
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
||||||
|
|
||||||
|
PUNCTUATION = ['.', '?', ',', ':', '"', '!']
|
||||||
|
|
||||||
|
|
||||||
def tag_words(words):
|
def tag_words(words):
|
||||||
return st.tag(words)
|
return st.tag(words)
|
||||||
@@ -15,3 +17,21 @@ def is_a_proper_noun(phrase):
|
|||||||
tagged_words = tag_words(phrase.split())
|
tagged_words = tag_words(phrase.split())
|
||||||
return any(tagged_word[1] in PROPER_NOUN_TAGS
|
return any(tagged_word[1] in PROPER_NOUN_TAGS
|
||||||
for tagged_word in tagged_words)
|
for tagged_word in tagged_words)
|
||||||
|
|
||||||
|
|
||||||
|
def get_punc_before(word):
|
||||||
|
punc = []
|
||||||
|
for char in word:
|
||||||
|
if char.isalpha():
|
||||||
|
return punc
|
||||||
|
if char in PUNCTUATION:
|
||||||
|
punc.append(char)
|
||||||
|
|
||||||
|
|
||||||
|
def get_punc_after(word):
|
||||||
|
punc = []
|
||||||
|
for char in reversed(word):
|
||||||
|
if char.isalpha():
|
||||||
|
return punc
|
||||||
|
if char in PUNCTUATION:
|
||||||
|
punc.insert(0, char)
|
||||||
2589
leland_transcript.json
Normal file
2589
leland_transcript.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,8 +8,8 @@ class TranscriptConverter:
|
|||||||
|
|
||||||
def __init__(self, path, format_name):
|
def __init__(self, path, format_name):
|
||||||
self.path = path
|
self.path = path
|
||||||
with open(path, 'r') as fin:
|
with open(path) as f:
|
||||||
self.words = converters[format_name](json.load(fin))
|
self.words = converters[format_name](f)
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
return json.dumps(self.words, indent=4)
|
return json.dumps(self.words, indent=4)
|
||||||
|
|||||||
2115
two_min.json
Normal file
2115
two_min.json
Normal file
File diff suppressed because it is too large
Load Diff
3332
two_min_processed.json
Normal file
3332
two_min_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user