created converter for speechmatics timing

2018-11-22 03:56:03 -05:00
parent 4da2317db8
commit d5a37df5a8
12 changed files with 120525 additions and 16 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/pycache/converters.cpython-36.pyc
+++ b/pycache/converters.cpython-36.pyc
--- a/pycache/helpers.cpython-36.pyc
+++ b/pycache/helpers.cpython-36.pyc
--- a/pycache/models.cpython-36.pyc
+++ b/pycache/models.cpython-36.pyc
--- a/converters.py
+++ b/converters.py
@@ -2,27 +2,30 @@

 fields for converted transcript:

-    wordStart
-    wordEnd
+    start
+    end
    word
    confidence
    index
-    alwaysCapitalized
-    puncBefore
-    puncAfter
+    always_capitalized
+    punc_before
+    punc_after

 """

+from collections import namedtuple
 from decimal import Decimal
+import json
 from typing import Dict, Union, List

-from helpers import tag_words, PROPER_NOUN_TAGS
+import helpers


-def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
+def speechmatics_converter(data: dict):
+    data = json.load(data)
    converted_words = []
    words = data['words']
-    tagged_words = tag_words([w['name'] for w in words])
+    tagged_words = helpers.tag_words([w['name'] for w in words])
    punc_before = False
    punc_after = False
    num_words = len(words)
@@ -35,7 +38,7 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
        word = w['name']
        if word == '.':
            continue
-        is_proper_noun = tagged_words[i][1] in PROPER_NOUN_TAGS
+        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS

        next_word = None
        if i < num_words - 1:
@@ -44,14 +47,14 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
            punc_after = '.'

        converted_words.append({
-            'wordStart': word_start,
-            'wordEnd': word_end,
+            'start': word_start,
+            'end': word_end,
            'confidence': confidence,
            'word': word,
-            'alwaysCapitalized': is_proper_noun or word == 'I',
+            'always_capitalized': is_proper_noun or word == 'I',
            'index': index,
-            'puncAfter': punc_after,
-            'puncBefore': punc_before,
+            'punc_after': punc_after,
+            'punc_before': punc_before,
        })

        index += 1
@@ -60,6 +63,69 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
    return converted_words


+def speechmatics_aligned_text_converter(data):
+    """Convert a Speechmatics aligned-text transcript into word dicts.
+
+    Only the first line of ``data`` is parsed. It is scanned for
+    ``time=<number>>`` markers: each word is the text between the ``>``
+    closing one marker and the next ``<``. The marker before the word
+    supplies its start time and the marker after it supplies its end time.
+
+    Args:
+        data: An open text file object (anything providing ``readlines()``).
+
+    Returns:
+        A list of dicts, one per word, with the keys ``start``, ``end``,
+        ``confidence`` (always 1), ``word``, ``always_capitalized``,
+        ``index``, ``punc_before`` and ``punc_after``. When a word has
+        leading or trailing punctuation, every ``helpers.PUNCTUATION``
+        character is removed from ``word``. An empty file, or one without
+        any complete word, yields an empty list.
+    """
+    Word = namedtuple('Word', 'start end word')
+
+    class Exhausted(Exception):
+        """Raised when no further ``time=`` marker can be found."""
+
+    def get_time(transcript, index):
+        """Return (time, index of the closing '>') for the next marker."""
+        time_index = transcript.find('time=', index)
+        if time_index == -1:
+            raise Exhausted
+        close_index = transcript.find('>', time_index)
+        # 5 == len('time='): skip the label to reach the number itself.
+        return float(transcript[time_index + 5: close_index]), close_index
+
+    def find_next_word(transcript, start_index):
+        """Return (Word, index of the '>' closing the word's end marker)."""
+        start, end_of_start_index = get_time(transcript, start_index)
+
+        word_start_index = end_of_start_index + 1
+        word_end_index = transcript.find('<', word_start_index)
+        text = transcript[word_start_index: word_end_index]
+
+        end, close_index = get_time(transcript, word_end_index)
+
+        return Word(start, end, text), close_index
+
+    # An empty file has no first line; treat it as an empty transcript
+    # instead of failing with IndexError.
+    lines = data.readlines()
+    first_line = lines[0] if lines else ''
+
+    words = []
+    next_index = 0
+
+    while True:
+        try:
+            parsed, next_index = find_next_word(first_line, next_index)
+        except Exhausted:
+            break
+        words.append(parsed)
+
+    if not words:
+        # Nothing to tag, so skip the tagger call entirely.
+        return []
+
+    tagged_words = helpers.tag_words([w.word for w in words])
+    converted_words = []
+
+    for i, word in enumerate(words):
+        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
+        # Normalise a missing result (None) to an empty list so these
+        # fields always hold a list in the output.
+        punc_before = helpers.get_punc_before(word.word) or []
+        punc_after = helpers.get_punc_after(word.word) or []
+        the_word = word.word
+        if punc_before or punc_after:
+            for p in helpers.PUNCTUATION:
+                the_word = the_word.replace(p, '')
+        converted_words.append({
+            'start': word.start,
+            'end': word.end,
+            'confidence': 1,
+            'word': the_word,
+            # Compare the word's text, not the Word tuple: a tuple never
+            # equals 'I', so the pronoun would never be flagged.
+            'always_capitalized': is_proper_noun or the_word == 'I',
+            'index': i,
+            'punc_before': punc_before,
+            'punc_after': punc_after,
+        })
+
+    return converted_words
+
+
 # Registry mapping a transcript format name to its converter function.
 converters = {
    'speechmatics': speechmatics_converter,
+    'speechmatics_align': speechmatics_aligned_text_converter,
 }
--- a/fifty_min.json
+++ b/fifty_min.json
--- a/fifty_min_processed.json
+++ b/fifty_min_processed.json
--- a/helpers.py
+++ b/helpers.py
@@ -6,6 +6,8 @@ st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',

 PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']

+PUNCTUATION = ['.', '?', ',', ':', '"', '!']
+

 def tag_words(words):
    """Tag ``words`` with the module-level tagger ``st`` and return its result."""
    return st.tag(words)
@@ -15,3 +17,21 @@ def is_a_proper_noun(phrase):
    tagged_words = tag_words(phrase.split())
    return any(tagged_word[1] in PROPER_NOUN_TAGS
               for tagged_word in tagged_words)
+
+
+def get_punc_before(word):
+    """Return the punctuation that precedes the first letter of ``word``.
+
+    Characters are scanned left to right; those found in ``PUNCTUATION``
+    are collected until the first alphabetic character is reached.
+
+    Args:
+        word: The token to inspect.
+
+    Returns:
+        A list of punctuation characters in their original order. The list
+        is empty when the word has no leading punctuation or contains no
+        alphabetic character at all.
+    """
+    punc = []
+    for char in word:
+        if char.isalpha():
+            return punc
+        if char in PUNCTUATION:
+            punc.append(char)
+    # No letter was found (e.g. '' or '3.5'). Return an empty list rather
+    # than falling through to an implicit None, so callers always get a
+    # list; it stays falsy, so truthiness checks behave the same.
+    return []
+
+def get_punc_after(word):
+    """Return the punctuation that follows the last letter of ``word``.
+
+    Characters are scanned right to left; those found in ``PUNCTUATION``
+    are collected until the first alphabetic character is reached.
+
+    Args:
+        word: The token to inspect.
+
+    Returns:
+        A list of punctuation characters in their original (left-to-right)
+        order. The list is empty when the word has no trailing punctuation
+        or contains no alphabetic character at all.
+    """
+    punc = []
+    for char in reversed(word):
+        if char.isalpha():
+            return punc
+        if char in PUNCTUATION:
+            # Prepend so the result keeps the word's left-to-right order.
+            punc.insert(0, char)
+    # No letter was found (e.g. '' or '3.5'). Return an empty list rather
+    # than falling through to an implicit None, so callers always get a
+    # list; it stays falsy, so truthiness checks behave the same.
+    return []
--- a/leland_transcript.json
+++ b/leland_transcript.json
--- a/models.py
+++ b/models.py
@@ -8,8 +8,8 @@ class TranscriptConverter:

    def __init__(self, path, format_name):
        """Load the transcript at ``path`` with the ``format_name`` converter.

        The converter is looked up in ``converters`` and its result is
        stored in ``self.words``.
        """
        self.path = path
-        with open(path, 'r') as fin:
-            self.words = converters[format_name](json.load(fin))
+        with open(path) as f:
+            self.words = converters[format_name](f)

    def to_json(self):
        """Return ``self.words`` serialized as JSON indented by 4 spaces."""
        return json.dumps(self.words, indent=4)
--- a/two_min.json
+++ b/two_min.json
--- a/two_min_processed.json
+++ b/two_min_processed.json