This commit is contained in:
2018-10-12 18:49:54 -04:00
commit e47f7a04e8
8 changed files with 80 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

3
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,3 @@
{
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

33
converters.py Normal file
View File

@@ -0,0 +1,33 @@
from decimal import Decimal
from typing import Dict, Union, List
from helpers import tag_words, PROPER_NOUN_TAGS
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
    """Normalize a Speechmatics transcript payload into a list of word dicts.

    Each output dict carries Decimal timing/confidence values plus the keys
    word, space, alwaysCapitalized, and index.
    """
    entries = data['words']
    # Run the NER tagger once over the full token sequence; per-word calls
    # would lose sentence context (and be far slower).
    tags = tag_words([entry['name'] for entry in entries])
    converted = []
    for i, entry in enumerate(entries):
        token = entry['name']
        start = Decimal(entry['time'])
        converted.append({
            'wordStart': start,
            'wordEnd': start + Decimal(entry['duration']),
            'confidence': Decimal(entry['confidence']),
            'word': token,
            # A period attaches directly to the preceding word; everything
            # else is followed by a normal space.
            'space': '' if token == '.' else ' ',
            'alwaysCapitalized': tags[i][1] in PROPER_NOUN_TAGS or token == 'I',
            'index': i,
        })
    return converted
# Registry mapping a transcript format name (as passed to
# TranscriptConverter) to the callable that normalizes that format's
# parsed JSON into the common word-dict list.
converters = {
    'speechmatics': speechmatics_converter,
}

17
helpers.py Normal file
View File

@@ -0,0 +1,17 @@
from nltk.tag.stanford import StanfordNERTagger
# Module-level Stanford NER tagger shared by the helpers below.
# NOTE(review): the classifier and jar paths are hard-coded absolute
# paths — this only works on a machine laid out exactly like the
# author's; consider making them configurable (env var / settings).
st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
                       '/usr/local/bin/stanford-ner.jar')
# NER labels we treat as proper nouns for capitalization purposes.
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
def tag_words(words):
    """Run the shared Stanford NER tagger over *words*.

    Returns the tagger's (token, label) pairs, one per input word.
    """
    tagged = st.tag(words)
    return tagged
def is_a_proper_noun(phrase):
    """Return True if any whitespace-separated token in *phrase* gets a
    proper-noun NER label (ORGANIZATION / PERSON / LOCATION)."""
    for tagged in tag_words(phrase.split()):
        if tagged[1] in PROPER_NOUN_TAGS:
            return True
    return False

27
models.py Normal file
View File

@@ -0,0 +1,27 @@
import json
import os
from decimal import Decimal

from converters import converters
class TranscriptConverter:
    """Loads a transcript JSON file and normalizes it via the converter
    registered for the given format name.

    After construction, ``self.words`` is a list of dicts with the keys:
    wordStart, wordEnd, word, confidence, index, space, alwaysCapitalized
    (timing/confidence values are Decimal — see converters.py).
    """

    def __init__(self, path, format_name):
        self.path = path
        # Raises KeyError for an unregistered format_name.
        with open(path, 'r') as fin:
            self.words = converters[format_name](json.load(fin))

    @staticmethod
    def _jsonify(value):
        # The converters emit Decimal for timings/confidence, which
        # json.dumps cannot serialize natively (it raised TypeError here
        # before this hook). Render Decimals as strings to keep their
        # exact precision; anything else is still a genuine error.
        if isinstance(value, Decimal):
            return str(value)
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    def to_json(self):
        """Return the normalized words as a pretty-printed JSON string."""
        return json.dumps(self.words, indent=4, default=self._jsonify)

    def save(self):
        """Write the converted transcript to <input-basename>_processed.json.

        NOTE(review): the file lands in the current working directory, not
        alongside the input file — confirm that is intended.
        """
        name = f"{os.path.basename(self.path).split('.json')[0]}_processed.json"
        with open(name, 'w') as fout:
            fout.write(self.to_json())