commit e47f7a04e81210a7423a03ec068a7b13f28fab43
Author: zevav <zev@averba.ch>
Date:   Fri Oct 12 18:49:54 2018 -0400

    first

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/.DS_Store differ
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..bd53818
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
+}
\ No newline at end of file
diff --git a/__pycache__/converters.cpython-36.pyc b/__pycache__/converters.cpython-36.pyc
new file mode 100644
index 0000000..fc0de87
Binary files /dev/null and b/__pycache__/converters.cpython-36.pyc differ
diff --git a/__pycache__/helpers.cpython-36.pyc b/__pycache__/helpers.cpython-36.pyc
new file mode 100644
index 0000000..942b999
Binary files /dev/null and b/__pycache__/helpers.cpython-36.pyc differ
diff --git a/__pycache__/models.cpython-36.pyc b/__pycache__/models.cpython-36.pyc
new file mode 100644
index 0000000..7deb95c
Binary files /dev/null and b/__pycache__/models.cpython-36.pyc differ
diff --git a/converters.py b/converters.py
new file mode 100644
index 0000000..91b1f2e
--- /dev/null
+++ b/converters.py
@@ -0,0 +1,57 @@
+from decimal import Decimal
+from typing import Dict, Union, List
+
+from helpers import tag_words, PROPER_NOUN_TAGS
+
+
+def speechmatics_converter(
+        data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]
+) -> List[Dict[str, Union[Decimal, str, bool, int]]]:
+    """Convert parsed Speechmatics transcript data into word records.
+
+    Args:
+        data: Parsed transcript JSON. Only ``data['words']`` is read; each
+            entry must provide ``name``, ``time``, ``duration`` and
+            ``confidence``.
+
+    Returns:
+        One dict per input word, in input order, with the keys
+        ``wordStart``, ``wordEnd``, ``confidence``, ``word``, ``space``,
+        ``alwaysCapitalized`` and ``index``. ``wordStart``, ``wordEnd`` and
+        ``confidence`` are ``Decimal`` values. An empty word list yields an
+        empty list without invoking the tagger.
+    """
+    words = data['words']
+    if not words:
+        # Nothing to tag, so skip the tagger call entirely.
+        return []
+
+    # NOTE(review): assumes tag_words returns one (token, tag) pair per
+    # input word, in the same order -- confirm against the tagger.
+    tagged_words = tag_words([w['name'] for w in words])
+
+    converted_words = []
+    for index, w in enumerate(words):
+        word_start = Decimal(w['time'])
+        word_end = word_start + Decimal(w['duration'])
+        confidence = Decimal(w['confidence'])
+        word = w['name']
+        is_proper_noun = tagged_words[index][1] in PROPER_NOUN_TAGS
+        converted_words.append({
+            'wordStart': word_start,
+            'wordEnd': word_end,
+            'confidence': confidence,
+            'word': word,
+            # No separator for a '.' token; a single space otherwise.
+            'space': '' if word == '.' else ' ',
+            # 'I' is always flagged, whatever tag it received.
+            'alwaysCapitalized': is_proper_noun or word == 'I',
+            'index': index,
+        })
+    return converted_words
+
+
+# Maps a transcript format name to its converter function.
+converters = {
+    'speechmatics': speechmatics_converter,
+}
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..4ed4388
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,30 @@
+# Named-entity tagging helpers built on NLTK's StanfordNERTagger.
+from nltk.tag.stanford import StanfordNERTagger
+
+# Created once at import time; both paths are fixed absolute locations.
+st = StanfordNERTagger(
+    '/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
+    '/usr/local/bin/stanford-ner.jar',
+)
+
+
+# Tags that make a token count as a proper noun.
+PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
+
+
+def tag_words(words):
+    """Run the module-level tagger over ``words`` and return its output."""
+    tagged = st.tag(words)
+    return tagged
+
+
+def is_a_proper_noun(phrase):
+    """Report whether any token of ``phrase`` is tagged as a proper noun.
+
+    ``phrase`` is split on whitespace, the tokens are tagged, and the result
+    is True as soon as one tag appears in ``PROPER_NOUN_TAGS``.
+    """
+    for tagged_token in tag_words(phrase.split()):
+        if tagged_token[1] in PROPER_NOUN_TAGS:
+            return True
+    return False
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..bda7729
--- /dev/null
+++ b/models.py
@@ -0,0 +1,64 @@
+import json
+import os
+from decimal import Decimal
+
+from converters import converters
+
+
+def _json_default(value):
+    """Encode values the ``json`` module cannot serialize by itself.
+
+    ``Decimal`` becomes a float; any other type raises ``TypeError``, which
+    is what ``json.dumps`` expects from a ``default`` hook.
+    """
+    if isinstance(value, Decimal):
+        return float(value)
+    raise TypeError(
+        f'Object of type {type(value).__name__} is not JSON serializable')
+
+
+class TranscriptConverter:
+    """Load a JSON transcript and convert it with a registered converter.
+
+    ``words`` holds the converter's output. Each item is expected to carry
+    the keys wordStart, wordEnd, word, confidence, index, space and
+    alwaysCapitalized.
+    """
+
+    def __init__(self, path, format_name):
+        """Read ``path`` as JSON and convert it using ``format_name``.
+
+        Args:
+            path: Location of the JSON transcript file.
+            format_name: Key into ``converters`` selecting the converter.
+
+        Raises:
+            KeyError: If ``format_name`` is not a key of ``converters``.
+        """
+        self.path = path
+        with open(path, 'r', encoding='utf-8') as fin:
+            data = json.load(fin)
+        # The file is closed before conversion, which only needs the data.
+        self.words = converters[format_name](data)
+
+    def to_json(self):
+        """Return ``self.words`` as a JSON string indented by four spaces.
+
+        ``Decimal`` values are written as JSON numbers; without the
+        ``default`` hook ``json.dumps`` raises ``TypeError`` on them.
+        """
+        return json.dumps(self.words, indent=4, default=_json_default)
+
+    def save(self):
+        """Write ``to_json()`` to ``<name>_processed.json``.
+
+        ``<name>`` is the base name of ``self.path`` up to its first
+        ``.json``; the output path is relative, so the file lands in the
+        current working directory.
+        """
+        source_name = os.path.basename(self.path)
+        name = f"{source_name.split('.json')[0]}_processed.json"
+        # Serialize first so a failure cannot leave a truncated file behind.
+        payload = self.to_json()
+        with open(name, 'w', encoding='utf-8') as fout:
+            fout.write(payload)