commit e47f7a04e81210a7423a03ec068a7b13f28fab43
Author: zevav
Date:   Fri Oct 12 18:49:54 2018 -0400

    first

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/.DS_Store differ
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..bd53818
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
+}
\ No newline at end of file
diff --git a/__pycache__/converters.cpython-36.pyc b/__pycache__/converters.cpython-36.pyc
new file mode 100644
index 0000000..fc0de87
Binary files /dev/null and b/__pycache__/converters.cpython-36.pyc differ
diff --git a/__pycache__/helpers.cpython-36.pyc b/__pycache__/helpers.cpython-36.pyc
new file mode 100644
index 0000000..942b999
Binary files /dev/null and b/__pycache__/helpers.cpython-36.pyc differ
diff --git a/__pycache__/models.cpython-36.pyc b/__pycache__/models.cpython-36.pyc
new file mode 100644
index 0000000..7deb95c
Binary files /dev/null and b/__pycache__/models.cpython-36.pyc differ
diff --git a/converters.py b/converters.py
new file mode 100644
index 0000000..91b1f2e
--- /dev/null
+++ b/converters.py
@@ -0,0 +1,44 @@
+from decimal import Decimal
+from typing import Dict, Union, List
+
+from helpers import tag_words, PROPER_NOUN_TAGS
+
+
+def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
+    """Convert a parsed Speechmatics transcript into a list of word dicts.
+
+    Each entry of `data['words']` must provide 'name', 'time', 'duration'
+    and 'confidence'. The result holds one dict per word with the keys
+    wordStart, wordEnd, confidence, word, space, alwaysCapitalized and
+    index; the three numeric values are Decimal instances.
+    """
+    words = data['words']
+    # Tag all words in a single call; tags are then looked up by position.
+    tagged_words = tag_words([w['name'] for w in words])
+
+    converted_words = []
+    for index, w in enumerate(words):
+        # Going through str() keeps a value that was parsed as a JSON number
+        # (a float) from dragging its binary rounding error into Decimal.
+        word_start = Decimal(str(w['time']))
+        word_end = word_start + Decimal(str(w['duration']))
+        confidence = Decimal(str(w['confidence']))
+        word = w['name']
+        # A full stop gets no space; every other word gets one.
+        space = '' if word == '.' else ' '
+        is_proper_noun = tagged_words[index][1] in PROPER_NOUN_TAGS
+        converted_words.append({
+            'wordStart': word_start,
+            'wordEnd': word_end,
+            'confidence': confidence,
+            'word': word,
+            'space': space,
+            'alwaysCapitalized': is_proper_noun or word == 'I',
+            'index': index,
+        })
+    return converted_words
+
+
+converters = {
+    'speechmatics': speechmatics_converter,
+}
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000..4ed4388
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,24 @@
+from nltk.tag.stanford import StanfordNERTagger
+
+# Shared tagger instance, created once when this module is imported.
+st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
+                       '/usr/local/bin/stanford-ner.jar')
+
+
+# Entity tags that mark a word as a proper noun.
+PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
+
+
+def tag_words(words):
+    """Run the shared tagger over `words` and return its tagged output."""
+    if not words:
+        # Nothing to tag, so skip the tagger call entirely.
+        return []
+    return st.tag(words)
+
+
+def is_a_proper_noun(phrase):
+    """Return True if any word in `phrase` carries a proper-noun tag."""
+    tagged_words = tag_words(phrase.split())
+    return any(tagged_word[1] in PROPER_NOUN_TAGS
+               for tagged_word in tagged_words)
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..bda7729
--- /dev/null
+++ b/models.py
@@ -0,0 +1,47 @@
+import json
+import os
+from decimal import Decimal
+
+from converters import converters
+
+
+def _json_default(value):
+    """Convert Decimal values to float for json.dumps().
+
+    json.dumps() calls this hook for objects it cannot encode itself; any
+    type other than Decimal is still rejected with TypeError.
+    """
+    if isinstance(value, Decimal):
+        return float(value)
+    raise TypeError(
+        f'Object of type {type(value).__name__} is not JSON serializable')
+
+
+class TranscriptConverter:
+    """Load a transcript JSON file and convert it with a named converter."""
+
+    def __init__(self, path, format_name):
+        """Read `path` and convert it with `converters[format_name]`."""
+        self.path = path
+        with open(path, 'r', encoding='utf-8') as fin:
+            self.words = converters[format_name](json.load(fin))
+        # Keys of each entry in self.words:
+        # wordStart
+        # wordEnd
+        # word
+        # confidence
+        # index
+        # space
+        # alwaysCapitalized
+
+    def to_json(self):
+        """Return the converted words as a JSON string indented by 4."""
+        # The word dicts hold Decimal values, which json.dumps() rejects
+        # unless a default hook is supplied.
+        return json.dumps(self.words, indent=4, default=_json_default)
+
+    def save(self):
+        """Write the JSON to `<input basename>_processed.json` in the cwd."""
+        name = f"{os.path.basename(self.path).split('.json')[0]}_processed.json"
+        with open(name, 'w', encoding='utf-8') as fout:
+            fout.write(self.to_json())