This commit is contained in:
2018-10-12 18:49:54 -04:00
commit e47f7a04e8
8 changed files with 80 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

3
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,3 @@
{
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

33
converters.py Normal file
View File

@@ -0,0 +1,33 @@
from decimal import Decimal
from typing import Dict, Union, List
from helpers import tag_words, PROPER_NOUN_TAGS
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
    """Normalize a Speechmatics transcript payload into a list of word dicts.

    Each output dict carries Decimal timing/confidence values plus the keys
    word, space, alwaysCapitalized, and index.
    """
    entries = data['words']
    # Run the NER tagger once over the full token sequence; per-word calls
    # would lose sentence context (and be far slower).
    tags = tag_words([entry['name'] for entry in entries])
    converted = []
    for i, entry in enumerate(entries):
        token = entry['name']
        start = Decimal(entry['time'])
        converted.append({
            'wordStart': start,
            'wordEnd': start + Decimal(entry['duration']),
            'confidence': Decimal(entry['confidence']),
            'word': token,
            # A period attaches directly to the preceding word; everything
            # else is followed by a normal space.
            'space': '' if token == '.' else ' ',
            'alwaysCapitalized': tags[i][1] in PROPER_NOUN_TAGS or token == 'I',
            'index': i,
        })
    return converted
# Registry mapping a transcript format name (as passed to
# TranscriptConverter) to the callable that normalizes that format's
# parsed JSON into the common word-dict list.
converters = {
    'speechmatics': speechmatics_converter,
}

17
helpers.py Normal file
View File

@@ -0,0 +1,17 @@
from nltk.tag.stanford import StanfordNERTagger
# Module-level Stanford NER tagger shared by the helpers below.
# NOTE(review): the classifier and jar paths are hard-coded absolute
# paths — this only works on a machine laid out exactly like the
# author's; consider making them configurable (env var / settings).
st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
                       '/usr/local/bin/stanford-ner.jar')
# NER labels we treat as proper nouns for capitalization purposes.
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
def tag_words(words):
    """Run the shared Stanford NER tagger over *words*.

    Returns the tagger's (token, label) pairs, one per input word.
    """
    tagged = st.tag(words)
    return tagged
def is_a_proper_noun(phrase):
    """Return True if any whitespace-separated token in *phrase* gets a
    proper-noun NER label (ORGANIZATION / PERSON / LOCATION)."""
    for tagged in tag_words(phrase.split()):
        if tagged[1] in PROPER_NOUN_TAGS:
            return True
    return False

27
models.py Normal file
View File

@@ -0,0 +1,27 @@
import json
import os
from decimal import Decimal

from converters import converters
class TranscriptConverter:
    """Loads a transcript JSON file and normalizes it via the converter
    registered for the given format name.

    After construction, ``self.words`` is a list of dicts with the keys:
    wordStart, wordEnd, word, confidence, index, space, alwaysCapitalized
    (timing/confidence values are Decimal — see converters.py).
    """

    def __init__(self, path, format_name):
        self.path = path
        # Raises KeyError for an unregistered format_name.
        with open(path, 'r') as fin:
            self.words = converters[format_name](json.load(fin))

    @staticmethod
    def _jsonify(value):
        # The converters emit Decimal for timings/confidence, which
        # json.dumps cannot serialize natively (it raised TypeError here
        # before this hook). Render Decimals as strings to keep their
        # exact precision; anything else is still a genuine error.
        if isinstance(value, Decimal):
            return str(value)
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    def to_json(self):
        """Return the normalized words as a pretty-printed JSON string."""
        return json.dumps(self.words, indent=4, default=self._jsonify)

    def save(self):
        """Write the converted transcript to <input-basename>_processed.json.

        NOTE(review): the file lands in the current working directory, not
        alongside the input file — confirm that is intended.
        """
        name = f"{os.path.basename(self.path).split('.json')[0]}_processed.json"
        with open(name, 'w') as fout:
            fout.write(self.to_json())