first
This commit is contained in:
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"python.pythonPath": "/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
|
||||
}
|
||||
BIN
__pycache__/converters.cpython-36.pyc
Normal file
BIN
__pycache__/converters.cpython-36.pyc
Normal file
Binary file not shown.
BIN
__pycache__/helpers.cpython-36.pyc
Normal file
BIN
__pycache__/helpers.cpython-36.pyc
Normal file
Binary file not shown.
BIN
__pycache__/models.cpython-36.pyc
Normal file
BIN
__pycache__/models.cpython-36.pyc
Normal file
Binary file not shown.
33
converters.py
Normal file
33
converters.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from decimal import Decimal
|
||||
from typing import Dict, Union, List
|
||||
|
||||
from helpers import tag_words, PROPER_NOUN_TAGS
|
||||
|
||||
|
||||
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
    """Convert a parsed transcript payload into a flat list of word records.

    ``data['words']`` is read as a sequence of dicts, each providing the keys
    ``name``, ``time``, ``duration`` and ``confidence``.

    Returns:
        A list with one dict per input word, in input order, holding
        ``wordStart``, ``wordEnd`` and ``confidence`` (as ``Decimal``),
        ``word``, ``space``, ``alwaysCapitalized`` and ``index``.
    """
    source_words = data['words']
    # Tag every word in a single call; the result is looked up by position below.
    tags = tag_words([entry['name'] for entry in source_words])

    records = []
    for position, entry in enumerate(source_words):
        start = Decimal(entry['time'])
        end = start + Decimal(entry['duration'])
        score = Decimal(entry['confidence'])
        text = entry['name']

        # A full stop gets an empty separator; every other token gets a space.
        if text == '.':
            separator = ''
        else:
            separator = ' '

        tagged_as_proper_noun = tags[position][1] in PROPER_NOUN_TAGS
        # The pronoun "I" is always capitalized, whatever the tagger says.
        always_capitalized = tagged_as_proper_noun or text == 'I'

        record = {
            'wordStart': start,
            'wordEnd': end,
            'confidence': score,
            'word': text,
            'space': separator,
            'alwaysCapitalized': always_capitalized,
            'index': position,
        }
        records.append(record)

    return records
|
||||
|
||||
|
||||
# Registry mapping a transcript format name to the function that converts it.
converters = dict(
    speechmatics=speechmatics_converter,
)
|
||||
17
helpers.py
Normal file
17
helpers.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from nltk.tag.stanford import StanfordNERTagger
|
||||
|
||||
# Filesystem locations passed to the Stanford NER tagger (model first, then jar).
_NER_MODEL_PATH = '/usr/local/bin/english.all.3class.distsim.crf.ser.gz'
_NER_JAR_PATH = '/usr/local/bin/stanford-ner.jar'

# Shared tagger instance, created once when this module is imported.
st = StanfordNERTagger(_NER_MODEL_PATH, _NER_JAR_PATH)


# Tags that mark a token as belonging to a proper noun.
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
||||
|
||||
|
||||
def tag_words(words):
    """Run the module-level tagger ``st`` over ``words`` and return its result.

    Callers in this project index each returned item with ``[1]`` to read
    the tag assigned to the corresponding word.
    """
    tagged = st.tag(words)
    return tagged
|
||||
|
||||
|
||||
def is_a_proper_noun(phrase):
    """Return True if any whitespace-separated token of ``phrase`` is tagged
    with one of ``PROPER_NOUN_TAGS``; otherwise return False.
    """
    tokens = phrase.split()
    for tagged in tag_words(tokens):
        # Element 1 of each tagged item is the tag.
        if tagged[1] in PROPER_NOUN_TAGS:
            return True
    return False
|
||||
27
models.py
Normal file
27
models.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from converters import converters
|
||||
|
||||
|
||||
class TranscriptConverter:
    """Load a transcript JSON file and convert it with a registered converter.

    After construction ``self.words`` holds whatever the converter registered
    under ``format_name`` returned. Each word record is expected to carry the
    keys ``wordStart``, ``wordEnd``, ``word``, ``confidence``, ``index``,
    ``space`` and ``alwaysCapitalized``.
    """

    def __init__(self, path, format_name):
        """Read the JSON file at ``path`` and convert its contents.

        Args:
            path: Location of the transcript JSON file to read.
            format_name: Key into the ``converters`` registry selecting the
                conversion function.

        Raises:
            OSError: If ``path`` cannot be opened.
            KeyError: If ``format_name`` is not a registered converter.
        """
        self.path = path
        with open(path, 'r') as fin:
            self.words = converters[format_name](json.load(fin))

    @staticmethod
    def _json_default(value):
        """Convert values that ``json`` cannot serialize natively.

        Converters store timings and confidences as ``Decimal``, which
        ``json.dumps`` rejects; they are emitted as JSON numbers instead.
        Any other unsupported type still raises ``TypeError``.
        """
        # Function-scope import so the module's import block need not change.
        from decimal import Decimal

        if isinstance(value, Decimal):
            return float(value)
        raise TypeError(
            f'Object of type {type(value).__name__} is not JSON serializable')

    def to_json(self):
        """Return ``self.words`` serialized as JSON text with 4-space indent."""
        return json.dumps(self.words, indent=4, default=self._json_default)

    def save(self):
        """Write the converted words to ``<input name>_processed.json``.

        The output name is derived from the base name of ``self.path`` (text
        before the first ``.json``) and is opened relative to the current
        working directory.
        """
        name = f"{os.path.basename(self.path).split('.json')[0]}_processed.json"
        with open(name, 'w') as fout:
            fout.write(self.to_json())
|
||||
Reference in New Issue
Block a user