created converter for speechmatics timing

This commit is contained in:
2018-11-22 03:56:03 -05:00
parent 4da2317db8
commit d5a37df5a8
12 changed files with 120525 additions and 16 deletions

BIN
.DS_Store vendored

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -2,27 +2,30 @@
fields for converted transcript: fields for converted transcript:
wordStart start
wordEnd end
word word
confidence confidence
index index
alwaysCapitalized always_capitalized
puncBefore punc_before
puncAfter punc_after
""" """
from collections import namedtuple
from decimal import Decimal from decimal import Decimal
import json
from typing import Dict, Union, List from typing import Dict, Union, List
from helpers import tag_words, PROPER_NOUN_TAGS import helpers
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]): def speechmatics_converter(data: dict):
data = json.load(data)
converted_words = [] converted_words = []
words = data['words'] words = data['words']
tagged_words = tag_words([w['name'] for w in words]) tagged_words = helpers.tag_words([w['name'] for w in words])
punc_before = False punc_before = False
punc_after = False punc_after = False
num_words = len(words) num_words = len(words)
@@ -35,7 +38,7 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
word = w['name'] word = w['name']
if word == '.': if word == '.':
continue continue
is_proper_noun = tagged_words[i][1] in PROPER_NOUN_TAGS is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
next_word = None next_word = None
if i < num_words - 1: if i < num_words - 1:
@@ -44,14 +47,14 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
punc_after = '.' punc_after = '.'
converted_words.append({ converted_words.append({
'wordStart': word_start, 'start': word_start,
'wordEnd': word_end, 'end': word_end,
'confidence': confidence, 'confidence': confidence,
'word': word, 'word': word,
'alwaysCapitalized': is_proper_noun or word == 'I', 'always_capitalized': is_proper_noun or word == 'I',
'index': index, 'index': index,
'puncAfter': punc_after, 'punc_after': punc_after,
'puncBefore': punc_before, 'punc_before': punc_before,
}) })
index += 1 index += 1
@@ -60,6 +63,69 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
return converted_words return converted_words
def speechmatics_aligned_text_converter(data):
    """Convert an aligned-text transcript into a list of word dictionaries.

    Only the first line of ``data`` is parsed. The parser repeatedly looks
    for a ``time=`` marker, reads the number between it and the next ``>``
    as the word's start time, takes the text up to the next ``<`` as the
    word, and reads the following ``time=`` marker as the word's end time.
    (Presumably the input looks like ``<time=0.5>hello<time=0.9>``; confirm
    against a real Speechmatics alignment file.)

    Args:
        data: An open text file object positioned at the start of the
            transcript.

    Returns:
        A list of dicts, one per parsed word, with the keys ``start``,
        ``end``, ``confidence`` (always 1), ``word`` (with punctuation
        removed when any leading/trailing punctuation was detected),
        ``always_capitalized``, ``index``, ``punc_before`` and
        ``punc_after``. An empty list is returned when no word is found.
    """
    Word = namedtuple('Word', 'start end word')

    class Exhausted(Exception):
        """Signals that the transcript holds no further ``time=`` marker."""

    def get_time(transcript, index):
        """Return (time, index of the closing '>') for the next marker."""
        time_index = transcript.find('time=', index)
        if time_index == -1:
            raise Exhausted
        close_index = transcript.find('>', time_index)
        # len('time=') == 5: skip the marker name to reach the number.
        return float(transcript[time_index + 5: close_index]), close_index

    def find_next_word(transcript, start_index):
        """Return (Word, index of the '>' that closes the word's end marker)."""
        start, end_of_start_index = get_time(transcript, start_index)
        word_start_index = end_of_start_index + 1
        word_end_index = transcript.find('<', word_start_index)
        text = transcript[word_start_index: word_end_index]
        end, close_index = get_time(transcript, word_end_index)
        return Word(start, end, text), close_index

    # readline() returns '' for an empty file, whereas readlines()[0]
    # raised IndexError; for non-empty files both yield the first line.
    transcript = data.readline()

    words = []
    next_index = 0
    while True:
        try:
            word, next_index = find_next_word(transcript, next_index)
        except Exhausted:
            break
        words.append(word)

    # Nothing parsed: skip the tagger call entirely.
    if not words:
        return []

    tagged_words = helpers.tag_words([w.word for w in words])

    converted_words = []
    for i, word in enumerate(words):
        is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
        punc_before = helpers.get_punc_before(word.word)
        punc_after = helpers.get_punc_after(word.word)

        the_word = word.word
        if punc_before or punc_after:
            for p in helpers.PUNCTUATION:
                the_word = the_word.replace(p, '')

        converted_words.append({
            'start': word.start,
            'end': word.end,
            'confidence': 1,
            'word': the_word,
            # Compare the cleaned text, not the Word tuple: the tuple never
            # equals 'I', so the pronoun was never marked as capitalised.
            'always_capitalized': is_proper_noun or the_word == 'I',
            'index': i,
            'punc_before': punc_before,
            'punc_after': punc_after,
        })

    return converted_words
converters = { converters = {
'speechmatics': speechmatics_converter, 'speechmatics': speechmatics_converter,
'speechmatics_align': speechmatics_aligned_text_converter,
} }

43845
fifty_min.json Normal file

File diff suppressed because it is too large Load Diff

68542
fifty_min_processed.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -6,6 +6,8 @@ st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION'] PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
PUNCTUATION = ['.', '?', ',', ':', '"', '!']
def tag_words(words): def tag_words(words):
return st.tag(words) return st.tag(words)
@@ -15,3 +17,21 @@ def is_a_proper_noun(phrase):
tagged_words = tag_words(phrase.split()) tagged_words = tag_words(phrase.split())
return any(tagged_word[1] in PROPER_NOUN_TAGS return any(tagged_word[1] in PROPER_NOUN_TAGS
for tagged_word in tagged_words) for tagged_word in tagged_words)
def get_punc_before(word: str) -> list:
    """Return the punctuation marks that precede the body of ``word``.

    Characters are scanned from the left until the first letter or digit;
    every scanned character found in ``PUNCTUATION`` is collected in order.

    Digits end the scan as well as letters, so a numeric token such as
    ``3.5`` does not report its interior ``.`` as leading punctuation.

    Args:
        word: A single transcript token.

    Returns:
        A list of punctuation characters, possibly empty. A list is always
        returned, including for tokens with no letter or digit (such as the
        empty string), where falling off the loop previously gave ``None``.
    """
    punc = []
    for char in word:
        if char.isalnum():
            break
        if char in PUNCTUATION:
            punc.append(char)
    return punc
def get_punc_after(word: str) -> list:
    """Return the punctuation marks that follow the body of ``word``.

    Characters are scanned from the right until the first letter or digit;
    every scanned character found in ``PUNCTUATION`` is collected and the
    result is returned in left-to-right order.

    Digits end the scan as well as letters, so a numeric token such as
    ``2018,`` keeps its trailing comma as punctuation instead of the scan
    running through the digits and off the start of the token.

    Args:
        word: A single transcript token.

    Returns:
        A list of punctuation characters, possibly empty. A list is always
        returned, including for tokens with no letter or digit (such as the
        empty string), where falling off the loop previously gave ``None``.
    """
    punc = []
    for char in reversed(word):
        if char.isalnum():
            break
        if char in PUNCTUATION:
            punc.append(char)
    # Collected right-to-left; flip to match the order in the token.
    punc.reverse()
    return punc

2589
leland_transcript.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -8,8 +8,8 @@ class TranscriptConverter:
def __init__(self, path, format_name): def __init__(self, path, format_name):
self.path = path self.path = path
with open(path, 'r') as fin: with open(path) as f:
self.words = converters[format_name](json.load(fin)) self.words = converters[format_name](f)
def to_json(self): def to_json(self):
return json.dumps(self.words, indent=4) return json.dumps(self.words, indent=4)

2115
two_min.json Normal file

File diff suppressed because it is too large Load Diff

3332
two_min_processed.json Normal file

File diff suppressed because it is too large Load Diff