Created a converter for Speechmatics word timing
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,27 +2,30 @@
|
|||||||
|
|
||||||
fields for converted transcript:
|
fields for converted transcript:
|
||||||
|
|
||||||
wordStart
|
start
|
||||||
wordEnd
|
end
|
||||||
word
|
word
|
||||||
confidence
|
confidence
|
||||||
index
|
index
|
||||||
alwaysCapitalized
|
always_capitalized
|
||||||
puncBefore
|
punc_before
|
||||||
puncAfter
|
punc_after
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
|
import json
|
||||||
from typing import Dict, Union, List
|
from typing import Dict, Union, List
|
||||||
|
|
||||||
from helpers import tag_words, PROPER_NOUN_TAGS
|
import helpers
|
||||||
|
|
||||||
|
|
||||||
def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]):
|
def speechmatics_converter(data: dict):
|
||||||
|
data = json.load(data)
|
||||||
converted_words = []
|
converted_words = []
|
||||||
words = data['words']
|
words = data['words']
|
||||||
tagged_words = tag_words([w['name'] for w in words])
|
tagged_words = helpers.tag_words([w['name'] for w in words])
|
||||||
punc_before = False
|
punc_before = False
|
||||||
punc_after = False
|
punc_after = False
|
||||||
num_words = len(words)
|
num_words = len(words)
|
||||||
@@ -35,7 +38,7 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
word = w['name']
|
word = w['name']
|
||||||
if word == '.':
|
if word == '.':
|
||||||
continue
|
continue
|
||||||
is_proper_noun = tagged_words[i][1] in PROPER_NOUN_TAGS
|
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
|
||||||
|
|
||||||
next_word = None
|
next_word = None
|
||||||
if i < num_words - 1:
|
if i < num_words - 1:
|
||||||
@@ -44,14 +47,14 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
punc_after = '.'
|
punc_after = '.'
|
||||||
|
|
||||||
converted_words.append({
|
converted_words.append({
|
||||||
'wordStart': word_start,
|
'start': word_start,
|
||||||
'wordEnd': word_end,
|
'end': word_end,
|
||||||
'confidence': confidence,
|
'confidence': confidence,
|
||||||
'word': word,
|
'word': word,
|
||||||
'alwaysCapitalized': is_proper_noun or word == 'I',
|
'always_capitalized': is_proper_noun or word == 'I',
|
||||||
'index': index,
|
'index': index,
|
||||||
'puncAfter': punc_after,
|
'punc_after': punc_after,
|
||||||
'puncBefore': punc_before,
|
'punc_before': punc_before,
|
||||||
})
|
})
|
||||||
|
|
||||||
index += 1
|
index += 1
|
||||||
@@ -60,6 +63,69 @@ def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str,
|
|||||||
return converted_words
|
return converted_words
|
||||||
|
|
||||||
|
|
||||||
|
def speechmatics_aligned_text_converter(data):
|
||||||
|
data = data.readlines()[0]
|
||||||
|
Word = namedtuple('Word', 'start end word')
|
||||||
|
|
||||||
|
class Exhausted(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_time(transcript, index):
|
||||||
|
time_index = transcript.find('time=', index)
|
||||||
|
if time_index == -1:
|
||||||
|
raise Exhausted
|
||||||
|
close_index = transcript.find('>', time_index)
|
||||||
|
return float(transcript[time_index + 5: close_index]), close_index
|
||||||
|
|
||||||
|
def find_next_word(transcript, start_index):
|
||||||
|
start, end_of_start_index = get_time(transcript, start_index)
|
||||||
|
|
||||||
|
word_start_index = end_of_start_index + 1
|
||||||
|
word_end_index = transcript.find('<', word_start_index)
|
||||||
|
word = transcript[word_start_index: word_end_index]
|
||||||
|
|
||||||
|
end, close_index = get_time(transcript, word_end_index)
|
||||||
|
|
||||||
|
return Word(start, end, word), close_index
|
||||||
|
|
||||||
|
words = []
|
||||||
|
next_index = 0
|
||||||
|
word = None
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
word, next_index = find_next_word(data, next_index)
|
||||||
|
except Exhausted:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
words.append(word)
|
||||||
|
|
||||||
|
tagged_words = helpers.tag_words([w.word for w in words])
|
||||||
|
converted_words = []
|
||||||
|
|
||||||
|
for i, word in enumerate(words):
|
||||||
|
is_proper_noun = tagged_words[i][1] in helpers.PROPER_NOUN_TAGS
|
||||||
|
punc_before = helpers.get_punc_before(word.word)
|
||||||
|
punc_after = helpers.get_punc_after(word.word)
|
||||||
|
the_word = word.word
|
||||||
|
if punc_before or punc_after:
|
||||||
|
for p in helpers.PUNCTUATION:
|
||||||
|
the_word = the_word.replace(p, '')
|
||||||
|
converted_words.append({
|
||||||
|
'start': word.start,
|
||||||
|
'end': word.end,
|
||||||
|
'confidence': 1,
|
||||||
|
'word': the_word,
|
||||||
|
'always_capitalized': is_proper_noun or word == 'I',
|
||||||
|
'index': i,
|
||||||
|
'punc_before': punc_before,
|
||||||
|
'punc_after': punc_after,
|
||||||
|
})
|
||||||
|
|
||||||
|
return converted_words
|
||||||
|
|
||||||
|
|
||||||
converters = {
|
converters = {
|
||||||
'speechmatics': speechmatics_converter,
|
'speechmatics': speechmatics_converter,
|
||||||
|
'speechmatics_align': speechmatics_aligned_text_converter,
|
||||||
}
|
}
|
||||||
|
|||||||
43845
fifty_min.json
Normal file
43845
fifty_min.json
Normal file
File diff suppressed because it is too large
Load Diff
68542
fifty_min_processed.json
Normal file
68542
fifty_min_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
20
helpers.py
20
helpers.py
@@ -6,6 +6,8 @@ st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
|
|||||||
|
|
||||||
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
||||||
|
|
||||||
|
PUNCTUATION = ['.', '?', ',', ':', '"', '!']
|
||||||
|
|
||||||
|
|
||||||
def tag_words(words):
|
def tag_words(words):
|
||||||
return st.tag(words)
|
return st.tag(words)
|
||||||
@@ -15,3 +17,21 @@ def is_a_proper_noun(phrase):
|
|||||||
tagged_words = tag_words(phrase.split())
|
tagged_words = tag_words(phrase.split())
|
||||||
return any(tagged_word[1] in PROPER_NOUN_TAGS
|
return any(tagged_word[1] in PROPER_NOUN_TAGS
|
||||||
for tagged_word in tagged_words)
|
for tagged_word in tagged_words)
|
||||||
|
|
||||||
|
|
||||||
|
def get_punc_before(word):
|
||||||
|
punc = []
|
||||||
|
for char in word:
|
||||||
|
if char.isalpha():
|
||||||
|
return punc
|
||||||
|
if char in PUNCTUATION:
|
||||||
|
punc.append(char)
|
||||||
|
|
||||||
|
|
||||||
|
def get_punc_after(word):
|
||||||
|
punc = []
|
||||||
|
for char in reversed(word):
|
||||||
|
if char.isalpha():
|
||||||
|
return punc
|
||||||
|
if char in PUNCTUATION:
|
||||||
|
punc.insert(0, char)
|
||||||
2589
leland_transcript.json
Normal file
2589
leland_transcript.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,8 +8,8 @@ class TranscriptConverter:
|
|||||||
|
|
||||||
def __init__(self, path, format_name):
|
def __init__(self, path, format_name):
|
||||||
self.path = path
|
self.path = path
|
||||||
with open(path, 'r') as fin:
|
with open(path) as f:
|
||||||
self.words = converters[format_name](json.load(fin))
|
self.words = converters[format_name](f)
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
return json.dumps(self.words, indent=4)
|
return json.dumps(self.words, indent=4)
|
||||||
|
|||||||
2115
two_min.json
Normal file
2115
two_min.json
Normal file
File diff suppressed because it is too large
Load Diff
3332
two_min_processed.json
Normal file
3332
two_min_processed.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user