Made the Google converter work again after a change in the formatting of saved transcripts. Added speaker_id support to the base class and the Google converter.
This commit is contained in:
@@ -53,3 +53,4 @@ formats.
|
|||||||
- Word (`.doc`, `.docx`)
|
- Word (`.doc`, `.docx`)
|
||||||
- text files
|
- text files
|
||||||
- SRT (subtitles)
|
- SRT (subtitles)
|
||||||
|
- Draft.js JSON
|
||||||
|
|||||||
@@ -111,3 +111,4 @@ formats.
|
|||||||
- Word (`.doc`, `.docx`)
|
- Word (`.doc`, `.docx`)
|
||||||
- text files
|
- text files
|
||||||
- SRT (subtitles)
|
- SRT (subtitles)
|
||||||
|
- Draft.js JSON
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -6,7 +6,7 @@ with open('README_PYPI.md') as file:
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="tpro",
|
name="tpro",
|
||||||
version="0.11",
|
version="0.12",
|
||||||
url='https://github.com/zevaverbach/tpro',
|
url='https://github.com/zevaverbach/tpro',
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'Click',
|
'Click',
|
||||||
|
|||||||
@@ -79,13 +79,14 @@ def test_google():
|
|||||||
g = GoogleConverter(transcript_data)
|
g = GoogleConverter(transcript_data)
|
||||||
|
|
||||||
g.convert()
|
g.convert()
|
||||||
|
print(g.converted_words[0])
|
||||||
assert g.converted_words[0] == {
|
assert g.converted_words[0] == {
|
||||||
'start': 4,
|
'start': 0.4,
|
||||||
'end': 5.5,
|
'end': 2.1,
|
||||||
'confidence': 0.88,
|
'confidence': 0.9128385782241821,
|
||||||
'word': 'Testing',
|
'word': 'Okay',
|
||||||
'always_capitalized': False,
|
'always_capitalized': False,
|
||||||
'punc_after': [','],
|
'punc_after': [','],
|
||||||
'punc_before': False,
|
'punc_before': False,
|
||||||
|
'speaker_id': None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ def test_pre_process(transcript):
|
|||||||
|
|
||||||
g = GoogleConverter(transcript_data)
|
g = GoogleConverter(transcript_data)
|
||||||
assert g.json_data
|
assert g.json_data
|
||||||
|
print(g.json_data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,9 @@ from . import converters
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
Word = namedtuple('Word', 'start end confidence word always_capitalized next_word')
|
Word = namedtuple(
|
||||||
|
'Word',
|
||||||
|
'start end confidence word always_capitalized next_word speaker_id')
|
||||||
|
|
||||||
|
|
||||||
class TranscriptConverter:
|
class TranscriptConverter:
|
||||||
@@ -58,6 +60,11 @@ class TranscriptConverter:
|
|||||||
def get_word_confidence(word_object):
|
def get_word_confidence(word_object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
@abc.abstractmethod
|
||||||
|
def get_speaker_id(word_object):
|
||||||
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def get_word_word(word_object):
|
def get_word_word(word_object):
|
||||||
@@ -78,7 +85,8 @@ class TranscriptConverter:
|
|||||||
self.get_word_confidence(word_object),
|
self.get_word_confidence(word_object),
|
||||||
word,
|
word,
|
||||||
self.check_if_always_capitalized(word, index, tagged_words),
|
self.check_if_always_capitalized(word, index, tagged_words),
|
||||||
self.get_next_word(word_objects, index)
|
self.get_next_word(word_objects, index),
|
||||||
|
self.get_speaker_id(word_object),
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_next_word(self, word_objects, index):
|
def get_next_word(self, word_objects, index):
|
||||||
|
|||||||
@@ -35,6 +35,10 @@ class AmazonConverter(TranscriptConverter):
|
|||||||
word_word = 'I'
|
word_word = 'I'
|
||||||
return word_word
|
return word_word
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_speaker_id(word_object):
|
||||||
|
return None
|
||||||
|
|
||||||
def convert_words(self, word_objects, words, tagged_words=None):
|
def convert_words(self, word_objects, words, tagged_words=None):
|
||||||
converted_words = []
|
converted_words = []
|
||||||
|
|
||||||
@@ -74,6 +78,7 @@ class AmazonConverter(TranscriptConverter):
|
|||||||
tagged_words),
|
tagged_words),
|
||||||
'punc_after': punc_after,
|
'punc_after': punc_after,
|
||||||
'punc_before': punc_before,
|
'punc_before': punc_before,
|
||||||
|
'speaker_id': word_obj.speaker_id,
|
||||||
})
|
})
|
||||||
|
|
||||||
punc_after = False
|
punc_after = False
|
||||||
|
|||||||
@@ -14,7 +14,12 @@ class GoogleConverter(TranscriptConverter):
|
|||||||
|
|
||||||
def pre_process(self, transcript_data):
|
def pre_process(self, transcript_data):
|
||||||
friendly = make_json_friendly(transcript_data)
|
friendly = make_json_friendly(transcript_data)
|
||||||
return json.loads(friendly)
|
json_data = json.loads(friendly)
|
||||||
|
last_datum = json_data[-1]
|
||||||
|
if last_datum.get('speaker_tag'):
|
||||||
|
"""Get rid of duplicate content that doesn't have speaker_tags"""
|
||||||
|
json_data = [jd for jd in json_data if jd.get('speaker_tag')]
|
||||||
|
return json_data
|
||||||
|
|
||||||
def get_word_objects(self, json_data):
|
def get_word_objects(self, json_data):
|
||||||
return json_data
|
return json_data
|
||||||
@@ -47,6 +52,7 @@ class GoogleConverter(TranscriptConverter):
|
|||||||
tagged_words),
|
tagged_words),
|
||||||
'punc_after': punc_after,
|
'punc_after': punc_after,
|
||||||
'punc_before': punc_before,
|
'punc_before': punc_before,
|
||||||
|
'speaker_id': word_obj.speaker_id,
|
||||||
})
|
})
|
||||||
|
|
||||||
return converted_words
|
return converted_words
|
||||||
@@ -74,13 +80,13 @@ class GoogleConverter(TranscriptConverter):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_word_word(word_object):
|
def get_word_word(word_object):
|
||||||
print(word_object)
|
|
||||||
return word_object['word']
|
return word_object['word']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def make_json_friendly(json_string):
|
def make_json_friendly(json_string):
|
||||||
lines = [line.strip() for line in json_string.split('\\n')]
|
lines = [line.strip() for line in json_string.split('\n')]
|
||||||
|
new_string = '['
|
||||||
|
|
||||||
fields = [
|
fields = [
|
||||||
'words {',
|
'words {',
|
||||||
@@ -92,54 +98,41 @@ def make_json_friendly(json_string):
|
|||||||
'confidence: '
|
'confidence: '
|
||||||
]
|
]
|
||||||
|
|
||||||
current_field_index = 0
|
start_field = 'words {'
|
||||||
new_string = ''
|
|
||||||
|
|
||||||
for line in lines:
|
open_braces = 0
|
||||||
|
|
||||||
current_field = fields[current_field_index]
|
for index, line in enumerate(lines):
|
||||||
|
if open_braces == 0:
|
||||||
if current_field in line:
|
if start_field in line:
|
||||||
if current_field_index == len(fields) - 1:
|
open_braces = 1
|
||||||
current_field_index = 0
|
new_string += '{'
|
||||||
else:
|
|
||||||
current_field_index += 1
|
|
||||||
if current_field_index == 1:
|
|
||||||
new_string += '}, {'
|
|
||||||
# "words" was found, don't want to append that
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else:
|
if '{' in line:
|
||||||
if current_field_index == 0:
|
open_braces += 1
|
||||||
# haven't found the beginning of the next word object
|
if '}' in line:
|
||||||
|
open_braces -= 1
|
||||||
|
|
||||||
|
if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]:
|
||||||
|
line = line + ', '
|
||||||
|
|
||||||
|
if open_braces == 0:
|
||||||
|
new_string += '}, '
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# add quotes around keys
|
|
||||||
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
|
line = re.sub('^(?!")([0-9a-zA-Z_]+)',
|
||||||
'"\\1"',
|
'"\\1"',
|
||||||
line)
|
line)
|
||||||
|
|
||||||
# add colons after keys
|
if 'start_time' in line:
|
||||||
if line.endswith('{'):
|
line = line.replace('"start_time"', '"start_time":')
|
||||||
line = line.replace('" ', '": ')
|
if 'end_time' in line:
|
||||||
|
line = line.replace('"end_time"', '"end_time":')
|
||||||
# use first two decimals of confidence
|
|
||||||
if 'confidence' in current_field:
|
|
||||||
line = ', ' + line
|
|
||||||
line = line[:20]
|
|
||||||
|
|
||||||
if current_field == '}':
|
|
||||||
line = line + ', '
|
|
||||||
|
|
||||||
new_string += line
|
new_string += line
|
||||||
|
|
||||||
# cleanup
|
new_string = new_string.replace('\\', '')
|
||||||
if new_string.startswith('}, '):
|
|
||||||
new_string = new_string[3:]
|
|
||||||
if not new_string.startswith('['):
|
|
||||||
new_string = '[' + new_string
|
|
||||||
if not new_string.endswith('}]'):
|
|
||||||
new_string = new_string + '}]'
|
|
||||||
new_string = new_string.replace(', }', '}').replace('\\', '')
|
|
||||||
|
|
||||||
return new_string
|
|
||||||
|
return new_string[:-2] + ']'
|
||||||
|
|||||||
@@ -4,11 +4,14 @@ def universal_transcript(self, pretty=False):
|
|||||||
return json.dumps(self.converted_words, indent=4 if pretty else None)
|
return json.dumps(self.converted_words, indent=4 if pretty else None)
|
||||||
|
|
||||||
def viral_overlay(self, pretty=False):
|
def viral_overlay(self, pretty=False):
|
||||||
return json.dumps([{
|
return json.dumps([
|
||||||
'start': word['start'],
|
{'start': word['start'],
|
||||||
'stop': word['end'],
|
'stop': word['end'],
|
||||||
'text': word['word'].title() if word['always_capitalized'] else word['word']}
|
'text': word['word'].title()
|
||||||
|
if word['always_capitalized'] else word['word']
|
||||||
|
}
|
||||||
|
|
||||||
for word in self.converted_words], indent=4 if pretty else None
|
for word in self.converted_words]
|
||||||
|
, indent=4 if pretty else None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user