diff --git a/README.md b/README.md index 63d9da9..4ea75ee 100644 --- a/README.md +++ b/README.md @@ -53,3 +53,4 @@ formats. - Word (`.doc`, `.docx`) - text files - SRT (subtitles) +- Draft.js JSON diff --git a/README_PYPI.md b/README_PYPI.md index 111d44d..701df52 100644 --- a/README_PYPI.md +++ b/README_PYPI.md @@ -111,3 +111,4 @@ formats. - Word (`.doc`, `.docx`) - text files - SRT (subtitles) +- Draft.js JSON diff --git a/setup.py b/setup.py index 1d68834..c56b025 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ with open('README_PYPI.md') as file: setup( name="tpro", - version="0.11", + version="0.12", url='https://github.com/zevaverbach/tpro', install_requires=[ 'Click', diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 015d681..2a8710f 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -79,13 +79,14 @@ def test_google(): g = GoogleConverter(transcript_data) g.convert() + print(g.converted_words[0]) assert g.converted_words[0] == { - 'start': 4, - 'end': 5.5, - 'confidence': 0.88, - 'word': 'Testing', - 'always_capitalized': False, - 'punc_after': [','], - 'punc_before': False, - } - + 'start': 0.4, + 'end': 2.1, + 'confidence': 0.9128385782241821, + 'word': 'Okay', + 'always_capitalized': False, + 'punc_after': [','], + 'punc_before': False, + 'speaker_id': None + } diff --git a/tests/test_convert_google.py b/tests/test_convert_google.py index 7fa5f6c..545d78f 100644 --- a/tests/test_convert_google.py +++ b/tests/test_convert_google.py @@ -27,6 +27,7 @@ def test_pre_process(transcript): g = GoogleConverter(transcript_data) assert g.json_data + print(g.json_data) diff --git a/transcript_processing/converter.py b/transcript_processing/converter.py index 28a1582..7e1697c 100644 --- a/transcript_processing/converter.py +++ b/transcript_processing/converter.py @@ -7,7 +7,9 @@ from . import converters -Word = namedtuple('Word', 'start end confidence word always_capitalized next_word') +Word = namedtuple( + 'Word', + 'start end confidence word always_capitalized next_word speaker_id') class TranscriptConverter: @@ -58,6 +60,11 @@ class TranscriptConverter: def get_word_confidence(word_object): pass + @staticmethod + @abc.abstractmethod + def get_speaker_id(word_object): + pass + @staticmethod @abc.abstractmethod def get_word_word(word_object): @@ -78,7 +85,8 @@ class TranscriptConverter: self.get_word_confidence(word_object), word, self.check_if_always_capitalized(word, index, tagged_words), - self.get_next_word(word_objects, index) + self.get_next_word(word_objects, index), + self.get_speaker_id(word_object), ) def get_next_word(self, word_objects, index): diff --git a/transcript_processing/converters/amazon.py b/transcript_processing/converters/amazon.py index 4196f2a..5ff147f 100644 --- a/transcript_processing/converters/amazon.py +++ b/transcript_processing/converters/amazon.py @@ -35,6 +35,10 @@ class AmazonConverter(TranscriptConverter): word_word = 'I' return word_word + @staticmethod + def get_speaker_id(word_object): + return None + def convert_words(self, word_objects, words, tagged_words=None): converted_words = [] @@ -74,6 +78,7 @@ class AmazonConverter(TranscriptConverter): tagged_words), 'punc_after': punc_after, 'punc_before': punc_before, + 'speaker_id': word_obj.speaker_id, }) punc_after = False diff --git a/transcript_processing/converters/google.py b/transcript_processing/converters/google.py index 0b5061a..81d3411 100644 --- a/transcript_processing/converters/google.py +++ b/transcript_processing/converters/google.py @@ -14,7 +14,12 @@ class GoogleConverter(TranscriptConverter): def pre_process(self, transcript_data): friendly = make_json_friendly(transcript_data) - return json.loads(friendly) + json_data = json.loads(friendly) + last_datum = json_data[-1] + if last_datum.get('speaker_tag'): + """Get rid of duplicate content that doesn't have speaker_tags""" + json_data = [jd for jd in json_data if jd.get('speaker_tag')] + return json_data def get_word_objects(self, json_data): return json_data @@ -47,6 +52,7 @@ class GoogleConverter(TranscriptConverter): tagged_words), 'punc_after': punc_after, 'punc_before': punc_before, + 'speaker_id': word_obj.speaker_id, }) return converted_words @@ -74,13 +80,13 @@ class GoogleConverter(TranscriptConverter): @staticmethod def get_word_word(word_object): - print(word_object) return word_object['word'] def make_json_friendly(json_string): - lines = [line.strip() for line in json_string.split('\\n')] + lines = [line.strip() for line in json_string.split('\n')] + new_string = '[' fields = [ 'words {', @@ -92,54 +98,41 @@ def make_json_friendly(json_string): 'confidence: ' ] - current_field_index = 0 - new_string = '' + start_field = 'words {' - for line in lines: + open_braces = 0 - current_field = fields[current_field_index] + for index, line in enumerate(lines): + if open_braces == 0: + if start_field in line: + open_braces = 1 + new_string += '{' + continue - if current_field in line: - if current_field_index == len(fields) - 1: - current_field_index = 0 - else: - current_field_index += 1 - if current_field_index == 1: - new_string += '}, {' - # "words" was found, don't want to append that - continue + if '{' in line: + open_braces += 1 + if '}' in line: + open_braces -= 1 - else: - if current_field_index == 0: - # haven't found the beginning of the next word object - continue - - # add quotes around keys - line = re.sub('^(?!")([0-9a-zA-Z_]+)', - '"\\1"', - line) - - # add colons after keys - if line.endswith('{'): - line = line.replace('" ', '": ') - - # use first two decimals of confidence - if 'confidence' in current_field: - line = ', ' + line - line = line[:20] - - if current_field == '}': + if open_braces > 0 and '{' not in line and '}' not in lines[index + 1]: line = line + ', ' + if open_braces == 0: + new_string += '}, ' + continue + + line = re.sub('^(?!")([0-9a-zA-Z_]+)', + '"\\1"', + line) + + if 'start_time' in line: + line = line.replace('"start_time"', '"start_time":') + if 'end_time' in line: + line = line.replace('"end_time"', '"end_time":') + new_string += line - # cleanup - if new_string.startswith('}, '): - new_string = new_string[3:] - if not new_string.startswith('['): - new_string = '[' + new_string - if not new_string.endswith('}]'): - new_string = new_string + '}]' - new_string = new_string.replace(', }', '}').replace('\\', '') + new_string = new_string.replace('\\', '') - return new_string + + return new_string[:-2] + ']' diff --git a/transcript_processing/outputs.py b/transcript_processing/outputs.py index 26acf68..f073e48 100644 --- a/transcript_processing/outputs.py +++ b/transcript_processing/outputs.py @@ -4,11 +4,14 @@ def universal_transcript(self, pretty=False): return json.dumps(self.converted_words, indent=4 if pretty else None) def viral_overlay(self, pretty=False): - return json.dumps([{ - 'start': word['start'], - 'stop': word['end'], - 'text': word['word'].title() if word['always_capitalized'] else word['word']} + return json.dumps([ + {'start': word['start'], + 'stop': word['end'], + 'text': word['word'].title() + if word['always_capitalized'] else word['word'] + } - for word in self.converted_words], indent=4 if pretty else None + for word in self.converted_words] + , indent=4 if pretty else None )