Files
tpro/converter.py

107 lines
2.6 KiB
Python

import abc
import json
from collections import namedtuple
import os
import helpers
import converters
Word = namedtuple('Word', 'start end confidence word is_proper_noun next_word')
class TranscriptConverter:
__metaclass__ = abc.ABCMeta
def __init__(self, path, output_target):
self.path = path
self.output_target = output_target
def convert(self):
tagged_words = None
with open(self.path) as f:
data = json.load(f)
word_objects = self.get_word_objects(data)
words = self.get_words(word_objects)
tagged_words = helpers.tag_words(words)
self.converted_words = self.convert_words(
word_objects,
words,
tagged_words
)
@staticmethod
@abc.abstractmethod
def get_word_objects(json_data):
pass
@staticmethod
@abc.abstractmethod
def get_words(word_objects):
pass
@staticmethod
@abc.abstractmethod
def convert_words(word_objects, words, tagged_words=None):
pass
@staticmethod
@abc.abstractmethod
def get_word_start(word_object):
pass
@staticmethod
@abc.abstractmethod
def get_word_end(word_object):
pass
@staticmethod
@abc.abstractmethod
def get_word_confidence(word_object):
pass
@staticmethod
@abc.abstractmethod
def get_word_word(word_object):
pass
@staticmethod
def check_if_proper_noun(index, tagged_words):
return tagged_words[index][1] in helpers.PROPER_NOUN_TAGS
def get_word_object(self, word_object, index, tagged_words, word_objects):
return Word(
self.get_word_start(word_object),
self.get_word_end(word_object),
self.get_word_confidence(word_object),
self.get_word_word(word_object),
self.check_if_proper_noun(index, tagged_words),
self.get_next_word(word_objects, index)
)
def get_next_word(self, word_objects, index):
if index < len(word_objects) - 1:
return word_objects[index + 1]
def interactive_transcript(self):
return json.dumps(self.converted_words, indent=4)
def viral_overlay(self):
return json.dumps(
[{'start': word['start'],
'stop': word['end'],
'text': word['word']}
for word in self.converted_words],
indent=4
)
def save(self, path):
with open(path, 'w') as fout:
fout.write(getattr(self, self.output_target)())
return path