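Added more support for language codes: a new `--language-code` CLI option (default `en-US`) is passed to the converter constructor, and word tagging via `helpers.tag_words` is skipped when the language code is not `en-US`.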

This commit is contained in:
2019-03-25 23:07:28 +01:00
parent 6fb95b4dd5
commit 12006591fa
6 changed files with 23 additions and 12 deletions

View File

@@ -16,8 +16,9 @@ class TranscriptConverter:
__metaclass__ = abc.ABCMeta __metaclass__ = abc.ABCMeta
def __init__(self, json_data: dict): def __init__(self, json_data: dict, language_code='en-US'):
self.json_data = json_data self.json_data = json_data
self.language_code = language_code
def convert(self): def convert(self):
tagged_words = None tagged_words = None
@@ -25,7 +26,10 @@ class TranscriptConverter:
word_objects = self.get_word_objects(self.json_data) word_objects = self.get_word_objects(self.json_data)
words = self.get_words(word_objects) words = self.get_words(word_objects)
tagged_words = helpers.tag_words(words) if self.language_code != 'en-US':
tagged_words = None
else:
tagged_words = helpers.tag_words(words)
self.converted_words = self.convert_words( self.converted_words = self.convert_words(
word_objects, word_objects,
@@ -77,10 +81,14 @@ class TranscriptConverter:
@staticmethod @staticmethod
def check_if_always_capitalized(word, index, tagged_words): def check_if_always_capitalized(word, index, tagged_words):
if word.upper() == 'I': if tagged_words is None:
return True return False
word_category = tagged_words[index][1]
return word_category in helpers.PROPER_NOUN_TAGS else:
if word.upper() == 'I':
return True
word_category = tagged_words[index][1]
return word_category in helpers.PROPER_NOUN_TAGS
def get_word_object( def get_word_object(
self, self,

View File

@@ -11,7 +11,7 @@ class AmazonConverter(TranscriptConverter):
name = 'amazon' name = 'amazon'
transcript_type = dict transcript_type = dict
def __init__(self, json_data): def __init__(self, json_data, language_code='en-US'):
super().__init__(json_data) super().__init__(json_data)
def get_word_objects(self, json_data) -> list: def get_word_objects(self, json_data) -> list:

View File

@@ -8,7 +8,7 @@ class GentleConverter(TranscriptConverter):
name = 'gentle' name = 'gentle'
transcript_type = dict transcript_type = dict
def __init__(self, json_data): def __init__(self, json_data, language_code='en-US'):
super().__init__(json_data) super().__init__(json_data)
def get_word_objects(self, json_data): def get_word_objects(self, json_data):

View File

@@ -10,7 +10,7 @@ class GoogleConverter(TranscriptConverter):
transcript_type = str transcript_type = str
def __init__(self, transcript_data: str): def __init__(self, transcript_data: str, language_code='en-US'):
super().__init__(transcript_data) super().__init__(transcript_data)
self.json_data = self.pre_process(transcript_data) self.json_data = self.pre_process(transcript_data)

View File

@@ -11,7 +11,7 @@ class SpeechmaticsConverter(TranscriptConverter):
name = 'speechmatics' name = 'speechmatics'
transcript_type = dict transcript_type = dict
def __init__(self, path): def __init__(self, path, language_code='en-US'):
super().__init__(path) super().__init__(path)
def get_word_objects(self, json_data): def get_word_objects(self, json_data):

View File

@@ -13,6 +13,8 @@ output_choices = [k for k, v in
@click.command() @click.command()
@click.option('-p', '--print-output', is_flag=True, default=True, @click.option('-p', '--print-output', is_flag=True, default=True,
help='pretty print the transcript, breaks pipeability') help='pretty print the transcript, breaks pipeability')
@click.option('--language-code', default='en-US',
help='specify language, defaults to en-US.')
@click.argument('transcript_data_path', type=click.File('r')) @click.argument('transcript_data_path', type=click.File('r'))
@click.argument('output_path', type=click.Path(writable=True, dir_okay=False)) @click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
@click.argument('input_format', type=click.Choice(services.keys())) @click.argument('input_format', type=click.Choice(services.keys()))
@@ -21,7 +23,8 @@ def cli(print_output,
transcript_data_path, transcript_data_path,
output_path, output_path,
input_format, input_format,
output_format): output_format,
language_code):
transcript_data_file_handle = transcript_data_path transcript_data_file_handle = transcript_data_path
@@ -31,7 +34,7 @@ def cli(print_output,
else: else:
transcript_data = transcript_data_file_handle.read() transcript_data = transcript_data_file_handle.read()
converter = service(transcript_data) converter = service(transcript_data, language_code)
converter.convert() converter.convert()
converter.save(output_path, output_format) converter.save(output_path, output_format)