From 12006591fab1a7fb012621baeae90baad5aed943 Mon Sep 17 00:00:00 2001
From: zevav
Date: Mon, 25 Mar 2019 23:07:28 +0100
Subject: [PATCH] added more support for language codes

Add a --language-code CLI option (default 'en-US') and pass it through
the converter constructors. When the language code is anything other
than 'en-US', helpers.tag_words is skipped (tagged_words stays None) and
check_if_always_capitalized returns False for every word.

---
Review notes (ignored by git am):
- Each converter subclass now forwards language_code to
  TranscriptConverter.__init__. As first written the subclasses accepted
  the argument but called super().__init__ without it, so
  self.language_code was always 'en-US' and --language-code had no
  effect.
- Whitespace-only context lines were re-inferred from the hunk line
  counts; verify them against the target tree before applying.
- The post-image blob ids on the "index" lines of the four converter
  files are the ones from the first version of this patch and are stale.

 transcript_processing/converter.py         | 20 +++++++++++++------
 transcript_processing/converters/amazon.py |  4 ++--
 transcript_processing/converters/gentle.py |  4 ++--
 transcript_processing/converters/google.py |  4 ++--
 .../converters/speechmatics.py             |  4 ++--
 transcript_processing/tpro.py              |  7 +++++--
 6 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/transcript_processing/converter.py b/transcript_processing/converter.py
index 4456584..bfe6e8a 100644
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -16,8 +16,9 @@ class TranscriptConverter:
 
     __metaclass__ = abc.ABCMeta
 
-    def __init__(self, json_data: dict):
+    def __init__(self, json_data: dict, language_code='en-US'):
         self.json_data = json_data
+        self.language_code = language_code
 
     def convert(self):
         tagged_words = None
@@ -25,7 +26,10 @@ class TranscriptConverter:
         word_objects = self.get_word_objects(self.json_data)
         words = self.get_words(word_objects)
 
-        tagged_words = helpers.tag_words(words)
+        if self.language_code != 'en-US':
+            tagged_words = None
+        else:
+            tagged_words = helpers.tag_words(words)
 
         self.converted_words = self.convert_words(
             word_objects,
@@ -77,10 +81,14 @@ class TranscriptConverter:
 
     @staticmethod
     def check_if_always_capitalized(word, index, tagged_words):
-        if word.upper() == 'I':
-            return True
-        word_category = tagged_words[index][1]
-        return word_category in helpers.PROPER_NOUN_TAGS
+        if tagged_words is None:
+            return False
+
+        else:
+            if word.upper() == 'I':
+                return True
+            word_category = tagged_words[index][1]
+            return word_category in helpers.PROPER_NOUN_TAGS
 
     def get_word_object(
             self,
diff --git a/transcript_processing/converters/amazon.py b/transcript_processing/converters/amazon.py
index 275a990..7610bd8 100644
--- a/transcript_processing/converters/amazon.py
+++ b/transcript_processing/converters/amazon.py
@@ -11,7 +11,7 @@ class AmazonConverter(TranscriptConverter):
     name = 'amazon'
     transcript_type = dict
 
-    def __init__(self, json_data):
-        super().__init__(json_data)
+    def __init__(self, json_data, language_code='en-US'):
+        super().__init__(json_data, language_code)
 
     def get_word_objects(self, json_data) -> list:
diff --git a/transcript_processing/converters/gentle.py b/transcript_processing/converters/gentle.py
index f6a0016..b65b08e 100644
--- a/transcript_processing/converters/gentle.py
+++ b/transcript_processing/converters/gentle.py
@@ -8,7 +8,7 @@ class GentleConverter(TranscriptConverter):
     name = 'gentle'
     transcript_type = dict
 
-    def __init__(self, json_data):
-        super().__init__(json_data)
+    def __init__(self, json_data, language_code='en-US'):
+        super().__init__(json_data, language_code)
 
     def get_word_objects(self, json_data):
diff --git a/transcript_processing/converters/google.py b/transcript_processing/converters/google.py
index d85f6d3..07d9b9d 100644
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -10,7 +10,7 @@ class GoogleConverter(TranscriptConverter):
 
     transcript_type = str
 
-    def __init__(self, transcript_data: str):
-        super().__init__(transcript_data)
+    def __init__(self, transcript_data: str, language_code='en-US'):
+        super().__init__(transcript_data, language_code)
         self.json_data = self.pre_process(transcript_data)
 
diff --git a/transcript_processing/converters/speechmatics.py b/transcript_processing/converters/speechmatics.py
index b63a0f8..b93cffe 100644
--- a/transcript_processing/converters/speechmatics.py
+++ b/transcript_processing/converters/speechmatics.py
@@ -11,7 +11,7 @@ class SpeechmaticsConverter(TranscriptConverter):
     name = 'speechmatics'
     transcript_type = dict
 
-    def __init__(self, path):
-        super().__init__(path)
+    def __init__(self, path, language_code='en-US'):
+        super().__init__(path, language_code)
 
     def get_word_objects(self, json_data):
diff --git a/transcript_processing/tpro.py b/transcript_processing/tpro.py
index cc32867..7766d29 100644
--- a/transcript_processing/tpro.py
+++ b/transcript_processing/tpro.py
@@ -13,6 +13,8 @@ output_choices = [k for k, v in
 @click.command()
 @click.option('-p', '--print-output', is_flag=True, default=True,
               help='pretty print the transcript, breaks pipeability')
+@click.option('--language-code', default='en-US',
+              help='specify language, defaults to en-US.')
 @click.argument('transcript_data_path', type=click.File('r'))
 @click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
 @click.argument('input_format', type=click.Choice(services.keys()))
@@ -21,7 +23,8 @@ def cli(print_output,
         transcript_data_path,
         output_path,
         input_format,
-        output_format):
+        output_format,
+        language_code):
 
     transcript_data_file_handle = transcript_data_path
 
@@ -31,7 +34,7 @@ def cli(print_output,
     else:
         transcript_data = transcript_data_file_handle.read()
 
-    converter = service(transcript_data)
+    converter = service(transcript_data, language_code)
     converter.convert()
     converter.save(output_path, output_format)
 