From 12006591fab1a7fb012621baeae90baad5aed943 Mon Sep 17 00:00:00 2001
From: zevav <zev@averba.ch>
Date: Mon, 25 Mar 2019 23:07:28 +0100
Subject: [PATCH] Add --language-code option; skip POS tagging unless en-US

---
 transcript_processing/converter.py            | 20 +++++++++++++------
 transcript_processing/converters/amazon.py    |  2 +-
 transcript_processing/converters/gentle.py    |  2 +-
 transcript_processing/converters/google.py    |  2 +-
 .../converters/speechmatics.py                |  2 +-
 transcript_processing/tpro.py                 |  7 +++++--
 6 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/transcript_processing/converter.py b/transcript_processing/converter.py
index 4456584..bfe6e8a 100644
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -16,8 +16,9 @@ class TranscriptConverter:
 
     __metaclass__ = abc.ABCMeta
 
-    def __init__(self, json_data: dict):
+    def __init__(self, json_data: dict, language_code='en-US'):
         self.json_data = json_data
+        self.language_code = language_code
 
     def convert(self):
         tagged_words = None
@@ -25,7 +26,10 @@ class TranscriptConverter:
         word_objects = self.get_word_objects(self.json_data)
         words = self.get_words(word_objects)
 
-        tagged_words = helpers.tag_words(words)
+        if self.language_code != 'en-US':
+            tagged_words = None
+        else:
+            tagged_words = helpers.tag_words(words)
 
         self.converted_words = self.convert_words(
                 word_objects,
@@ -77,10 +81,14 @@ class TranscriptConverter:
 
     @staticmethod
     def check_if_always_capitalized(word, index, tagged_words):
-        if word.upper() == 'I':
-            return True
-        word_category = tagged_words[index][1] 
-        return word_category in helpers.PROPER_NOUN_TAGS
+        # tagged_words is None when convert() skipped tagging (language
+        # code other than 'en-US'); nothing is force-capitalized then.
+        if tagged_words is None:
+            return False
+        if word.upper() == 'I':
+            return True
+        word_category = tagged_words[index][1]
+        return word_category in helpers.PROPER_NOUN_TAGS
 
     def get_word_object(
             self, 
diff --git a/transcript_processing/converters/amazon.py b/transcript_processing/converters/amazon.py
index 275a990..7610bd8 100644
--- a/transcript_processing/converters/amazon.py
+++ b/transcript_processing/converters/amazon.py
@@ -11,7 +11,7 @@ class AmazonConverter(TranscriptConverter):
     name = 'amazon'
     transcript_type = dict
 
-    def __init__(self, json_data):
-        super().__init__(json_data)
+    def __init__(self, json_data, language_code='en-US'):
+        super().__init__(json_data, language_code=language_code)
 
     def get_word_objects(self, json_data) -> list:
diff --git a/transcript_processing/converters/gentle.py b/transcript_processing/converters/gentle.py
index f6a0016..b65b08e 100644
--- a/transcript_processing/converters/gentle.py
+++ b/transcript_processing/converters/gentle.py
@@ -8,7 +8,7 @@ class GentleConverter(TranscriptConverter):
     name = 'gentle'
     transcript_type = dict
 
-    def __init__(self, json_data):
-        super().__init__(json_data)
+    def __init__(self, json_data, language_code='en-US'):
+        super().__init__(json_data, language_code=language_code)
 
     def get_word_objects(self, json_data):
diff --git a/transcript_processing/converters/google.py b/transcript_processing/converters/google.py
index d85f6d3..07d9b9d 100644
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -10,7 +10,7 @@ class GoogleConverter(TranscriptConverter):
 
     transcript_type = str
 
-    def __init__(self, transcript_data: str):
-        super().__init__(transcript_data)
+    def __init__(self, transcript_data: str, language_code='en-US'):
+        super().__init__(transcript_data, language_code=language_code)
         self.json_data = self.pre_process(transcript_data)
 
diff --git a/transcript_processing/converters/speechmatics.py b/transcript_processing/converters/speechmatics.py
index b63a0f8..b93cffe 100644
--- a/transcript_processing/converters/speechmatics.py
+++ b/transcript_processing/converters/speechmatics.py
@@ -11,7 +11,7 @@ class SpeechmaticsConverter(TranscriptConverter):
     name = 'speechmatics'
     transcript_type = dict
 
-    def __init__(self, path):
-        super().__init__(path)
+    def __init__(self, path, language_code='en-US'):
+        super().__init__(path, language_code=language_code)
 
     def get_word_objects(self, json_data):
diff --git a/transcript_processing/tpro.py b/transcript_processing/tpro.py
index cc32867..7766d29 100644
--- a/transcript_processing/tpro.py
+++ b/transcript_processing/tpro.py
@@ -13,6 +13,8 @@ output_choices =  [k for k, v in
 @click.command()
 @click.option('-p', '--print-output', is_flag=True, default=True,
         help='pretty print the transcript, breaks pipeability')
+@click.option('--language-code', default='en-US',
+        help='specify language, defaults to en-US.')
 @click.argument('transcript_data_path', type=click.File('r'))
 @click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
 @click.argument('input_format', type=click.Choice(services.keys()))
@@ -21,7 +23,8 @@ def cli(print_output,
         transcript_data_path,
         output_path,
         input_format,
-        output_format):
+        output_format,
+        language_code):
 
     transcript_data_file_handle = transcript_data_path
 
@@ -31,7 +34,7 @@ def cli(print_output,
     else:
         transcript_data = transcript_data_file_handle.read()
 
-    converter = service(transcript_data)
+    converter = service(transcript_data, language_code)
     converter.convert()
     converter.save(output_path, output_format)