added more support for language codes
This commit is contained in:
@@ -16,8 +16,9 @@ class TranscriptConverter:
|
|||||||
|
|
||||||
__metaclass__ = abc.ABCMeta
|
__metaclass__ = abc.ABCMeta
|
||||||
|
|
||||||
def __init__(self, json_data: dict):
|
def __init__(self, json_data: dict, language_code='en-US'):
|
||||||
self.json_data = json_data
|
self.json_data = json_data
|
||||||
|
self.language_code = language_code
|
||||||
|
|
||||||
def convert(self):
|
def convert(self):
|
||||||
tagged_words = None
|
tagged_words = None
|
||||||
@@ -25,7 +26,10 @@ class TranscriptConverter:
|
|||||||
word_objects = self.get_word_objects(self.json_data)
|
word_objects = self.get_word_objects(self.json_data)
|
||||||
words = self.get_words(word_objects)
|
words = self.get_words(word_objects)
|
||||||
|
|
||||||
tagged_words = helpers.tag_words(words)
|
if self.language_code != 'en-US':
|
||||||
|
tagged_words = None
|
||||||
|
else:
|
||||||
|
tagged_words = helpers.tag_words(words)
|
||||||
|
|
||||||
self.converted_words = self.convert_words(
|
self.converted_words = self.convert_words(
|
||||||
word_objects,
|
word_objects,
|
||||||
@@ -77,10 +81,14 @@ class TranscriptConverter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def check_if_always_capitalized(word, index, tagged_words):
|
def check_if_always_capitalized(word, index, tagged_words):
|
||||||
if word.upper() == 'I':
|
if tagged_words is None:
|
||||||
return True
|
return False
|
||||||
word_category = tagged_words[index][1]
|
|
||||||
return word_category in helpers.PROPER_NOUN_TAGS
|
else:
|
||||||
|
if word.upper() == 'I':
|
||||||
|
return True
|
||||||
|
word_category = tagged_words[index][1]
|
||||||
|
return word_category in helpers.PROPER_NOUN_TAGS
|
||||||
|
|
||||||
def get_word_object(
|
def get_word_object(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ class AmazonConverter(TranscriptConverter):
|
|||||||
name = 'amazon'
|
name = 'amazon'
|
||||||
transcript_type = dict
|
transcript_type = dict
|
||||||
|
|
||||||
def __init__(self, json_data):
|
def __init__(self, json_data, language_code='en-US'):
|
||||||
super().__init__(json_data)
|
super().__init__(json_data)
|
||||||
|
|
||||||
def get_word_objects(self, json_data) -> list:
|
def get_word_objects(self, json_data) -> list:
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ class GentleConverter(TranscriptConverter):
|
|||||||
name = 'gentle'
|
name = 'gentle'
|
||||||
transcript_type = dict
|
transcript_type = dict
|
||||||
|
|
||||||
def __init__(self, json_data):
|
def __init__(self, json_data, language_code='en-US'):
|
||||||
super().__init__(json_data)
|
super().__init__(json_data)
|
||||||
|
|
||||||
def get_word_objects(self, json_data):
|
def get_word_objects(self, json_data):
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ class GoogleConverter(TranscriptConverter):
|
|||||||
|
|
||||||
transcript_type = str
|
transcript_type = str
|
||||||
|
|
||||||
def __init__(self, transcript_data: str):
|
def __init__(self, transcript_data: str, language_code='en-US'):
|
||||||
super().__init__(transcript_data)
|
super().__init__(transcript_data)
|
||||||
self.json_data = self.pre_process(transcript_data)
|
self.json_data = self.pre_process(transcript_data)
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ class SpeechmaticsConverter(TranscriptConverter):
|
|||||||
name = 'speechmatics'
|
name = 'speechmatics'
|
||||||
transcript_type = dict
|
transcript_type = dict
|
||||||
|
|
||||||
def __init__(self, path):
|
def __init__(self, path, language_code='en-US'):
|
||||||
super().__init__(path)
|
super().__init__(path)
|
||||||
|
|
||||||
def get_word_objects(self, json_data):
|
def get_word_objects(self, json_data):
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ output_choices = [k for k, v in
|
|||||||
@click.command()
|
@click.command()
|
||||||
@click.option('-p', '--print-output', is_flag=True, default=True,
|
@click.option('-p', '--print-output', is_flag=True, default=True,
|
||||||
help='pretty print the transcript, breaks pipeability')
|
help='pretty print the transcript, breaks pipeability')
|
||||||
|
@click.option('--language-code', default='en-US',
|
||||||
|
help='specify language, defaults to en-US.')
|
||||||
@click.argument('transcript_data_path', type=click.File('r'))
|
@click.argument('transcript_data_path', type=click.File('r'))
|
||||||
@click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
|
@click.argument('output_path', type=click.Path(writable=True, dir_okay=False))
|
||||||
@click.argument('input_format', type=click.Choice(services.keys()))
|
@click.argument('input_format', type=click.Choice(services.keys()))
|
||||||
@@ -21,7 +23,8 @@ def cli(print_output,
|
|||||||
transcript_data_path,
|
transcript_data_path,
|
||||||
output_path,
|
output_path,
|
||||||
input_format,
|
input_format,
|
||||||
output_format):
|
output_format,
|
||||||
|
language_code):
|
||||||
|
|
||||||
transcript_data_file_handle = transcript_data_path
|
transcript_data_file_handle = transcript_data_path
|
||||||
|
|
||||||
@@ -31,7 +34,7 @@ def cli(print_output,
|
|||||||
else:
|
else:
|
||||||
transcript_data = transcript_data_file_handle.read()
|
transcript_data = transcript_data_file_handle.read()
|
||||||
|
|
||||||
converter = service(transcript_data)
|
converter = service(transcript_data, language_code)
|
||||||
converter.convert()
|
converter.convert()
|
||||||
converter.save(output_path, output_format)
|
converter.save(output_path, output_format)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user