Fixed problem with escaped Unicode chars, from Spanish sample data

2019-07-18 17:12:48 +02:00
parent 5693f91a20
commit 640710541e
7 changed files with 9486 additions and 12 deletions
--- a/B-050719-1400H-MARIA_R-PART_1.MP4.json
+++ b/B-050719-1400H-MARIA_R-PART_1.MP4.json
--- a/B-050719-1400H-MARIA_R-PART_1.MP4.txt
+++ b/B-050719-1400H-MARIA_R-PART_1.MP4.txt
--- a/transcript_processing/converter.py
+++ b/transcript_processing/converter.py
@@ -87,14 +87,14 @@ class TranscriptConverter:
        else:
            if word.upper() == 'I':
                return True
-            word_category = tagged_words[index][1] 
+            word_category = tagged_words[index][1]
            return word_category in helpers.PROPER_NOUN_TAGS

    def get_word_object(
-            self, 
-            word_object, 
-            index, 
-            tagged_words, 
+            self,
+            word_object,
+            index,
+            tagged_words,
            word_objects,
            speaker_segments=None,
            ):
@@ -107,7 +107,7 @@ class TranscriptConverter:
            self.check_if_always_capitalized(word, index, tagged_words),
            self.get_next_word(word_objects, index),
            self.get_speaker_id(word_object, speaker_segments),
-                ) 
+                )

    def get_next_word(self, word_objects, index):
        if index < len(word_objects) - 1:
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -1,3 +1,4 @@
+import codecs
 import json
 import re

@@ -38,6 +39,7 @@ class GoogleConverter(TranscriptConverter):
            punc_after = helpers.get_punc_after(word_obj.word) or False

            the_word = word_obj.word
+
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
@@ -128,6 +130,7 @@ def make_json_friendly(json_string):

        new_string += line

-    new_string = new_string.replace('\\', '')
+    if '\\' in new_string:
+        new_string = codecs.escape_decode(new_string)[0].decode('utf-8')

    return new_string[:-2] + ']'
--- a/transcript_processing/helpers.py
+++ b/transcript_processing/helpers.py
@@ -24,19 +24,19 @@ def is_a_proper_noun(phrase):
 def get_punc_before(word):
    punc = []
    for char in word:
-        if char.isalpha():
-            return punc
        if char in PUNCTUATION:
            punc.append(char)
+        else:
+            return punc


 def get_punc_after(word):
    punc = []
    for char in reversed(word):
-        if char.isalpha():
-            return punc
        if char in PUNCTUATION:
            punc.insert(0, char)
+        else:
+            return punc


 def is_path(string):
--- a/transcript_processing/tpro.py
+++ b/transcript_processing/tpro.py
@@ -11,7 +11,7 @@ output_choices =  [k for k, v in
                   if callable(v)]

@click.command()
-@click.option('-p', '--print-output', is_flag=True, default=True,
+@click.option('-p', '--print-output', is_flag=True, default=False,
        help='pretty print the transcript, breaks pipeability')
@click.option('--language-code', default='en-US',
        help='specify language, defaults to en-US.')
--- a/transcription_jobs.p
+++ b/transcription_jobs.p