fixed problem with escaped Unicode chars, from Spanish sample data

2019-07-18 17:12:48 +02:00
parent 5693f91a20
commit 640710541e
7 changed files with 9486 additions and 12 deletions
--- a/transcript_processing/converters/google.py
+++ b/transcript_processing/converters/google.py
@@ -1,3 +1,4 @@
+import codecs
 import json
 import re

@@ -38,6 +39,7 @@ class GoogleConverter(TranscriptConverter):
            punc_after = helpers.get_punc_after(word_obj.word) or False

            the_word = word_obj.word
+
            if punc_before:
                the_word = the_word[len(punc_before):]
            if punc_after:
@@ -128,6 +130,7 @@ def make_json_friendly(json_string):

        new_string += line

-    new_string = new_string.replace('\\', '')
+    if '\\' in new_string:
+        new_string = codecs.escape_decode(new_string)[0].decode('utf-8')

    return new_string[:-2] + ']'