From e47f7a04e81210a7423a03ec068a7b13f28fab43 Mon Sep 17 00:00:00 2001 From: zevav Date: Fri, 12 Oct 2018 18:49:54 -0400 Subject: [PATCH] first --- .DS_Store | Bin 0 -> 6148 bytes .vscode/settings.json | 3 +++ __pycache__/converters.cpython-36.pyc | Bin 0 -> 1126 bytes __pycache__/helpers.cpython-36.pyc | Bin 0 -> 813 bytes __pycache__/models.cpython-36.pyc | Bin 0 -> 1037 bytes converters.py | 33 ++++++++++++++++++++++++++ helpers.py | 17 +++++++++++++ models.py | 27 +++++++++++++++++++++ 8 files changed, 80 insertions(+) create mode 100644 .DS_Store create mode 100644 .vscode/settings.json create mode 100644 __pycache__/converters.cpython-36.pyc create mode 100644 __pycache__/helpers.cpython-36.pyc create mode 100644 __pycache__/models.cpython-36.pyc create mode 100644 converters.py create mode 100644 helpers.py create mode 100644 models.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0W3WPX{I|OYMfjESaP(XwTX#t&&|CKai@%&TceYd`ay&Z~kt%)&a#B8<4lV@@59H^>li|DQZWR%40LW6XROe4<&x zLY|a38?a~%jpAQ$CmLZBPI*z*c{TD3otA|%{(4ikjiJv0ZIpeyEmLYhp`5kA7b zxBppTO^)ysy*qqMdfNMnXrIv`*!w^S?&RCk66!;D3Y;whuMl#^I%d%}5fzIY!jeAG zo`z^P7$W6u2mL_f4^#&_)RB&Lq6ghqh)B1IrflvwJOZpg-hPcV21MppjqmzA?}k7F z6ll$MbH#O|(Yq8n!VC@37xIxp9AMDoHUEYHEHb`p@SYfeuVW|6lXc03oE-Aq$yBu6 z`*wdKc-P8_5_uyFQFdz9iMHTUmd)m*Xq#Ow6oibsgOg;rf*}-b-K|(ur7b8xhj(Ph zd9hXJsx0K}JY>9FwMAaZ6%fwC3{_>#vxw~o@P@G_;J#5Z-V|KeoCc2#I>9%7*%2E2 zbQET>MPW_lLS>;9A2&=~wpMCS#+O}Q@a%G4?d1pZQQnm*ugXIXma<`df8>k6ddIxk zsX1KBg{@nJRvF}7#~U_Egim%vlNsD2MuaE`r-tZ5NI~;`o literal 0 HcmV?d00001 diff --git a/__pycache__/helpers.cpython-36.pyc b/__pycache__/helpers.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..942b99998173e9c59bdea893ed81df072f0deeea GIT binary patch literal 813 zcmZWnL2uJA6tO#VXL}R z_gj$o8T<(jxpLZH*okKYQ&lYa*>T?c-uHd>?#@p0+p8ZRe^Nqzk+ost{vA%(!@vor zlFVp8DG`)g++JCM#cRCIox~1ozQx@;64ZEuH*u}=?J?=L?y-_|Y1J8*QFf6l?jMaN zQIZI?*z@L~JekLl^v=`F6Imitn6gMpwj0X`fbkScNM|fo7Yu}A$zt(xqYD1wzRB?a zWu$`EaCAKIPlCbZWatOZ>CtG6tNVUHybV*heX;Z&zRTh+$ zT}BDAuYX16@SP|J^B(rSGb97vLVWQKRbHIu*PaqZ4qmCE3}TfQWmu>@7656McvB$@ z%xA?-)j+(^fa#8)`xB~MjE}tZnKs_=dV!mgERop?ncdRT;%NOHS<*xNHe7B0zf!;| zc#dI7K0UpqOUlVJgt>A)5K2BH1WL5-wE;egYoCsW=u+qp&-`#QI38>5>}{}(Tob_! zT_gNY*ut`hMlTVWxGq$`t$S(CHM)46LKtb~C^yXVdB%F?dZEuA^Sic@Th*bEDHlGki@H>GaOZcOpp(h+iXjyH_d1kjsI;w08*eC> f^tB!uNNIJI+neQlE=6BkWZp*0s!_*o+HUPPq#(sU literal 0 HcmV?d00001 diff --git a/__pycache__/models.cpython-36.pyc b/__pycache__/models.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7deb95c093fe76bc2fe7a48f9694a36beabdd8a0 GIT binary patch literal 1037 zcmZuvy^a$x5VpPEWC=Mya7cH6C=ksyNdW}~LP&?8i^QKmOJVKB4(w{PUTo(MqFlAt zqI&=YJv}eLYq+JlSD<3XxtyX?EZH90<8NlZZys!I^?u*`^Wi5W*+5s$* zNLrC>xzmionN@7cGDgV@A|uIRe7zQ^yNq?hCsc@H}TVIxmX+VQ-}-6zj=qiTbxdR|&x zdtT{W`uI>AX;bdGRn^4A@Sa_sXfw^7sPn1v(WIz<5z~iw+($Tct=DWT*tu>6fdvC-%h9YCc{QA<_3G%{Vd@Ufgt8*4CbN)56aYFRE@=+Sjr zAR{;7V38m~=eIYdb>?@T70QLUA*<(LU{)oAyg-l=x~GL6*d^bgr+%;_>Nv;;eRZ;Z z1mcx3_;p&+Qc5le7mw!g05dXJ_x$RLQYFK!r-y0RGi`%fKH1N$3R1gc_l5X8 z%d6#!xdMP81G9@3w$R#w+#942WeRZ7Jd literal 0 HcmV?d00001 diff --git a/converters.py b/converters.py new file mode 100644 index 0000000..91b1f2e --- /dev/null +++ b/converters.py @@ -0,0 +1,33 @@ +from decimal import Decimal +from typing import Dict, Union, List + +from helpers import tag_words, PROPER_NOUN_TAGS + + +def speechmatics_converter(data: Dict[str, Union[Dict[str, str], List[Dict[str, str]]]]): + converted_words = [] + words = data['words'] + tagged_words = tag_words([w['name'] for w in words]) + + for index, w in enumerate(words): + word_start = Decimal(w['time']) + word_end = word_start + Decimal(w['duration']) + confidence = Decimal(w['confidence']) + word = w['name'] + space = '' if word == '.' else ' ' + is_proper_noun = tagged_words[index][1] in PROPER_NOUN_TAGS + converted_words.append({ + 'wordStart': word_start, + 'wordEnd': word_end, + 'confidence': confidence, + 'word': word, + 'space': space, + 'alwaysCapitalized': is_proper_noun or word == 'I', + 'index': index, + }) + return converted_words + + +converters = { + 'speechmatics': speechmatics_converter, +} diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000..4ed4388 --- /dev/null +++ b/helpers.py @@ -0,0 +1,17 @@ +from nltk.tag.stanford import StanfordNERTagger + +st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz', + '/usr/local/bin/stanford-ner.jar') + + +PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION'] + + +def tag_words(words): + return st.tag(words) + + +def is_a_proper_noun(phrase): + tagged_words = tag_words(phrase.split()) + return any(tagged_word[1] in PROPER_NOUN_TAGS + for tagged_word in tagged_words) diff --git a/models.py b/models.py new file mode 100644 index 0000000..bda7729 --- /dev/null +++ b/models.py @@ -0,0 +1,27 @@ +import json +import os + +from converters import converters + + +class TranscriptConverter: + + def __init__(self, path, format_name): + self.path = path + with open(path, 'r') as fin: + self.words = converters[format_name](json.load(fin)) + # wordStart + # wordEnd + # word + # confidence + # index + # space + # alwaysCapitalized + + def to_json(self): + return json.dumps(self.words, indent=4) + + def save(self): + name = f"{os.path.basename(self.path).split('.json')[0]}_processed.json" + with open(name, 'w') as fout: + fout.write(self.to_json())