Updated tests to the current format of the converters, updated the static/non-static decorators on the base class to match its children, added GoogleConverter, and moved one or two methods to the base class because they all work the same way.

2019-03-07 02:30:48 -05:00
parent 6333f55c87
commit d30ecad583
16 changed files with 346 additions and 46 deletions
--- a/transcript_processing/helpers.py
+++ b/transcript_processing/helpers.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+from nltk.tag.stanford import StanfordNERTagger
+
+# Module-level Stanford NER tagger, constructed once at import time from the
+# hard-coded classifier-model and jar paths below. tag_words() delegates to it.
+st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
+                       '/usr/local/bin/stanford-ner.jar')
+
+
+# Tags that is_a_proper_noun() treats as marking a proper noun.
+PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
+
+# Characters that get_punc_before() / get_punc_after() collect as punctuation.
+PUNCTUATION = ['.', '?', ',', ':', '"', '!']
+
+
+def tag_words(words):
+    return st.tag(words)
+
+
+def is_a_proper_noun(phrase):
+    tagged_words = tag_words(phrase.split())
+    return any(tagged_word[1] in PROPER_NOUN_TAGS
+               for tagged_word in tagged_words)
+
+
+def get_punc_before(word):
+    punc = []
+    for char in word:
+        if char.isalpha():
+            return punc
+        if char in PUNCTUATION:
+            punc.append(char)
+
+
+def get_punc_after(word):
+    punc = []
+    for char in reversed(word):
+        if char.isalpha():
+            return punc
+        if char in PUNCTUATION:
+            punc.insert(0, char)
+
+
+def is_path(string):
+    try:
+        return Path(string).exists()
+    except OSError:
+        return False