Updated tests to the current converter format; updated the static/non-static decorators on the base class to match its children; added GoogleConverter; and moved one or two methods to the base class because they all work the same way.
This commit is contained in:
46
transcript_processing/helpers.py
Normal file
46
transcript_processing/helpers.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from pathlib import Path
|
||||
|
||||
from nltk.tag.stanford import StanfordNERTagger
|
||||
|
||||
# Module-level Stanford NER tagger used by tag_words(). It is constructed at
# import time from the hard-coded classifier model and jar paths below.
st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
                       '/usr/local/bin/stanford-ner.jar')
|
||||
|
||||
|
||||
# Tagger labels that is_a_proper_noun() counts as marking a proper noun.
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
||||
|
||||
# Characters that get_punc_before() / get_punc_after() collect as punctuation.
PUNCTUATION = ['.', '?', ',', ':', '"', '!']
|
||||
|
||||
|
||||
def tag_words(words):
    """Run the module-level tagger ``st`` over ``words`` and return its output.

    Args:
        words: The tokens to tag; is_a_proper_noun() passes the result of
            ``str.split()``.

    Returns:
        Whatever ``st.tag(words)`` returns. Callers in this module index
        each element with ``[1]`` to read its label.
    """
    tagged = st.tag(words)
    return tagged
|
||||
|
||||
|
||||
def is_a_proper_noun(phrase):
    """Tell whether any word of ``phrase`` is tagged with a proper-noun label.

    The phrase is split on whitespace, the words are passed to tag_words(),
    and the second item of each tagged entry is checked against
    ``PROPER_NOUN_TAGS``.

    Args:
        phrase: The text to inspect.

    Returns:
        True as soon as one tagged entry carries a label listed in
        ``PROPER_NOUN_TAGS``; False when none does.
    """
    for entry in tag_words(phrase.split()):
        label = entry[1]
        if label in PROPER_NOUN_TAGS:
            return True
    return False
|
||||
|
||||
|
||||
def get_punc_before(word):
    """Collect the punctuation that precedes the first letter of ``word``.

    Characters are scanned left to right and the scan stops at the first
    alphabetic character. Characters that are neither alphabetic nor listed
    in ``PUNCTUATION`` (digits, for example) are skipped without stopping
    the scan.

    Args:
        word: The string to scan.

    Returns:
        A list of the ``PUNCTUATION`` characters seen before the first
        alphabetic character, in their original order. The list is empty
        when there are none, including for an empty ``word``.
    """
    punc = []
    for char in word:
        if char.isalpha():
            break
        if char in PUNCTUATION:
            punc.append(char)
    # A word with no letters at all (or an empty word) falls out of the loop;
    # it must still yield a list rather than an implicit None.
    return punc
|
||||
|
||||
|
||||
def get_punc_after(word):
    """Collect the punctuation that follows the last letter of ``word``.

    Characters are scanned right to left and the scan stops at the first
    alphabetic character met. Characters that are neither alphabetic nor
    listed in ``PUNCTUATION`` (digits, for example) are skipped without
    stopping the scan.

    Args:
        word: The string to scan.

    Returns:
        A list of the ``PUNCTUATION`` characters found after the last
        alphabetic character, in the order they appear in ``word``. The
        list is empty when there are none, including for an empty ``word``.
    """
    punc = []
    for char in reversed(word):
        if char.isalpha():
            break
        if char in PUNCTUATION:
            punc.append(char)
    # Characters were gathered back to front; restore reading order once
    # instead of inserting at the head on every step.
    punc.reverse()
    # Reached for letter-free and empty words too, so the result is always a
    # list rather than an implicit None.
    return punc
|
||||
|
||||
|
||||
def is_path(string):
    """Report whether ``string`` names an existing filesystem entry.

    Args:
        string: Candidate path text.

    Returns:
        True when ``string`` is not the empty string and
        ``Path(string).exists()`` is true. False otherwise, including when
        the lookup raises ``OSError`` or ``ValueError``.
    """
    # Path('') normalises to Path('.'), which always exists; an empty string
    # must not be reported as a path.
    if string == '':
        return False
    try:
        return Path(string).exists()
    except (OSError, ValueError):
        # OSError: e.g. the text is too long to be a file name.
        # ValueError: e.g. an embedded null byte on Python versions where
        # exists() lets that error propagate.
        return False
|
||||
Reference in New Issue
Block a user