47 lines
994 B
Python
47 lines
994 B
Python
from pathlib import Path
|
|
|
|
from nltk.tag.stanford import StanfordNERTagger
|
|
|
|
st = StanfordNERTagger('/usr/local/bin/english.all.3class.distsim.crf.ser.gz',
|
|
'/usr/local/bin/stanford-ner.jar')
|
|
|
|
|
|
PROPER_NOUN_TAGS = ['ORGANIZATION', 'PERSON', 'LOCATION']
|
|
|
|
PUNCTUATION = ['.', '?', ',', ':', '"', '!']
|
|
|
|
|
|
def tag_words(words):
|
|
return st.tag(words)
|
|
|
|
|
|
def is_a_proper_noun(phrase):
|
|
tagged_words = tag_words(phrase.split())
|
|
return any(tagged_word[1] in PROPER_NOUN_TAGS
|
|
for tagged_word in tagged_words)
|
|
|
|
|
|
def get_punc_before(word):
|
|
punc = []
|
|
for char in word:
|
|
if char in PUNCTUATION:
|
|
punc.append(char)
|
|
else:
|
|
return punc
|
|
|
|
|
|
def get_punc_after(word):
|
|
punc = []
|
|
for char in reversed(word):
|
|
if char in PUNCTUATION:
|
|
punc.insert(0, char)
|
|
else:
|
|
return punc
|
|
|
|
|
|
def is_path(string):
|
|
try:
|
|
return Path(string).exists()
|
|
except OSError:
|
|
return False
|