Created
April 14, 2017 20:01
-
-
Save andybp85/17047e470ea2729923f5be539d10a424 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tag_question(question):
    """Attach matching topics to a question via stem + fuzzy matching.

    Tokenizes and POS-tags ``question.question_full``, keeps nouns,
    adverbs, verbs, and adjectives, and stems them.  Candidate ``Topic``
    rows are fetched with a Postgres trigram-similarity query, and a
    topic is appended to ``question.topics`` only when *every* one of
    its significant (stemmed) name words matches a question word.

    NOTE(review): mutates ``question.topics`` in place; caller is
    presumably responsible for committing the session — confirm.
    """
    # POS-tag prefixes to keep.  Membership is substring-based, so 'NN'
    # also matches NNS/NNP/NNPS, 'VB' matches VBD/VBG/VBN/VBP/VBZ, etc.
    search_tags = ('NN', 'RB', 'VB', 'JJ')
    stemmer = nltk.stem.SnowballStemmer('english')

    def _stem_significant(raw_text):
        # Stems of tokens whose POS tag matches one of search_tags.
        tagged = nltk.pos_tag(nltk.word_tokenize(raw_text.lower()))
        return {stemmer.stem(word) for word, tag in tagged
                if any(prefix in tag for prefix in search_tags)}

    def _fuzzy_match(target, candidates, threshold):
        # True if any candidate fuzzy-matches target at >= threshold AND
        # shares the same first letter (cheap false-positive filter).
        # (Replaces the Py2-only `len(filter(...))` idiom, which raises
        # TypeError on Python 3.)
        return any(fuzz.ratio(target, cand) >= threshold and
                   target[0] == cand[0]
                   for cand in candidates)

    lwords = _stem_significant(question.question_full)

    # Query only with words longer than 2 chars that aren't contraction
    # fragments (tokenization leaves endings such as "n't").
    query_words = [w for w in lwords if len(w) > 2 and "'" not in w]
    if not query_words:
        # Nothing usable to search on; avoid emitting a degenerate query.
        return

    # Parameterized query (no string-built SQL -> no injection risk),
    # matching any word with the pg_trgm similarity operator `%`.
    clause = text(" OR ".join(
        "name % :w{}".format(i) for i in range(len(query_words))
    )).bindparams(**{"w{}".format(i): w
                     for i, w in enumerate(query_words)})

    for topic in Topic.query.filter(clause).all():
        if topic in question.topics.all():
            continue  # already tagged
        twords = _stem_significant(topic.name)
        # Count one for each individual topic word we can match.
        found = 0
        for word in twords:
            if len(word) > 5:
                # Long words: accept a fuzzy match >= 90.
                if _fuzzy_match(word, lwords, 90):
                    found += 1
                # At >= 80, additionally require the topic's first
                # parent name to fuzzy-match at >= 90.
                elif (topic.parents.first() is not None and
                        _fuzzy_match(word, lwords, 80)):
                    parent_name = stemmer.stem(
                        topic.parents.first().name.lower())
                    if _fuzzy_match(parent_name, lwords, 90):
                        found += 1
            else:
                # Short words must match exactly.
                if word in lwords:
                    found += 1
        # Tag the topic only when every significant word matched.
        if len(twords) == found:
            question.topics.append(topic)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment