Skip to content

Instantly share code, notes, and snippets.

@andybp85
Created April 14, 2017 20:01
Show Gist options
  • Save andybp85/17047e470ea2729923f5be539d10a424 to your computer and use it in GitHub Desktop.
Save andybp85/17047e470ea2729923f5be539d10a424 to your computer and use it in GitHub Desktop.
def _stem_content_words(sentence, stemmer, tag_fragments):
    """Return the set of stemmed content words found in ``sentence``.

    The sentence is lower-cased, tokenized and POS-tagged with NLTK. A token
    is kept when its tag contains any entry of ``tag_fragments`` as a
    substring, and the kept tokens are stemmed with ``stemmer``.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence.lower()))
    return {
        stemmer.stem(token)
        for token, tag in tagged
        if any(fragment in tag for fragment in tag_fragments)
    }


def _has_fuzzy_match(word, candidates, threshold):
    """Tell whether any candidate is a close fuzzy match for ``word``.

    A candidate matches when ``fuzz.ratio`` is at least ``threshold`` and the
    candidate begins with the same character as ``word``.
    """
    # any() instead of len(filter(...)): filter() returns an iterator on
    # Python 3, so taking its len() raises TypeError there.
    return any(
        fuzz.ratio(word, candidate) >= threshold and word[0] == candidate[0]
        for candidate in candidates
    )


def tag_question(question):
    """Attach to ``question`` every Topic whose name it fully mentions.

    Content words (nouns, adverbs, verbs, adjectives) are extracted from
    ``question.question_full`` and stemmed. Candidate topics are fetched with
    the SQL ``%`` operator on ``name`` (presumably pg_trgm similarity -
    confirm against the database setup). A candidate is appended to
    ``question.topics`` when every content word of its name is found among
    the question's words:

    * words longer than 5 characters match fuzzily (ratio >= 90 and same
      first letter), or at ratio >= 80 if the topic's first parent name
      also matches the question at ratio >= 90;
    * shorter words must match exactly.

    Args:
        question: object exposing ``question_full`` (the text) and
            ``topics`` (supporting ``.all()`` and ``.append()``).

    Returns:
        None. ``question.topics`` is modified in place.
    """
    # NN noun, singular 'desk'
    # NNS noun plural 'desks'
    # NNP proper noun, singular 'Harrison'
    # NNPS proper noun, plural 'Americans'
    # RB adverb very, silently,
    # RBR adverb, comparative better
    # RBS adverb, superlative best
    # VB verb, base form take
    # VBD verb, past tense took
    # VBG verb, gerund/present participle taking
    # VBN verb, past participle taken
    # VBP verb, sing. present, non-3d take
    # VBZ verb, 3rd person sing. present takes
    # JJ adjective 'big'
    # JJR adjective, comparative 'bigger'
    # JJS adjective, superlative 'biggest'
    # These fragments match all of the above. Matching is by substring
    # containment, so 'RB' also matches a tag such as 'WRB'.
    search_tags = ['NN', 'RB', 'VB', 'JJ']

    stemmer = nltk.stem.SnowballStemmer('english')
    question_words = _stem_content_words(
        question.question_full, stemmer, search_tags)

    # Only words of 3 or more characters that are not contraction endings
    # (e.g. "n't") are used for the database search.
    search_words = [
        w for w in question_words if len(w) > 2 and "'" not in w]
    if not search_words:
        # Nothing to search for, so no topic can match.
        return

    # Bind the words as parameters rather than splicing them into the SQL
    # string: they come from the question text and must not be trusted.
    keys = ["word_%d" % i for i in range(len(search_words))]
    condition = text(
        " OR ".join("name % :" + key for key in keys)
    ).bindparams(**dict(zip(keys, search_words)))

    # Fetched once: topics appended below are never candidates again, so the
    # per-iteration re-query was redundant.
    already_tagged = question.topics.all()

    for topic in Topic.query.filter(condition).all():
        if topic in already_tagged:
            continue

        topic_words = _stem_content_words(topic.name, stemmer, search_tags)
        if not topic_words:
            # A name without content words would otherwise match vacuously
            # (0 found == 0 words) and be attached to the question.
            continue

        # Count 1 for each individual topic word that is found.
        found = 0
        for word in topic_words:
            if len(word) > 5:
                if _has_fuzzy_match(word, question_words, 90):
                    found += 1
                    continue
                # Weaker match: accept it only if the first parent topic's
                # name is also present in the question.
                parent = topic.parents.first()
                if parent is not None and _has_fuzzy_match(
                        word, question_words, 80):
                    parent_name = stemmer.stem(parent.name.lower())
                    if _has_fuzzy_match(parent_name, question_words, 90):
                        found += 1
            elif word in question_words:
                # Short words must match exactly.
                found += 1

        # Add the topic only when every one of its words was found.
        if found == len(topic_words):
            question.topics.append(topic)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment