Created
April 14, 2017 20:01
-
-
Save andybp85/17047e470ea2729923f5be539d10a424 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tag_question(question):
    """Attach matching topics to a question via stem + fuzzy matching.

    Tokenizes and POS-tags ``question.question_full``, keeps nouns,
    adverbs, verbs, and adjectives, and stems them.  Candidate ``Topic``
    rows are fetched with a Postgres trigram-similarity query, and a
    topic is appended to ``question.topics`` only when *every* one of
    its significant (stemmed) name words matches a question word.

    NOTE(review): mutates ``question.topics`` in place; caller is
    presumably responsible for committing the session — confirm.
    """
    # POS-tag prefixes to keep.  Membership is substring-based, so 'NN'
    # also matches NNS/NNP/NNPS, 'VB' matches VBD/VBG/VBN/VBP/VBZ, etc.
    search_tags = ('NN', 'RB', 'VB', 'JJ')
    stemmer = nltk.stem.SnowballStemmer('english')

    def _stem_significant(raw_text):
        # Stems of tokens whose POS tag matches one of search_tags.
        tagged = nltk.pos_tag(nltk.word_tokenize(raw_text.lower()))
        return {stemmer.stem(word) for word, tag in tagged
                if any(prefix in tag for prefix in search_tags)}

    def _fuzzy_match(target, candidates, threshold):
        # True if any candidate fuzzy-matches target at >= threshold AND
        # shares the same first letter (cheap false-positive filter).
        # (Replaces the Py2-only `len(filter(...))` idiom, which raises
        # TypeError on Python 3.)
        return any(fuzz.ratio(target, cand) >= threshold and
                   target[0] == cand[0]
                   for cand in candidates)

    lwords = _stem_significant(question.question_full)

    # Query only with words longer than 2 chars that aren't contraction
    # fragments (tokenization leaves endings such as "n't").
    query_words = [w for w in lwords if len(w) > 2 and "'" not in w]
    if not query_words:
        # Nothing usable to search on; avoid emitting a degenerate query.
        return

    # Parameterized query (no string-built SQL -> no injection risk),
    # matching any word with the pg_trgm similarity operator `%`.
    clause = text(" OR ".join(
        "name % :w{}".format(i) for i in range(len(query_words))
    )).bindparams(**{"w{}".format(i): w
                     for i, w in enumerate(query_words)})

    for topic in Topic.query.filter(clause).all():
        if topic in question.topics.all():
            continue  # already tagged
        twords = _stem_significant(topic.name)
        # Count one for each individual topic word we can match.
        found = 0
        for word in twords:
            if len(word) > 5:
                # Long words: accept a fuzzy match >= 90.
                if _fuzzy_match(word, lwords, 90):
                    found += 1
                # At >= 80, additionally require the topic's first
                # parent name to fuzzy-match at >= 90.
                elif (topic.parents.first() is not None and
                        _fuzzy_match(word, lwords, 80)):
                    parent_name = stemmer.stem(
                        topic.parents.first().name.lower())
                    if _fuzzy_match(parent_name, lwords, 90):
                        found += 1
            else:
                # Short words must match exactly.
                if word in lwords:
                    found += 1
        # Tag the topic only when every significant word matched.
        if len(twords) == found:
            question.topics.append(topic)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment