Created
December 5, 2011 16:03
-
-
Save webcracy/1434087 to your computer and use it in GitHub Desktop.
Manybots - Python Natural Language Parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import pos_tag, word_tokenize | |
from en import verb # from http://nodebox.net/code/index.php/Linguistics | |
import re, pprint | |
from nltk.probability import ConditionalFreqDist | |
import os | |
def ie_preprocess(sentence):
    """Tokenize *sentence* and part-of-speech tag the resulting tokens.

    The raw string is passed through ``word_tokenize`` and the token list is
    then handed to ``pos_tag``; whatever ``pos_tag`` returns is returned
    unchanged. Callers in this file unpack the result as ``(word, tag)`` pairs.
    """
    return pos_tag(word_tokenize(sentence))
def findtags(tag_prefix, tagged_text):
    """Group the words of a tagged text under every tag starting with a prefix.

    tag_prefix  -- string that a tag must start with to be considered.
    tagged_text -- iterable of ``(word, tag)`` pairs.

    Returns a dict mapping each matching tag to at most five of the words
    that carry it (the first five entries of that tag's ``keys()``).
    The dict is empty when no tag matches.
    """
    # Flip each pair to (tag, word) so that the tag becomes the condition.
    matching_pairs = [
        (pos, token)
        for (token, pos) in tagged_text
        if pos.startswith(tag_prefix)
    ]
    distribution = ConditionalFreqDist(matching_pairs)

    words_by_tag = {}
    for pos in distribution.conditions():
        # NOTE(review): slicing keys() needs it to return a list (Python 2);
        # the order of the entries is presumably by frequency -- confirm
        # against the NLTK version in use.
        words_by_tag[pos] = distribution[pos].keys()[:5]
    return words_by_tag
def _first_tagged_word(tag, tagged_text):
    """Return the first word findtags() reports for exactly *tag*, or None."""
    # findtags() matches tags by prefix, so the exact key may be missing even
    # when the returned dict is not empty (e.g. asking for 'NN', getting 'NNP').
    words = findtags(tag, tagged_text).get(tag)
    return words[0] if words else None


def parse_sentence(input_sentence):
    """Extract verb, object, target and tags from a simple English sentence.

    The input is split on the literal ``' tagged with '``: the first piece is
    the sentence that gets analysed, and every remaining piece is kept
    verbatim as a tag.

    Returns a dict with these keys:

    ``'objects'`` (always present)
        The first plural noun (``NNS``) reported by ``findtags``, lower-cased
        and with trailing ``s`` characters stripped; otherwise the first
        singular noun (``NN``); otherwise ``''`` when neither is found.
    ``'verbs'`` (only when a ``VBD`` word is found)
        ``verb.infinitive()`` of the first ``VBD`` word.
    ``'target_values'`` (only when an ``IN`` or, failing that, ``TO`` word is
    found)
        The stripped text that follows the last whole-word occurrence of that
        word in the sentence. If the word cannot be located in the sentence,
        the whole sentence is used.
    ``'tags'`` (only when at least one tag was given)
        The list of tag strings.
    """
    pieces = input_sentence.split(' tagged with ')
    # The sentence proper comes first; whatever is left are the tags.
    sentence = pieces.pop(0)
    tags = pieces

    tuples = ie_preprocess(sentence)
    result = {}

    # Verb: the infinitive of the first past-tense verb, if there is one.
    past_verb = _first_tagged_word('VBD', tuples)
    if past_verb is not None:
        result['verbs'] = verb.infinitive(past_verb)

    # Object: prefer a plural noun (crudely singularised), then a singular
    # noun. Sentences with neither yield '' instead of raising KeyError.
    plural_noun = _first_tagged_word('NNS', tuples)
    if plural_noun is not None:
        the_object = plural_noun.lower().rstrip('s')
    else:
        the_object = _first_tagged_word('NN', tuples) or ''
    result['objects'] = the_object

    # Target: a preposition (IN), falling back to 'to' (TO).
    target = _first_tagged_word('IN', tuples) or _first_tagged_word('TO', tuples)
    if target:
        # Everything after the target word is the target value. Match whole
        # words only: a plain str.split() would also cut inside longer words
        # (e.g. 'in' inside 'Beijing').
        whole_word = r'(?<!\w)%s(?!\w)' % re.escape(target)
        target_value = re.split(whole_word, sentence)[-1]
        result['target_values'] = target_value.strip()

    if tags:
        result['tags'] = tags
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment