@gerbal
Last active August 29, 2015 14:00
Simple Definition Extraction attempt
# ---- Part 1: find candidate definition sentences and hand-label them ----
import nltk.data
import nltk
from nltk import tokenize
from nltk import tag
from nltk import chunk
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import re
import cPickle as pickle
# An input file from the Stack Exchange Data Dump goes here; Posts.xml is the most interesting.
inputFile = open("input/Posts.xml", 'rb')
tree = ET.parse(inputFile)
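# For reference, rows in Posts.xml look roughly like this (attributes
# abridged; the sample values are illustrative, not from a real dump):
#   <row Id="1" PostTypeId="1" Body="&lt;p&gt;What is a monad?&lt;/p&gt;" />
#   <row Id="7" PostTypeId="2" ParentId="1" Body="&lt;p&gt;A monad is ...&lt;/p&gt;" />
# Questions carry no ParentId; answers point at their question through it.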
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# A RegEx to find things that look like URLs and remove them
re_URL = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
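# A quick sanity check of the pattern (my own illustrative example, not
# part of the original gist):
#   re_URL.sub("", "see http://example.com/docs for details")
# yields "see  for details" -- the bare URL is stripped, the text survives.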
# Colors so we can get better user input when we ask the user to select
# definitional sentence candidates
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
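# These are plain ANSI escape sequences; e.g.
#   print color.BOLD + "term" + color.END
# renders "term" in bold on any ANSI-capable terminal.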
# Not the best method of definition sentence identification, but it's very simple to implement
patternList = [
    "defined as",
    "defined by",
    "define",
    "defines",
    "definition of",
    "a definition",
    "the definition",
    "comprise",
    "comprises",
    "denote",
    "denotes",
    "designate",
    "designates",
    "called",
    "known as"
]
# Massively reduces false positives, improving precision, but hurting recall
# Is it worth it? I don't know
excludeList = [
    "so-called",
    "undefined",
    "called with",
    "predefined"
]
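# Why the exclusions matter: matching below is plain substring containment,
# so "called" would fire on "so-called" and "define" on "predefined".
# A quick illustrative check (my addition, not in the original gist):
candidate = "this is a so-called definition"
assert any(pattern in candidate for pattern in patternList)   # "called" hits
assert any(pattern in candidate for pattern in excludeList)   # "so-called" excludes it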
# Extract all questions, answers, and comments from 'Posts.xml' and put them in a dictionary.
temporaryDictionary = {}
for row in tree.findall('row'):
    if not row.get("ParentId"):
        # Questions carry no ParentId; key them by their own Id
        temporaryDictionary[row.get("Id")] = [
            nltk.clean_html(row.get("Body")).lower()]
    elif row.get("ParentId") in temporaryDictionary:
        # Answers point back at their question via ParentId
        temporaryDictionary[row.get("ParentId")].append(
            nltk.clean_html(row.get("Body")).lower())
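# After this loop temporaryDictionary maps each question Id to the list of
# cleaned, lowercased bodies in its thread, e.g. (made-up illustration):
#   {'42': ['what is a monad?', 'a monad is a monoid in ...']}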
# And now we're going to undo all our work with the dictionary and chunk all of that text into a big list of sentences
sentences = []
for item in temporaryDictionary:
    tmpSentence = ""
    for definition in temporaryDictionary[item]:
        tmpSentence = tmpSentence + " " + re_URL.sub("", definition)
    sentences.extend(tokenizer.tokenize(tmpSentence))
# sentences = []
# for row in tree.findall('row'):
#     string = nltk.clean_html(row.get("Body")).lower()
#     sentences.extend(tokenizer.tokenize(string))
def highlight_words(sentence, words):
    '''
    Highlight any of our desired words in a string of text. This is
    why we have the color class up at the top.
    '''
    newsentence = sentence
    for pattern in words:
        if pattern in sentence:
            newsentence = re.sub(
                pattern, color.UNDERLINE + color.BOLD + pattern + color.END,
                newsentence)
    return newsentence
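# Example usage (illustrative):
#   print highlight_words("recursion is defined as ...", patternList)
# prints the sentence with every matched cue bolded and underlined. Note
# that overlapping cues ("define" inside "defined as") can each fire, so
# the ANSI codes may end up nested.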
# Go through the list of sentences, and if they have any of the patterns in them
# ask the user if they are actual definitional sentences
# It occurs to me now that I don't have a good standard for what constitutes definitional
definitions = []
for sentence in sentences:
    # print sentence
    if (any(pattern in sentence for pattern in patternList)
            and not any(pattern in sentence for pattern in excludeList)):
        print "\n-\n-\n-\n-\n-\n"
        print highlight_words(sentence, patternList)
        is_definition = raw_input(
            "\n------\nDoes this look like a definition? (y/n):\n------\n")
        if is_definition == "y":
            definitions.append(sentence)
# And write out that list of sentences to a file, using one of the weirdest-named python libraries
pickle.dump(definitions, open("output/training_defintions.p", "wb"))

# ---- Part 2: POS-tag and NE-chunk the sentences collected in part 1 ----
import nltk.data
import nltk
from nltk import tokenize
from nltk import tag
from nltk import chunk
import re
import cPickle as pickle
# Open the sentences identified in part 1
sentences = pickle.load(open("output/training_defintions.p", "rb"))
tree = []
# Tokenize them and tag them, nothing too complicated
for sentence in sentences:
    # encode() takes positional arguments in Python 2 (keywords fail)
    chunked = tokenize.word_tokenize(
        sentence.encode('UTF-8', 'replace'))
    tagged_sent = tag.pos_tag(chunked)
    tree.append(chunk.ne_chunk(tagged_sent))
pickle.dump(tree, open("output/tagged_sents.p", "wb"))
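# Shape of one stored entry (a sketch; exact tags depend on the tagger):
# for "a stack is called a lifo structure", tag.pos_tag gives pairs like
#   [('a', 'DT'), ('stack', 'NN'), ('is', 'VBZ'), ('called', 'VBN'), ...]
# and chunk.ne_chunk wraps them in a Tree('S', [...]). Named-entity
# subtrees are rare here since everything was lowercased in part 1.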
# Print it all out, for debugging and to be stared at in confusion
for line in tree:
    print line

# ---- Part 3: try a chunk grammar over the tagged definition sentences ----
import nltk
import cPickle as pickle
import pprint
# Attempting to use a grammar model to find patterns in definitional sentences
grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
    PP: {<IN><NP>}               # Chunk prepositions followed by NP
    VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
    CLAUSE: {<NP><VP>}           # Chunk NP, VP
    """
tree = pickle.load(open("output/tagged_sents.p", "rb"))
cp = nltk.RegexpParser(grammar)
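# A sketch of what the cascade does to one tagged sentence (assuming
# standard Penn Treebank tags):
#   cp.parse([('a', 'DT'), ('stack', 'NN'), ('is', 'VBZ'),
#             ('a', 'DT'), ('structure', 'NN')])
# chunks 'a stack' and 'a structure' as NPs, folds 'is a structure' into a
# VP, and finally wraps NP + VP into a CLAUSE.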
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
patternList = [
    "defined as",
    "defined by",
    "define",
    "defines",
    "definition of",
    "a definition",
    "the definition",
    "comprise",
    "comprises",
    "denote",
    "denotes",
    "designate",
    "designates",
    "called",
    "known as"
]
docs = []
def checkList(word, pos, wordlist):
    '''
    Check whether a word is one of our desired words and, if so, swap in
    its highlighted part-of-speech tag. This function exists because I was
    trying to identify common patterns in definitional sentences; I ran
    out of time before I could finish.
    '''
    if any(pattern == word for pattern in wordlist):
        return color.BOLD + color.UNDERLINE + pos + color.END
    else:
        return word
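# Example (illustrative): checkList("called", "VBN", patternList) returns
# the highlighted tag "VBN", while checkList("stack", "NN", patternList)
# returns "stack" unchanged. Because the test is exact equality on single
# tokens, multi-word cues like "defined as" can never match here; only the
# one-word entries ever get swapped for their POS tag.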
treed_sents = []
# I'm not sure what this is all doing at this point. The goal of this code
# is to help identify part-of-speech patterns, but I got sidetracked by
# word-lattices before I could finish it. Probably just needed to get a
# giant test set and try to use statistics to determine what traits (or
# clusters of traits, a la WCL) are common to definitional sentences.
for sent in tree:
    # print sent
    # print sent.leaves()
    # print sent.subtrees
    treed_sents.append(cp.parse(sent))
    for subtree in sent.subtrees():
        # print " ".join([b for (a, b) in subtree.leaves()])
        # print subtree
        docs.append(" ".join([checkList(a, b, patternList)
                              for (a, b) in subtree.leaves()]))
        test_tree = cp.parse(subtree)
        # print test_tree
        # for tiny_tree in test_tree.subtrees():
        #     if tiny_tree.node == 'NP':
        #         print tiny_tree
for line in docs:
    print line
    # tree = cp.parse(line)
    # for subtree in tree.subtrees():
    #     if subtree.node == 'CHUNK':
    #         print subtree
for struct in treed_sents:
    print struct.pprint()