@gerbal
Last active August 29, 2015 14:00
Simple Definition Extraction attempt
# ---- Part 1: find candidate definition sentences and hand-label them ----
import nltk.data
import nltk
from nltk import tokenize
from nltk import tag
from nltk import chunk
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import re
import cPickle as pickle
# An input file from the Stack Exchange Data Dump goes here; Posts.xml is the most interesting.
inputFile = open("input/Posts.xml", 'rb')
tree = ET.parse(inputFile)
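# For reference, rows in Posts.xml look roughly like this (attributes
# abridged; the sample values are illustrative, not from a real dump):
#   <row Id="1" PostTypeId="1" Body="&lt;p&gt;What is a monad?&lt;/p&gt;" />
#   <row Id="7" PostTypeId="2" ParentId="1" Body="&lt;p&gt;A monad is ...&lt;/p&gt;" />
# Questions carry no ParentId; answers point at their question through it.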
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# A RegEx to find things that look like URLs and remove them
re_URL = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
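# A quick sanity check of the pattern (my own illustrative example, not
# part of the original gist):
#   re_URL.sub("", "see http://example.com/docs for details")
# yields "see  for details" -- the bare URL is stripped, the text survives.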
# Colors so we can get better user input when we ask the user to select
# definitional sentence candidates
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
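# These are plain ANSI escape sequences; e.g.
#   print color.BOLD + "term" + color.END
# renders "term" in bold on any ANSI-capable terminal.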
# Not the best method of definition sentence identification, but it's very simple to implement
patternList = [
    "defined as",
    "defined by",
    "define",
    "defines",
    "definition of",
    "a definition",
    "the definition",
    "comprise",
    "comprises",
    "denote",
    "denotes",
    "designate",
    "designates",
    "called",
    "known as"
]
# Massively reduces false positives, improving precision, but hurting recall
# Is it worth it? I don't know
excludeList = [
    "so-called",
    "undefined",
    "called with",
    "predefined"
]
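# Why the exclusions matter: matching below is plain substring containment,
# so "called" would fire on "so-called" and "define" on "predefined".
# A quick illustrative check (my addition, not in the original gist):
candidate = "this is a so-called definition"
assert any(pattern in candidate for pattern in patternList)   # "called" hits
assert any(pattern in candidate for pattern in excludeList)   # "so-called" excludes it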
# Extract all questions, answers, and comments from 'Posts.xml' and put them in a dictionary.
temporaryDictionary = {}
for row in tree.findall('row'):
    if not row.get("ParentId"):
        # Questions carry no ParentId; key them by their own Id
        temporaryDictionary[row.get("Id")] = [
            nltk.clean_html(row.get("Body")).lower()]
    elif row.get("ParentId") in temporaryDictionary:
        # Answers point back at their question via ParentId
        temporaryDictionary[row.get("ParentId")].append(
            nltk.clean_html(row.get("Body")).lower())
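# After this loop temporaryDictionary maps each question Id to the list of
# cleaned, lowercased bodies in its thread, e.g. (made-up illustration):
#   {'42': ['what is a monad?', 'a monad is a monoid in ...']}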
# And now we're going to undo all our work with the dictionary and chunk all of that text into a big list of sentences
sentences = []
for item in temporaryDictionary:
    tmpSentence = ""
    for definition in temporaryDictionary[item]:
        tmpSentence = tmpSentence + " " + re_URL.sub("", definition)
    sentences.extend(tokenizer.tokenize(tmpSentence))
# sentences = []
# for row in tree.findall('row'):
#     string = nltk.clean_html(row.get("Body")).lower()
#     sentences.extend(tokenizer.tokenize(string))
def highlight_words(sentence, words):
    '''
    Highlight any of our desired words in a string of text. This is
    why we have the color class up at the top.
    '''
    newsentence = sentence
    for pattern in words:
        if pattern in sentence:
            newsentence = re.sub(
                pattern, color.UNDERLINE + color.BOLD + pattern + color.END,
                newsentence)
    return newsentence
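# Example usage (illustrative):
#   print highlight_words("recursion is defined as ...", patternList)
# prints the sentence with every matched cue bolded and underlined. Note
# that overlapping cues ("define" inside "defined as") can each fire, so
# the ANSI codes may end up nested.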
# Go through the list of sentences, and if they have any of the patterns in them
# ask the user if they are actual definitional sentences
# It occurs to me now that I don't have a good standard for what constitutes definitional
definitions = []
for sentence in sentences:
    # print sentence
    if (any(pattern in sentence for pattern in patternList)
            and not any(pattern in sentence for pattern in excludeList)):
        print "\n-\n-\n-\n-\n-\n"
        print highlight_words(sentence, patternList)
        is_definition = raw_input(
            "\n------\nDoes this look like a definition? (y/n):\n------\n")
        if is_definition == "y":
            definitions.append(sentence)
# And write out that list of sentences to a file, using one of the weirdest-named python libraries
pickle.dump(definitions, open("output/training_defintions.p", "wb"))

# ---- Part 2: POS-tag and NE-chunk the sentences collected in part 1 ----
import nltk.data
import nltk
from nltk import tokenize
from nltk import tag
from nltk import chunk
import re
import cPickle as pickle
# Open the sentences identified in part 1
sentences = pickle.load(open("output/training_defintions.p", "rb"))
tree = []
# Tokenize them and tag them, nothing too complicated
for sentence in sentences:
    # encode() takes positional arguments in Python 2 (keywords fail)
    chunked = tokenize.word_tokenize(
        sentence.encode('UTF-8', 'replace'))
    tagged_sent = tag.pos_tag(chunked)
    tree.append(chunk.ne_chunk(tagged_sent))
pickle.dump(tree, open("output/tagged_sents.p", "wb"))
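# Shape of one stored entry (a sketch; exact tags depend on the tagger):
# for "a stack is called a lifo structure", tag.pos_tag gives pairs like
#   [('a', 'DT'), ('stack', 'NN'), ('is', 'VBZ'), ('called', 'VBN'), ...]
# and chunk.ne_chunk wraps them in a Tree('S', [...]). Named-entity
# subtrees are rare here since everything was lowercased in part 1.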
# Print it all out, for debugging and to be stared at in confusion
for line in tree:
    print line

# ---- Part 3: try a chunk grammar over the tagged definition sentences ----
import nltk
import cPickle as pickle
import pprint
# Attempting to use a grammar model to find patterns in definitional sentences
grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
    PP: {<IN><NP>}               # Chunk prepositions followed by NP
    VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
    CLAUSE: {<NP><VP>}           # Chunk NP, VP
    """
tree = pickle.load(open("output/tagged_sents.p", "rb"))
cp = nltk.RegexpParser(grammar)
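# A sketch of what the cascade does to one tagged sentence (assuming
# standard Penn Treebank tags):
#   cp.parse([('a', 'DT'), ('stack', 'NN'), ('is', 'VBZ'),
#             ('a', 'DT'), ('structure', 'NN')])
# chunks 'a stack' and 'a structure' as NPs, folds 'is a structure' into a
# VP, and finally wraps NP + VP into a CLAUSE.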
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
patternList = [
    "defined as",
    "defined by",
    "define",
    "defines",
    "definition of",
    "a definition",
    "the definition",
    "comprise",
    "comprises",
    "denote",
    "denotes",
    "designate",
    "designates",
    "called",
    "known as"
]
docs = []
def checkList(word, pos, wordlist):
    '''
    Check whether a word is one of our desired words and, if so, swap in
    its highlighted part-of-speech tag. This function exists because I was
    trying to identify common patterns in definitional sentences; I ran
    out of time before I could finish.
    '''
    if any(pattern == word for pattern in wordlist):
        return color.BOLD + color.UNDERLINE + pos + color.END
    else:
        return word
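# Example (illustrative): checkList("called", "VBN", patternList) returns
# the highlighted tag "VBN", while checkList("stack", "NN", patternList)
# returns "stack" unchanged. Because the test is exact equality on single
# tokens, multi-word cues like "defined as" can never match here; only the
# one-word entries ever get swapped for their POS tag.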
treed_sents = []
# I'm not sure what this is all doing at this point. The goal of this code
# is to help identify part-of-speech patterns, but I got sidetracked by
# word-lattices before I could finish it. Probably just needed to get a
# giant test set and try to use statistics to determine what traits (or
# clusters of traits, a la WCL) are common to definitional sentences.
for sent in tree:
    # print sent
    # print sent.leaves()
    # print sent.subtrees
    treed_sents.append(cp.parse(sent))
    for subtree in sent.subtrees():
        # print " ".join([b for (a, b) in subtree.leaves()])
        # print subtree
        docs.append(" ".join([checkList(a, b, patternList)
                              for (a, b) in subtree.leaves()]))
        test_tree = cp.parse(subtree)
        # print test_tree
        # for tiny_tree in test_tree.subtrees():
        #     if tiny_tree.node == 'NP':
        #         print tiny_tree
for line in docs:
    print line
    # tree = cp.parse(line)
    # for subtree in tree.subtrees():
    #     if subtree.node == 'CHUNK':
    #         print subtree
for struct in treed_sents:
    print struct.pprint()