Created November 9, 2014, 19:36.
Save MissCatLady/e0fbe993f4dbb4adde25 to your computer and use it in GitHub Desktop.
Hyponym/Hypernym Pairs (using NLTK)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Components that you need for Assignment 4 in CSC2501/485, Fall 2010.
Usage:
from Asst4 import nyt_big ## the full NYT corpus for A4
from Asst4 import nyt_mini ## only the first 100K lines from nyt_big, for
## development
from Asst4 import wn17 ## WordNet 1.7, newer versions won't work for Q2
from Asst4 import DefaultNpChunker ## a simple NP chunker to get things started
"""
# T(he original version of t)his code was written by Ulrich Germann (11/2010)
# It provides components necessary for Assignment 4 of CSC 485/2501h
######################################################################
#
# ATTENTION!
#
# THE INFORMATION BELOW IS ***CRUCIAL*** FOR YOUR SUCCESS IN Q2
#
# Question 2 of A4 works only with Wordnet 1.7 or earlier
#
# QUESTION 2 WILL NOT WORK WITH THE NLTK DEFAULT WORDNET INSTALLATION!
#
# We have installed version 1.7 in the following location
# /u/csc2501h/include/a4/nltk/corpora
#
# To use it we need to prepend the following directory to the
# NLTK search path for corpora and other data
import nltk
# Prepend (not append) so this copy of the data shadows any default
# NLTK installation.
# NOTE(review): this points at a user-local copy, not the course path
# named above (/u/csc2501h/include/a4/nltk) -- confirm that this
# directory actually holds the WordNet 1.7 data Q2 requires.
nltk.data.path[0:0] = ['/Users/Lia/Documents/CSC485/A4/nltk']
# Now we can import wordnet 1.7 instead of later versions
from nltk.corpus import wordnet as wn17
# The following code provides access to the tagged NY Times corpus
# nyt_big is the full corpus
# nyt_mini a small subset for development
from nltk.data import ZipFilePathPointer
from nltk.corpus import TaggedCorpusReader
# The corpus files live inside a zip archive; ZipFilePathPointer lets
# the readers address files within it without unpacking.
nyt_zipped = ZipFilePathPointer('/Users/Lia/Documents/CSC485/A4/nltk/nyt.zip')
# sep='/' because tokens in this corpus are tagged as word/TAG
nyt_big = TaggedCorpusReader(nyt_zipped,'nyt/2004-tagged.txt',sep='/')
nyt_mini = TaggedCorpusReader(nyt_zipped,'nyt/nytimes-mini.txt',sep='/')
# Finally, a default pattern for NP chunking: an optional
# determiner/article (with optional adverb), any run of adjectives or
# cardinals (optionally comma-separated), then one or more nouns.
# Setting up the NP chunker itself is left to the main script, to
# encourage trying different variants of the pattern.
DefaultNpPattern = (r'(<DT|AT>?<RB>?)?'
                    r'<JJ.*|CD.*>*'
                    r'(<JJ.*|CD.*><,>)*'
                    r'(<N.*>)+')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.corpus import wordnet as wn | |
'''
CSC418 2014 A4 Part 1: Hypernym Relations
Methods for evaluation of cases for each confidence level are found here.
In Main:
Read output data from file containing low, med, and high confidence level
data. Data is an array containing arrays in the format of [hyponym, hypernym, int]
where int describes the number of additional times it has been seen.
Time: <1min (for nyt_big data)'''
# NOTE(review): the header says CSC418 but the companion setup file
# refers to CSC485/2501 -- presumably a typo; confirm.
RUNMAIN = 1             # set to 0 to import this module without running main
FILENAME = "big_low.txt"  # input file: a Python-literal list of [hyponym, hypernym, count]
ENTRIES = 50            # max number of entries written per output file
# confidence level dictionaries whose keys indicate the case (1-4,
# as assigned by checkRelation)
low = {1:[], 2:[], 3:[], 4:[]}
med = {1:[], 2:[], 3:[], 4:[]}
high = {1:[], 2:[], 3:[], 4:[]}
def dictToFile(dictionary, case, filename, description, entries):
    '''Write up to `entries` items from dictionary[case] to a file.

    dictionary -- a confidence-level dict mapping case number -> list
    case -- int key selecting which case's list to dump
    filename -- str path of the output file (created/truncated)
    description -- str header written as the first line
    entries -- int cap on the number of items written

    Each item is written as its str() form, one per line.
    FIXES: the original opened the file in 'w+' (read/write) for a
    write-only dump and never closed it on an exception; the manual
    countdown loop is replaced by a slice.
    '''
    with open(filename, 'w') as fp:
        fp.write(description + "\n")
        for item in dictionary[case][:max(entries, 0)]:
            fp.write(str(item) + "\n")
def addToDict(pairAndCount):
    '''Classify one [hyponym, hypernym, count] entry and file it.

    getCase determines the WordNet relation case (1-4); the count
    selects the confidence-level dictionary: 0 -> low, 1 -> med,
    >= 2 -> high.  High-confidence entries also record the count.

    BUG FIX: the original appended pairAndCount[-1:-1] -- an empty
    slice -- so the count was silently dropped for high-confidence
    entries despite the comment saying it should be kept; [-1:] keeps
    the final element as intended.
    '''
    # case is [hyponym, hypernym, case_number]
    case = getCase(pairAndCount)
    count = pairAndCount[-1]
    if count == 0:
        # add pair to the case key in the confidence level dictionary
        low[case[-1]].append(case[0:2])
    elif count == 1:
        med[case[-1]].append(case[0:2])
    elif count >= 2:
        # for high confidence we also want how often the pair appeared
        high[case[-1]].append(case[0:2] + pairAndCount[-1:])
def getCase(pairAndCount):
    '''Given [hyponym, hypernym, count], look both words up in WordNet
    and return [hyponym, hypernym, case] where case (1-4) describes
    their hypernym relation as judged by checkRelation.'''
    # getWNSynsets normalizes the words and returns their synset lists
    # together with the (possibly reformatted) word pair.
    synsets_a, synsets_b, words = getWNSynsets(pairAndCount[0:2])
    return checkRelation(synsets_a, synsets_b, words)
def checkRelation(first, second, pair):
    '''Judge the hypernym relation between the two words of *pair*.

    first, second -- lists of WordNet synsets for pair[0] and pair[1]
    Returns [hyponym, hypernym, case] where case is:
      1 -- relation confirmed for at least one synset pair, no contradiction
      2 -- a relation was found that contradicts hypernymy
      3 -- no relation found at all
      4 -- one of the words is not in WordNet
    '''
    hypo, hyper = pair[0], pair[1]
    # Case 4: a word absent from WordNet ends the analysis immediately.
    if not first or not second:
        return [hypo, hyper, 4]
    # Only noun senses can participate in the relations we test.
    first = keepNouns(first)
    second = keepNouns(second)
    confirmed = False
    # Test every sense of the first word against every sense of the
    # second.  Any contradiction returns immediately, so it takes
    # precedence over a confirmation seen earlier.
    for sense_a in first:
        for sense_b in second:
            common = sense_a.lowest_common_hypernyms(sense_b)
            if sense_b in common:
                # sense_b subsumes sense_a: the claimed relation holds.
                confirmed = True
            elif common:
                # Case 2: either the opposite relation (sense_a subsumes
                # sense_b) or some other shared ancestor -- both count
                # as contradictions of the claimed hypernymy.
                return [hypo, hyper, 2]
    # Case 1 if any synset pair confirmed the relation, else case 3.
    return [hypo, hyper, 1 if confirmed else 3]
def keepNouns(synsets):
    '''Return a new list containing only the noun synsets -- those
    whose str() form contains ".n.", e.g. Synset('dog.n.01').

    BUG FIX: the original removed elements from the list while
    iterating over it, which skips the element following every
    removal, so some non-noun synsets survived the filter.  A
    comprehension filters correctly and leaves the input list intact
    (callers rebind the result, so this is compatible).
    '''
    return [s for s in synsets if ".n." in str(s)]
def getWNSynsets(pair):
    '''Normalize the [hyponym, hypernym] word pair for WordNet lookup
    and fetch the synsets for each word.

    Returns a tuple (synsets_for_first, synsets_for_second, pair)
    where pair is the normalized word pair.'''
    cleaned = formatHypernyms(pair)
    return (wn.synsets(cleaned[0]), wn.synsets(cleaned[1]), cleaned)
#####################Methods for Hyponym Formatting############################
def formatHypernyms(pair):
    '''Normalize a [hyponym, hypernym] word pair in place so each word
    is searchable in WordNet; returns the same list.

    Per word: drop a purely-numeric leading token, crude
    de-pluralization, strip leading apostrophes, drop a leading
    "the ", trim surrounding whitespace, and map spaces/dashes to
    single underscores (WordNet's multiword convention).

    FIXES: the original replace("the ", "") deleted "the " anywhere in
    the string although its comment promised prefix removal only; and
    a single replace("__", "_") still left "___" as "__" -- runs of
    underscores are now collapsed completely.
    '''
    for i, word in enumerate(pair):
        word = removeDigits(word)
        word = remPlural(word)
        # remove leading apostrophes
        word = word.lstrip("'")
        # remove the prefix "the " only (not every occurrence)
        if word.startswith("the "):
            word = word[len("the "):]
        word = word.strip()
        # replace spaces and dashes with underscores ...
        word = word.replace(" ", "_").replace("-", "_")
        # ... and collapse any resulting run of underscores to one
        while "__" in word:
            word = word.replace("__", "_")
        pair[i] = word
    return pair
def removeDigits(word):
    '''Return *word* with a purely-numeric first token (and the space
    after it) removed; other words pass through unchanged.
    Note: a word that is ONLY digits becomes the empty string.'''
    head, _, tail = word.partition(" ")
    return tail if head.isdigit() else word
def remPlural(word):
    '''Crude singularization heuristic: "...ies" -> "...y", otherwise
    drop a single trailing "s".  Words too short to match pass
    through unchanged.'''
    # families -> family
    if word.endswith("ies") and len(word) > 2:
        return word[:-3] + "y"
    # dogs -> dog
    if word.endswith("s") and len(word) > 1:
        return word[:-1]
    return word
def process(filename):
    '''Read the confidence-level data file and classify every entry.

    filename -- path to a file whose contents are a Python literal:
    a list of [hyponym, hypernym, count] lists.  Each entry is passed
    to addToDict, which fills the module-level low/med/high dicts.

    SECURITY FIX: the original used eval() on the file contents, which
    executes arbitrary code; ast.literal_eval parses only Python
    literals, which is all the documented data format requires.  The
    file handle is now also closed deterministically.
    '''
    import ast
    with open(filename, 'r') as fp:
        data = ast.literal_eval(fp.read())
    for entry in data:
        addToDict(entry)
def printStats():
    '''Print, for each confidence level (low/med/high), the number of
    pairs filed under each of the four relation cases.

    FIXES: the three near-identical print stanzas are folded into one
    loop; the original printed "High Confidence\\n" with a stray
    newline, inconsistent with the other two headers -- normalized.
    '''
    for label, level in (("Low", low), ("Med", med), ("High", high)):
        print(label + " Confidence")
        for case in (1, 2, 3):
            print("> Case " + str(case) + ": " + str(len(level[case])))
        # case 4 keeps the trailing blank lines separating the sections
        print("> Case 4: " + str(len(level[4])) + "\n\n")
if __name__ == "__main__":
    if RUNMAIN:
        # Classify every pair in the input file, report per-case
        # counts, then dump up to ENTRIES examples of each case for
        # each confidence level into its own file.
        process(FILENAME)
        printStats()
        for case in range(1, 5):
            suffix = "Case" + str(case) + ".txt"
            dictToFile(low, case, "low" + suffix, "", ENTRIES)
            dictToFile(med, case, "med" + suffix, "", ENTRIES)
            dictToFile(high, case, "high" + suffix, "", ENTRIES)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment