Created November 9, 2014, 19:36.
Save MissCatLady/e0fbe993f4dbb4adde25 to your computer and use it in GitHub Desktop.
Hyponym/Hypernym Pairs (using NLTK)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Components that you need for Assignment 4 in CSC2501/485, Fall 2010.
Usage:
from Asst4 import nyt_big ## the full NYT corpus for A4
from Asst4 import nyt_mini ## only the first 100K lines from nyt_big, for
## development
from Asst4 import wn17 ## WordNet 1.7, newer versions won't work for Q2
from Asst4 import DefaultNpChunker ## a simple NP chunker to get things started
"""
# T(he original version of t)his code was written by Ulrich Germann (11/2010)
# It provides components necessary for Assignment 4 of CSC 485/2501h
######################################################################
#
# ATTENTION!
#
# THE INFORMATION BELOW IS ***CRUCIAL*** FOR YOUR SUCCESS IN Q2
#
# Question 2 of A4 works only with Wordnet 1.7 or earlier
#
# QUESTION 2 WILL NOT WORK WITH THE NLTK DEFAULT WORDNET INSTALLATION!
#
# We have installed version 1.7 in the following location
# /u/csc2501h/include/a4/nltk/corpora
#
# To use it we need to prepend the following directory to the
# NLTK search path for corpora and other data
import nltk
# Prepend (not append) so this copy of the data shadows any default
# NLTK installation.
# NOTE(review): this points at a user-local copy, not the course path
# named above (/u/csc2501h/include/a4/nltk) -- confirm that this
# directory actually holds the WordNet 1.7 data Q2 requires.
nltk.data.path[0:0] = ['/Users/Lia/Documents/CSC485/A4/nltk']
# Now we can import wordnet 1.7 instead of later versions
from nltk.corpus import wordnet as wn17
# The following code provides access to the tagged NY Times corpus
# nyt_big is the full corpus
# nyt_mini a small subset for development
from nltk.data import ZipFilePathPointer
from nltk.corpus import TaggedCorpusReader
# The corpus files live inside a zip archive; ZipFilePathPointer lets
# the readers address files within it without unpacking.
nyt_zipped = ZipFilePathPointer('/Users/Lia/Documents/CSC485/A4/nltk/nyt.zip')
# sep='/' because tokens in this corpus are tagged as word/TAG
nyt_big = TaggedCorpusReader(nyt_zipped,'nyt/2004-tagged.txt',sep='/')
nyt_mini = TaggedCorpusReader(nyt_zipped,'nyt/nytimes-mini.txt',sep='/')
# Finally, a default pattern for NP chunking: an optional
# determiner/article (with optional adverb), any run of adjectives or
# cardinals (optionally comma-separated), then one or more nouns.
# Setting up the NP chunker itself is left to the main script, to
# encourage trying different variants of the pattern.
DefaultNpPattern = (r'(<DT|AT>?<RB>?)?'
                    r'<JJ.*|CD.*>*'
                    r'(<JJ.*|CD.*><,>)*'
                    r'(<N.*>)+')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.corpus import wordnet as wn | |
'''
CSC418 2014 A4 Part 1: Hypernym Relations
Methods for evaluation of cases for each confidence level are found here.
In Main:
Read output data from file containing low, med, and high confidence level
data. Data is an array containing arrays in the format of [hyponym, hypernym, int]
where int describes the number of additional times it has been seen.
Time: <1min (for nyt_big data)'''
# NOTE(review): the header says CSC418 but the companion setup file
# refers to CSC485/2501 -- presumably a typo; confirm.
RUNMAIN = 1             # set to 0 to import this module without running main
FILENAME = "big_low.txt"  # input file: a Python-literal list of [hyponym, hypernym, count]
ENTRIES = 50            # max number of entries written per output file
# confidence level dictionaries whose keys indicate the case (1-4,
# as assigned by checkRelation)
low = {1:[], 2:[], 3:[], 4:[]}
med = {1:[], 2:[], 3:[], 4:[]}
high = {1:[], 2:[], 3:[], 4:[]}
def dictToFile(dictionary, case, filename, description, entries):
    '''Write up to `entries` items from dictionary[case] to a file.

    dictionary -- a confidence-level dict mapping case number -> list
    case -- int key selecting which case's list to dump
    filename -- str path of the output file (created/truncated)
    description -- str header written as the first line
    entries -- int cap on the number of items written

    Each item is written as its str() form, one per line.
    FIXES: the original opened the file in 'w+' (read/write) for a
    write-only dump and never closed it on an exception; the manual
    countdown loop is replaced by a slice.
    '''
    with open(filename, 'w') as fp:
        fp.write(description + "\n")
        for item in dictionary[case][:max(entries, 0)]:
            fp.write(str(item) + "\n")
def addToDict(pairAndCount):
    '''Classify one [hyponym, hypernym, count] entry and file it.

    getCase determines the WordNet relation case (1-4); the count
    selects the confidence-level dictionary: 0 -> low, 1 -> med,
    >= 2 -> high.  High-confidence entries also record the count.

    BUG FIX: the original appended pairAndCount[-1:-1] -- an empty
    slice -- so the count was silently dropped for high-confidence
    entries despite the comment saying it should be kept; [-1:] keeps
    the final element as intended.
    '''
    # case is [hyponym, hypernym, case_number]
    case = getCase(pairAndCount)
    count = pairAndCount[-1]
    if count == 0:
        # add pair to the case key in the confidence level dictionary
        low[case[-1]].append(case[0:2])
    elif count == 1:
        med[case[-1]].append(case[0:2])
    elif count >= 2:
        # for high confidence we also want how often the pair appeared
        high[case[-1]].append(case[0:2] + pairAndCount[-1:])
def getCase(pairAndCount):
    '''Given [hyponym, hypernym, count], look both words up in WordNet
    and return [hyponym, hypernym, case] where case (1-4) describes
    their hypernym relation as judged by checkRelation.'''
    # getWNSynsets normalizes the words and returns their synset lists
    # together with the (possibly reformatted) word pair.
    synsets_a, synsets_b, words = getWNSynsets(pairAndCount[0:2])
    return checkRelation(synsets_a, synsets_b, words)
def checkRelation(first, second, pair):
    '''Judge the hypernym relation between the two words of *pair*.

    first, second -- lists of WordNet synsets for pair[0] and pair[1]
    Returns [hyponym, hypernym, case] where case is:
      1 -- relation confirmed for at least one synset pair, no contradiction
      2 -- a relation was found that contradicts hypernymy
      3 -- no relation found at all
      4 -- one of the words is not in WordNet
    '''
    hypo, hyper = pair[0], pair[1]
    # Case 4: a word absent from WordNet ends the analysis immediately.
    if not first or not second:
        return [hypo, hyper, 4]
    # Only noun senses can participate in the relations we test.
    first = keepNouns(first)
    second = keepNouns(second)
    confirmed = False
    # Test every sense of the first word against every sense of the
    # second.  Any contradiction returns immediately, so it takes
    # precedence over a confirmation seen earlier.
    for sense_a in first:
        for sense_b in second:
            common = sense_a.lowest_common_hypernyms(sense_b)
            if sense_b in common:
                # sense_b subsumes sense_a: the claimed relation holds.
                confirmed = True
            elif common:
                # Case 2: either the opposite relation (sense_a subsumes
                # sense_b) or some other shared ancestor -- both count
                # as contradictions of the claimed hypernymy.
                return [hypo, hyper, 2]
    # Case 1 if any synset pair confirmed the relation, else case 3.
    return [hypo, hyper, 1 if confirmed else 3]
def keepNouns(synsets):
    '''Return a new list containing only the noun synsets -- those
    whose str() form contains ".n.", e.g. Synset('dog.n.01').

    BUG FIX: the original removed elements from the list while
    iterating over it, which skips the element following every
    removal, so some non-noun synsets survived the filter.  A
    comprehension filters correctly and leaves the input list intact
    (callers rebind the result, so this is compatible).
    '''
    return [s for s in synsets if ".n." in str(s)]
def getWNSynsets(pair):
    '''Normalize the [hyponym, hypernym] word pair for WordNet lookup
    and fetch the synsets for each word.

    Returns a tuple (synsets_for_first, synsets_for_second, pair)
    where pair is the normalized word pair.'''
    cleaned = formatHypernyms(pair)
    return (wn.synsets(cleaned[0]), wn.synsets(cleaned[1]), cleaned)
#####################Methods for Hyponym Formatting############################
def formatHypernyms(pair):
    '''Normalize a [hyponym, hypernym] word pair in place so each word
    is searchable in WordNet; returns the same list.

    Per word: drop a purely-numeric leading token, crude
    de-pluralization, strip leading apostrophes, drop a leading
    "the ", trim surrounding whitespace, and map spaces/dashes to
    single underscores (WordNet's multiword convention).

    FIXES: the original replace("the ", "") deleted "the " anywhere in
    the string although its comment promised prefix removal only; and
    a single replace("__", "_") still left "___" as "__" -- runs of
    underscores are now collapsed completely.
    '''
    for i, word in enumerate(pair):
        word = removeDigits(word)
        word = remPlural(word)
        # remove leading apostrophes
        word = word.lstrip("'")
        # remove the prefix "the " only (not every occurrence)
        if word.startswith("the "):
            word = word[len("the "):]
        word = word.strip()
        # replace spaces and dashes with underscores ...
        word = word.replace(" ", "_").replace("-", "_")
        # ... and collapse any resulting run of underscores to one
        while "__" in word:
            word = word.replace("__", "_")
        pair[i] = word
    return pair
def removeDigits(word):
    '''Return *word* with a purely-numeric first token (and the space
    after it) removed; other words pass through unchanged.
    Note: a word that is ONLY digits becomes the empty string.'''
    head, _, tail = word.partition(" ")
    return tail if head.isdigit() else word
def remPlural(word):
    '''Crude singularization heuristic: "...ies" -> "...y", otherwise
    drop a single trailing "s".  Words too short to match pass
    through unchanged.'''
    # families -> family
    if word.endswith("ies") and len(word) > 2:
        return word[:-3] + "y"
    # dogs -> dog
    if word.endswith("s") and len(word) > 1:
        return word[:-1]
    return word
def process(filename):
    '''Read the confidence-level data file and classify every entry.

    filename -- path to a file whose contents are a Python literal:
    a list of [hyponym, hypernym, count] lists.  Each entry is passed
    to addToDict, which fills the module-level low/med/high dicts.

    SECURITY FIX: the original used eval() on the file contents, which
    executes arbitrary code; ast.literal_eval parses only Python
    literals, which is all the documented data format requires.  The
    file handle is now also closed deterministically.
    '''
    import ast
    with open(filename, 'r') as fp:
        data = ast.literal_eval(fp.read())
    for entry in data:
        addToDict(entry)
def printStats():
    '''Print, for each confidence level (low/med/high), the number of
    pairs filed under each of the four relation cases.

    FIXES: the three near-identical print stanzas are folded into one
    loop; the original printed "High Confidence\\n" with a stray
    newline, inconsistent with the other two headers -- normalized.
    '''
    for label, level in (("Low", low), ("Med", med), ("High", high)):
        print(label + " Confidence")
        for case in (1, 2, 3):
            print("> Case " + str(case) + ": " + str(len(level[case])))
        # case 4 keeps the trailing blank lines separating the sections
        print("> Case 4: " + str(len(level[4])) + "\n\n")
if __name__ == "__main__":
    if RUNMAIN:
        # Classify every pair in the input file, report per-case
        # counts, then dump up to ENTRIES examples of each case for
        # each confidence level into its own file.
        process(FILENAME)
        printStats()
        for case in range(1, 5):
            suffix = "Case" + str(case) + ".txt"
            dictToFile(low, case, "low" + suffix, "", ENTRIES)
            dictToFile(med, case, "med" + suffix, "", ENTRIES)
            dictToFile(high, case, "high" + suffix, "", ENTRIES)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment