Skip to content

Instantly share code, notes, and snippets.

@MissCatLady
Created November 9, 2014 19:36
Show Gist options
  • Save MissCatLady/e0fbe993f4dbb4adde25 to your computer and use it in GitHub Desktop.
Hyponym/Hypernym Pairs (using NLTK)
"""
Components that you need for Assignment 4 in CSC2501/485, Fall 2010.
Usage:
from Asst4 import nyt_big ## the full NYT corpus for A4
from Asst4 import nyt_mini ## only the first 100K lines from nyt_big, for
## development
from Asst4 import wn17 ## WordNet 1.7, newer versions won't work for Q2
from Asst4 import DefaultNpChunker ## a simple NP chunker to get things started
"""
# T(he original version of t)his code was written by Ulrich Germann (11/2010)
# It provides components necessary for Assignment 4 of CSC 485/2501h
######################################################################
#
# ATTENTION!
#
# THE INFORMATION BELOW IS ***CRUCIAL*** FOR YOUR SUCCESS IN Q2
#
# Question 2 of A4 works only with Wordnet 1.7 or earlier
#
# QUESTION 2 WILL NOT WORK WITH THE NLTK DEFAULT WORDNET INSTALLATION!
#
# We have installed version 1.7 in the following location
# /u/csc2501h/include/a4/nltk/corpora
#
# To use it we need to prepend the following directory to the
# NLTK search path for corpora and other data
import nltk
# NOTE(review): this is a personal machine path, not the course path cited
# above (/u/csc2501h/include/a4/nltk/corpora) — confirm it actually contains
# the WordNet 1.7 data, otherwise the import below silently picks up a newer
# WordNet and Q2 results will be wrong.
nltk.data.path[0:0] = ['/Users/Lia/Documents/CSC485/A4/nltk']
# Now we can import wordnet 1.7 instead of later versions
from nltk.corpus import wordnet as wn17
# The following code provides access to the tagged NY Times corpus
# nyt_big is the full corpus
# nyt_mini a small subset for development
from nltk.data import ZipFilePathPointer
from nltk.corpus import TaggedCorpusReader
# Corpus archive on the local machine; tokens are tagged as word/TAG (sep='/')
nyt_zipped = ZipFilePathPointer('/Users/Lia/Documents/CSC485/A4/nltk/nyt.zip')
nyt_big = TaggedCorpusReader(nyt_zipped,'nyt/2004-tagged.txt',sep='/')
nyt_mini = TaggedCorpusReader(nyt_zipped,'nyt/nytimes-mini.txt',sep='/')
# Finally, let's set up a default pattern for NP chunking
# Setting up the NP chunker itself is left to the main script, to encourage
# trying different variants of the pattern
# Pattern: optional determiner/adverb group, any adjectives or numbers
# (including comma-separated runs of them), then one or more noun tags.
DefaultNpPattern = (r'(<DT|AT>?<RB>?)?'
                    r'<JJ.*|CD.*>*'
                    r'(<JJ.*|CD.*><,>)*'
                    r'(<N.*>)+')
# NOTE(review): nltk was already imported above — this duplicate is harmless
# but could be dropped once the two script halves are merged.
import nltk
# This binds whatever WordNet version is first on nltk.data.path (intended to
# be 1.7 per the setup above).
from nltk.corpus import wordnet as wn
'''
CSC418 2014 A4 Part 1: Hypernym Relations
Methods for evaluation of cases for each confidence level are found here.
In Main:
Read output data from file containing low, med, and high confidence level
data. Data is an array containing arrays in the format of [hyponym, hypernym, int]
where int describes the number of additional times it has been seen.
Time: <1min (for nyt_big data)'''
RUNMAIN = 1  # set to 0 to import this module without running the main block
FILENAME = "big_low.txt"  # input file holding the extracted pair data
ENTRIES = 50  # number of entries written per output file by dictToFile
#confidence level dictionaries whose keys indicate the case
low = {1:[], 2:[], 3:[], 4:[]}
med = {1:[], 2:[], 3:[], 4:[]}
high = {1:[], 2:[], 3:[], 4:[]}
def dictToFile(dictionary, case, filename, description, entries):
    '''Write up to `entries` items from `dictionary[case]` to `filename`.

    `dictionary` is a confidence-level dict, `case` an int key, `description`
    a header string written on the first line, and each following line holds
    one entry of the form [hyponym, hypernym, int].'''
    with open(filename, 'w+') as out:
        out.write(description + "\n")
        remaining = entries
        for entry in dictionary[case]:
            if remaining == 0:
                break
            out.write(str(entry) + "\n")
            remaining -= 1
def addToDict(pairAndCount):
    '''Classify one [hyponym, hypernym, int] record and file it into the
    module-level confidence dictionaries (low/med/high) under its case key.

    The trailing int is the number of additional times the pair was seen:
    0 -> low confidence, 1 -> medium, >=2 -> high.'''
    # case is an array [hyponym, hypernym, case]
    case = getCase(pairAndCount)
    count = pairAndCount[-1]
    # the three ranges are mutually exclusive, so elif is equivalent and clearer
    if count == 0:
        # add pair to case key in confidence level dictionary
        low[case[-1]].append(case[0:2])
    elif count == 1:
        med[case[-1]].append(case[0:2])
    elif count >= 2:
        # For high confidence we also record how often the pair appeared.
        # BUG FIX: the original used pairAndCount[-1:-1], which is always an
        # empty slice, so the count was silently never appended; [-1:] keeps it.
        high[case[-1]].append(case[0:2] + pairAndCount[-1:])
def getCase(pairAndCount):
    '''Given an array [hyponym, hypernym, int], return an array
    [hyponym, hypernym, case] describing the WordNet relation case.'''
    words = pairAndCount[0:2]
    # look up the synsets for each word (empty list when not in WordNet);
    # the pair comes back normalized by the formatting helpers
    synsets_a, synsets_b, words = getWNSynsets(words)
    return checkRelation(synsets_a, synsets_b, words)
def checkRelation(first, second, pair):
    '''Given the synset lists for both words and the word pair itself,
    return [hyponym, hypernym, case] where case is:
      1 - the hypernym relation holds for at least one sense pair,
      2 - the relation is contradicted (reversed or some other relation),
      3 - no relation found,
      4 - one of the words is missing from WordNet.'''
    hypo, hyper = pair[0], pair[1]
    # Case 4: a word with no synsets was not found in WordNet at all
    if not first:
        return [hypo, hyper, 4]
    if not second:
        return [hypo, hyper, 4]
    # only noun senses are relevant for hypernymy
    first = keepNouns(first)
    second = keepNouns(second)
    supported = False
    # compare every sense of the hyponym against every sense of the hypernym
    for hypo_syn in first:
        for hyper_syn in second:
            common = hypo_syn.lowest_common_hypernyms(hyper_syn)
            if hyper_syn in common:
                # the claimed hypernym really is an ancestor: relation holds
                supported = True
            elif hypo_syn in common:
                # Case 2: the relation runs the opposite way
                return [hypo, hyper, 2]
            elif common:
                # Case 2: the words are related, but not as hyponym/hypernym
                return [hypo, hyper, 2]
    if supported:
        # Case 1: at least one sense pair supports the relation, none contradicts
        return [hypo, hyper, 1]
    # Case 3: no relation of any kind was found
    return [hypo, hyper, 3]
def keepNouns(synsets):
    '''Return the noun synsets from `synsets` (those whose str() contains
    ".n.", e.g. Synset('dog.n.01')).

    BUG FIX: the original removed items from the list while iterating over
    it, which skips the element after each removal (two adjacent non-nouns
    left the second one in place). Building a new list avoids that.'''
    return [s for s in synsets if ".n." in str(s)]
def getWNSynsets(pair):
    '''Normalize a [hyponym, hypernym] pair for WordNet lookup and return
    (hyponym_synsets, hypernym_synsets, normalized_pair); a synset list is
    empty when the word is not in WordNet.'''
    normalized = formatHypernyms(pair)
    hypo_synsets = wn.synsets(normalized[0])
    hyper_synsets = wn.synsets(normalized[1])
    return (hypo_synsets, hyper_synsets, normalized)
#####################Methods for Hyponym Formatting############################
def formatHypernyms(pair):
    '''Given a word pair [hyponym, hypernym], normalize each word in place so
    that it is searchable in WordNet (digits and leading article stripped,
    singularized, spaces/dashes turned into underscores). Returns the pair.'''
    for i in range(0, len(pair)):
        word = removeDigits(pair[i])
        word = remPlural(word)
        # remove leading apostrophes
        word = word.lstrip("'")
        # BUG FIX: remove the article "the " only as a prefix; the original
        # replace("the ", "") also deleted interior occurrences (e.g.
        # "over the counter" became "over counter")
        if word.startswith("the "):
            word = word[len("the "):]
        # strip spaces from beginning and end of string
        word = word.strip()
        # replace spaces with _
        word = word.replace(" ", "_")
        # replace dashes with _
        word = word.replace("-", "_")
        # collapse double underscores into a single one
        word = word.replace("__", "_")
        pair[i] = word
    return pair
def removeDigits(word):
    '''Return `word` with a leading all-digit token (and the space after it)
    removed; any other word is returned unchanged.'''
    first_token, _, _ = word.partition(" ")
    if first_token.isdigit():
        # drop the digit token plus the separating space
        return word[len(first_token) + 1:]
    return word
def remPlural(word):
    '''Heuristically singularize `word`: "...ies" -> "...y" (families ->
    family), otherwise drop a trailing "s". One- and two-letter words are
    left alone so bare "s"/"is" survive.'''
    if word.endswith("ies") and len(word) > 2:
        return word[:-3] + "y"
    if word.endswith("s") and len(word) > 1:
        return word[:-1]
    return word
def process(filename):
    '''Read the pair data from `filename` — a file containing a Python list
    literal of [hyponym, hypernym, int] records — and classify every record
    into the confidence dictionaries via addToDict.'''
    import ast
    # with-statement guarantees the file handle is closed (the original
    # leaked it), and literal_eval only parses Python literals, unlike the
    # original eval() which would execute arbitrary code from the data file
    with open(filename, 'r') as fp:
        data = ast.literal_eval(fp.read())
    for record in data:
        addToDict(record)
def printStats():
    '''Print the number of classified pairs per case (1-4) for each
    confidence level (low / med / high).'''
    # the embedded "\n" in the last title reproduces the original's extra
    # blank line after "High Confidence"
    for title, level in (("Low Confidence", low),
                         ("Med Confidence", med),
                         ("High Confidence\n", high)):
        print(title)
        for case in range(1, 5):
            line = "> Case " + str(case) + ": " + str(len(level[case]))
            if case == 4:
                # two blank lines separate the confidence-level sections
                line += "\n\n"
            print(line)
if __name__ == "__main__":
    if RUNMAIN:
        process(FILENAME)
        printStats()
        # dump the first ENTRIES pairs of every case for each confidence
        # level into files named e.g. lowCase1.txt ... highCase4.txt
        for case in range(1, 5):
            for prefix, level in (("low", low), ("med", med), ("high", high)):
                dictToFile(level, case, prefix + "Case" + str(case) + ".txt", "", ENTRIES)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment