Created
March 4, 2012 11:37
-
-
Save ptbrowne/1972600 to your computer and use it in GitHub Desktop.
NLP : PA2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding:utf-8 -*- | |
# #Natural Language Processing in Python | |
# #Assignment #2 | |
# Exercises: 4, 8, 12, 13, 18, 25, 26 (Chapter 2) | |
# Your Turn: Pages 44, 55 | |
from __future__ import division | |
from PA1 import count_initial_indent, strip_initial_indent, write_doc | |
from collections import defaultdict | |
import nltk | |
from nltk import FreqDist, ConditionalFreqDist | |
from nltk.corpus import wordnet as wn | |
from nltk.corpus import brown | |
def count_words(text, words):
    """Count how often each target word occurs in a text.

    text  -- iterable of word tokens
    words -- collection of target words to count

    Returns a dict-like mapping (defaultdict) of word -> count; words
    that never occur are simply absent from the result.
    """
    targets = set(words)  # O(1) membership test instead of scanning a list per token
    counts = defaultdict(int)
    for w in text:
        if w in targets:
            counts[w] += 1
    return counts  # bug fix: the original built counts but never returned it
def average(values):
    """Return the arithmetic mean of a non-empty sequence of numbers.

    The result is always a float, even for all-integer input.

    >>> print average([20, 30, 70])
    40.0
    """
    # Starting the sum from 0.0 forces float division semantics
    # regardless of the element types.
    total = sum(values, 0.0)
    return total / len(values)
@write_doc
def ex4():
    """
    4. ○ Read in the texts of the State of the Union addresses, using the state_union corpus
    reader. Count occurrences of men, women, and people in each document. What has
    happened to the usage of these words over time?
    """
    su = nltk.corpus.state_union
    # Bug fix: the exercise asks for "people" as well, which the original
    # target list omitted.
    targets = ["women", "men", "people"]
    # Condition = target word, sample = the 4-digit year prefix of the
    # fileid (state_union fileids look like "1945-Truman.txt").
    counts = ConditionalFreqDist((target, doc[:4])
        for doc in su.fileids()
        for w in su.words(doc)
        for target in targets
        if w.lower() == target)
    # counts.plot()
@write_doc
def ex8():
    """
    8. ◑ Define a conditional frequency distribution over the Names Corpus that allows
    you to see which initial letters are more frequent for males versus females (see
    Figure 2-7).
    """
    corpus = nltk.corpus.names
    # Condition = gender (the fileid minus its ".txt" extension, i.e.
    # "male" or "female"), sample = the name's initial letter.
    pairs = ((gender_file[:-4], person[0])
             for gender_file in corpus.fileids()
             for person in corpus.words(gender_file))
    initial_counts = ConditionalFreqDist(pairs)
    # initial_counts.plot()
@write_doc | |
def ex12(): | |
""" | |
12. ◑ The CMU Pronouncing Dictionary contains multiple pronunciations for certain | |
words. How many distinct words does it contain? What fraction of words in this | |
dictionary have more than one possible pronunciation? | |
""" | |
entries = nltk.corpus.cmudict.entries() | |
all_words = [e[0] for e in entries] | |
distinct_words = set(all_words) | |
counts = FreqDist(all_words) | |
polypron_words = [c for c in all_words if counts[c] > 1] | |
final_fraction = len(polypron_words) / len(distinct_words) | |
print "The CMU pronuncing dictionary contains %i distinct_words" % len(distinct_words) | |
print "The fraction of words in this dictionnary which have", | |
print " more than one possible pronunciation is %f" % final_fraction | |
@write_doc | |
def ex13(): | |
""" | |
13. ◑ What percentage of noun synsets have no hyponyms? You can get all noun syn- | |
sets using wn.all_synsets('n'). | |
""" | |
nb_syn = sum(1 for _ in wn.all_synsets('n')) | |
fac = len([s for s in wn.all_synsets('n') if len(s.hyponyms()) == 0]) / nb_syn | |
print "The percentage of noun synsets with no hyponyms is %f" % fac | |
@write_doc | |
def ex18(): | |
""" | |
18. ◑ Write a program to print the 50 most frequent bigrams (pairs of adjacent words) | |
of a text, omitting bigrams that contain stopwords. | |
""" | |
def freq_bigrams_wo_stopwords(text, stopwords_): | |
stop_words = set(s.lower() for s in stopwords_) | |
bigrams_wo_sw = [b for b in nltk.bigrams(text) | |
if (b[0].isalpha() and b[1].isalpha()) | |
and (len(set(k.lower() for k in b) & stop_words) == 0) | |
] | |
return FreqDist(bigrams_wo_sw).keys()[:50] | |
book = nltk.corpus.genesis | |
text = book.words("english-web.txt") | |
stop_words = nltk.corpus.stopwords.words("english") | |
print "The most frequent bigrams of the genesis are" | |
print [" ".join(b) for b in freq_bigrams_wo_stopwords(text, stop_words)] | |
@write_doc | |
def ex25(): | |
""" | |
25. ● Define a function find_language() that takes a string as its argument and returns | |
a list of languages that have that string as a word. Use the udhr corpus and limit | |
your searches to files in the Latin-1 encoding. | |
""" | |
def find_language(string): | |
udhr = nltk.corpus.udhr | |
for doc in udhr.fileids() : | |
if "Latin1" in doc : | |
if string.decode("utf-8") in udhr.words(doc) : | |
yield doc.replace("-Latin1", "") | |
w = "liberté" | |
print "%s is most likely from one of those languages %s" % (w, list(find_language("liberté"))) | |
@write_doc | |
def ex26(): | |
""" | |
26. ● What is the branching factor of the noun hypernym hierarchy? I.e., for every | |
noun synset that has hyponyms—or children in the hypernym hierarchy—how | |
many do they have on average? You can get all noun synsets using wn.all_syn | |
sets('n'). | |
""" | |
total = i = 0 | |
for s in wn.all_synsets('n') : | |
ln = len(s.hyponyms()) | |
if ln != 0 : | |
total += ln | |
i += 1 | |
print "The branching factor of the noun hypernym hierarchy is %f" % (total / i) | |
@write_doc | |
def your_turn_p44(): | |
""" | |
Your Turn: Choose a different section of the Brown Corpus, and adapt | |
the preceding example to count a selection of wh words, such as what, | |
when, where, who and why. | |
""" | |
news_text = brown.words(categories='science_fiction') | |
fdist = nltk.FreqDist([w.lower() for w in news_text if w.startswith("wh")]) | |
for (wh_word, nb) in fdist.items(): | |
print wh_word, ":", nb | |
@write_doc
def your_turn_p55():
    """
    Your Turn: Working with the news and romance genres from the
    Brown Corpus, find out which days of the week are most newsworthy,
    and which are most romantic. Define a variable called days containing
    a list of days of the week, i.e., ['Monday', ...]. Now tabulate the counts
    for these words using cfd.tabulate(samples=days). Now try the same
    thing using plot in place of tabulate. You may control the output order
    of days with the help of an extra parameter: condi
    tions=['Monday', ...].
    """
    # Count every word token per genre across the whole Brown corpus.
    genre_word_pairs = ((genre, word)
                        for genre in brown.categories()
                        for word in brown.words(categories=genre))
    cfd = nltk.ConditionalFreqDist(genre_word_pairs)
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Rows ordered by *genres*, columns by *days*.
    cfd.tabulate(conditions=genres, samples=days)
def main():
    """Run every exercise of the assignment, in order."""
    exercises = (ex4, ex8, ex12, ex13, ex18, ex25, ex26,
                 your_turn_p44, your_turn_p55)
    for exercise in exercises:
        exercise()
# Run all exercises when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment