Skip to content

Instantly share code, notes, and snippets.

@ptbrowne
Created March 4, 2012 11:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ptbrowne/1972600 to your computer and use it in GitHub Desktop.
Save ptbrowne/1972600 to your computer and use it in GitHub Desktop.
NLP : PA2
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# #Natural Language Processing in Python
# #Assignment #2
# Exercises: 4, 8, 12, 13, 18, 25, 26 (Chapter 2)
# Your Turn: Pages 44, 55
from __future__ import division
from PA1 import count_initial_indent, strip_initial_indent, write_doc
from collections import defaultdict
import nltk
from nltk import FreqDist, ConditionalFreqDist
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
def count_words(text, words):
    """Count occurrences in *text* of each word that appears in *words*.

    Parameters:
        text  -- an iterable of word tokens.
        words -- a collection of target words; membership tests are
                 fastest when this is a set.

    Returns a defaultdict mapping each target word found in *text* to
    its number of occurrences (words never seen are absent).
    """
    counts = defaultdict(int)
    for w in text:
        if w in words:
            counts[w] += 1
    # Bug fix: the original built the counts but never returned them,
    # so every call produced None.
    return counts
def average(values):
"""Computes the arithmetic mean of a list of numbers.
>>> print average([20, 30, 70])
40.0
"""
return sum(values, 0.0) / len(values)
@write_doc
def ex4():
    """
    4. ○ Read in the texts of the State of the Union addresses, using the state_union corpus
    reader. Count occurrences of men, women, and people in each document. What has
    happened to the usage of these words over time?
    """
    su = nltk.corpus.state_union
    # Bug fix: the exercise asks for "men", "women" AND "people";
    # "people" was missing from the target list.
    targets = ["women", "men", "people"]
    # Condition = target word, sample = the 4-character year prefix of the
    # fileid, so plotting shows each word's usage over time.
    counts = ConditionalFreqDist((target, doc[:4])
                                 for doc in su.fileids()
                                 for w in su.words(doc)
                                 for target in targets
                                 if w.lower() == target)
    # counts.plot()
@write_doc
def ex8():
    """
    8. ◑ Define a conditional frequency distribution over the Names Corpus that allows
    you to see which initial letters are more frequent for males versus females (see
    Figure 2-7).
    """
    names_corpus = nltk.corpus.names
    # Condition = gender (the fileid minus its ".txt" suffix),
    # sample = the name's initial letter.
    pairs = ((fileid[:-4], name[0])
             for fileid in names_corpus.fileids()
             for name in names_corpus.words(fileid))
    counts = ConditionalFreqDist(pairs)
    # counts.plot()
@write_doc
def ex12():
"""
12. ◑ The CMU Pronouncing Dictionary contains multiple pronunciations for certain
words. How many distinct words does it contain? What fraction of words in this
dictionary have more than one possible pronunciation?
"""
entries = nltk.corpus.cmudict.entries()
all_words = [e[0] for e in entries]
distinct_words = set(all_words)
counts = FreqDist(all_words)
polypron_words = [c for c in all_words if counts[c] > 1]
final_fraction = len(polypron_words) / len(distinct_words)
print "The CMU pronuncing dictionary contains %i distinct_words" % len(distinct_words)
print "The fraction of words in this dictionnary which have",
print " more than one possible pronunciation is %f" % final_fraction
@write_doc
def ex13():
"""
13. ◑ What percentage of noun synsets have no hyponyms? You can get all noun syn-
sets using wn.all_synsets('n').
"""
nb_syn = sum(1 for _ in wn.all_synsets('n'))
fac = len([s for s in wn.all_synsets('n') if len(s.hyponyms()) == 0]) / nb_syn
print "The percentage of noun synsets with no hyponyms is %f" % fac
@write_doc
def ex18():
"""
18. ◑ Write a program to print the 50 most frequent bigrams (pairs of adjacent words)
of a text, omitting bigrams that contain stopwords.
"""
def freq_bigrams_wo_stopwords(text, stopwords_):
stop_words = set(s.lower() for s in stopwords_)
bigrams_wo_sw = [b for b in nltk.bigrams(text)
if (b[0].isalpha() and b[1].isalpha())
and (len(set(k.lower() for k in b) & stop_words) == 0)
]
return FreqDist(bigrams_wo_sw).keys()[:50]
book = nltk.corpus.genesis
text = book.words("english-web.txt")
stop_words = nltk.corpus.stopwords.words("english")
print "The most frequent bigrams of the genesis are"
print [" ".join(b) for b in freq_bigrams_wo_stopwords(text, stop_words)]
@write_doc
def ex25():
"""
25. ● Define a function find_language() that takes a string as its argument and returns
a list of languages that have that string as a word. Use the udhr corpus and limit
your searches to files in the Latin-1 encoding.
"""
def find_language(string):
udhr = nltk.corpus.udhr
for doc in udhr.fileids() :
if "Latin1" in doc :
if string.decode("utf-8") in udhr.words(doc) :
yield doc.replace("-Latin1", "")
w = "liberté"
print "%s is most likely from one of those languages %s" % (w, list(find_language("liberté")))
@write_doc
def ex26():
"""
26. ● What is the branching factor of the noun hypernym hierarchy? I.e., for every
noun synset that has hyponyms—or children in the hypernym hierarchy—how
many do they have on average? You can get all noun synsets using wn.all_syn
sets('n').
"""
total = i = 0
for s in wn.all_synsets('n') :
ln = len(s.hyponyms())
if ln != 0 :
total += ln
i += 1
print "The branching factor of the noun hypernym hierarchy is %f" % (total / i)
@write_doc
def your_turn_p44():
"""
Your Turn: Choose a different section of the Brown Corpus, and adapt
the preceding example to count a selection of wh words, such as what,
when, where, who and why.
"""
news_text = brown.words(categories='science_fiction')
fdist = nltk.FreqDist([w.lower() for w in news_text if w.startswith("wh")])
for (wh_word, nb) in fdist.items():
print wh_word, ":", nb
@write_doc
def your_turn_p55():
    """
    Your Turn: Working with the news and romance genres from the
    Brown Corpus, find out which days of the week are most newsworthy,
    and which are most romantic. Define a variable called days containing
    a list of days of the week, i.e., ['Monday', ...]. Now tabulate the counts
    for these words using cfd.tabulate(samples=days). Now try the same
    thing using plot in place of tabulate. You may control the output order
    of days with the help of an extra parameter: condi
    tions=['Monday', ...].
    """
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # (genre, word) pairs over the entire Brown corpus; the tabulation below
    # then restricts the view to the chosen genres and weekday samples.
    pairs = ((genre, word)
             for genre in brown.categories()
             for word in brown.words(categories=genre))
    cfd = nltk.ConditionalFreqDist(pairs)
    cfd.tabulate(conditions=genres, samples=days)
def main():
    """Run every exercise of the assignment in order."""
    exercises = (ex4, ex8, ex12, ex13, ex18,
                 ex25, ex26, your_turn_p44, your_turn_p55)
    for exercise in exercises:
        exercise()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment