Created
March 4, 2012 11:37
-
-
Save ptbrowne/1972600 to your computer and use it in GitHub Desktop.
NLP : PA2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#-*- coding:utf-8 -*- | |
# #Natural Language Processing in Python | |
# #Assignment #2 | |
# Exercises: 4, 8, 12, 13, 18, 25, 26 (Chapter 2) | |
# Your Turn: Pages 44, 55 | |
from __future__ import division | |
from PA1 import count_initial_indent, strip_initial_indent, write_doc | |
from collections import defaultdict | |
import nltk | |
from nltk import FreqDist, ConditionalFreqDist | |
from nltk.corpus import wordnet as wn | |
from nltk.corpus import brown | |
def count_words(text, words):
    """Count how often each target word occurs in a text.

    text  -- iterable of word tokens
    words -- collection of target words to count

    Returns a dict-like mapping (defaultdict) of word -> count; words
    that never occur are simply absent from the result.
    """
    targets = set(words)  # O(1) membership test instead of scanning a list per token
    counts = defaultdict(int)
    for w in text:
        if w in targets:
            counts[w] += 1
    return counts  # bug fix: the original built counts but never returned it
def average(values):
    """Return the arithmetic mean of a non-empty sequence of numbers.

    The result is always a float, even for all-integer input.

    >>> print average([20, 30, 70])
    40.0
    """
    # Starting the sum from 0.0 forces float division semantics
    # regardless of the element types.
    total = sum(values, 0.0)
    return total / len(values)
@write_doc
def ex4():
    """
    4. ○ Read in the texts of the State of the Union addresses, using the state_union corpus
    reader. Count occurrences of men, women, and people in each document. What has
    happened to the usage of these words over time?
    """
    su = nltk.corpus.state_union
    # Bug fix: the exercise asks for "people" as well, which the original
    # target list omitted.
    targets = ["women", "men", "people"]
    # Condition = target word, sample = the 4-digit year prefix of the
    # fileid (state_union fileids look like "1945-Truman.txt").
    counts = ConditionalFreqDist((target, doc[:4])
        for doc in su.fileids()
        for w in su.words(doc)
        for target in targets
        if w.lower() == target)
    # counts.plot()
@write_doc
def ex8():
    """
    8. ◑ Define a conditional frequency distribution over the Names Corpus that allows
    you to see which initial letters are more frequent for males versus females (see
    Figure 2-7).
    """
    corpus = nltk.corpus.names
    # Condition = gender (the fileid minus its ".txt" extension, i.e.
    # "male" or "female"), sample = the name's initial letter.
    pairs = ((gender_file[:-4], person[0])
             for gender_file in corpus.fileids()
             for person in corpus.words(gender_file))
    initial_counts = ConditionalFreqDist(pairs)
    # initial_counts.plot()
@write_doc | |
def ex12(): | |
""" | |
12. ◑ The CMU Pronouncing Dictionary contains multiple pronunciations for certain | |
words. How many distinct words does it contain? What fraction of words in this | |
dictionary have more than one possible pronunciation? | |
""" | |
entries = nltk.corpus.cmudict.entries() | |
all_words = [e[0] for e in entries] | |
distinct_words = set(all_words) | |
counts = FreqDist(all_words) | |
polypron_words = [c for c in all_words if counts[c] > 1] | |
final_fraction = len(polypron_words) / len(distinct_words) | |
print "The CMU pronuncing dictionary contains %i distinct_words" % len(distinct_words) | |
print "The fraction of words in this dictionnary which have", | |
print " more than one possible pronunciation is %f" % final_fraction | |
@write_doc | |
def ex13(): | |
""" | |
13. ◑ What percentage of noun synsets have no hyponyms? You can get all noun syn- | |
sets using wn.all_synsets('n'). | |
""" | |
nb_syn = sum(1 for _ in wn.all_synsets('n')) | |
fac = len([s for s in wn.all_synsets('n') if len(s.hyponyms()) == 0]) / nb_syn | |
print "The percentage of noun synsets with no hyponyms is %f" % fac | |
@write_doc | |
def ex18(): | |
""" | |
18. ◑ Write a program to print the 50 most frequent bigrams (pairs of adjacent words) | |
of a text, omitting bigrams that contain stopwords. | |
""" | |
def freq_bigrams_wo_stopwords(text, stopwords_): | |
stop_words = set(s.lower() for s in stopwords_) | |
bigrams_wo_sw = [b for b in nltk.bigrams(text) | |
if (b[0].isalpha() and b[1].isalpha()) | |
and (len(set(k.lower() for k in b) & stop_words) == 0) | |
] | |
return FreqDist(bigrams_wo_sw).keys()[:50] | |
book = nltk.corpus.genesis | |
text = book.words("english-web.txt") | |
stop_words = nltk.corpus.stopwords.words("english") | |
print "The most frequent bigrams of the genesis are" | |
print [" ".join(b) for b in freq_bigrams_wo_stopwords(text, stop_words)] | |
@write_doc | |
def ex25(): | |
""" | |
25. ● Define a function find_language() that takes a string as its argument and returns | |
a list of languages that have that string as a word. Use the udhr corpus and limit | |
your searches to files in the Latin-1 encoding. | |
""" | |
def find_language(string): | |
udhr = nltk.corpus.udhr | |
for doc in udhr.fileids() : | |
if "Latin1" in doc : | |
if string.decode("utf-8") in udhr.words(doc) : | |
yield doc.replace("-Latin1", "") | |
w = "liberté" | |
print "%s is most likely from one of those languages %s" % (w, list(find_language("liberté"))) | |
@write_doc | |
def ex26(): | |
""" | |
26. ● What is the branching factor of the noun hypernym hierarchy? I.e., for every | |
noun synset that has hyponyms—or children in the hypernym hierarchy—how | |
many do they have on average? You can get all noun synsets using wn.all_syn | |
sets('n'). | |
""" | |
total = i = 0 | |
for s in wn.all_synsets('n') : | |
ln = len(s.hyponyms()) | |
if ln != 0 : | |
total += ln | |
i += 1 | |
print "The branching factor of the noun hypernym hierarchy is %f" % (total / i) | |
@write_doc | |
def your_turn_p44(): | |
""" | |
Your Turn: Choose a different section of the Brown Corpus, and adapt | |
the preceding example to count a selection of wh words, such as what, | |
when, where, who and why. | |
""" | |
news_text = brown.words(categories='science_fiction') | |
fdist = nltk.FreqDist([w.lower() for w in news_text if w.startswith("wh")]) | |
for (wh_word, nb) in fdist.items(): | |
print wh_word, ":", nb | |
@write_doc
def your_turn_p55():
    """
    Your Turn: Working with the news and romance genres from the
    Brown Corpus, find out which days of the week are most newsworthy,
    and which are most romantic. Define a variable called days containing
    a list of days of the week, i.e., ['Monday', ...]. Now tabulate the counts
    for these words using cfd.tabulate(samples=days). Now try the same
    thing using plot in place of tabulate. You may control the output order
    of days with the help of an extra parameter: condi
    tions=['Monday', ...].
    """
    # Count every word token per genre across the whole Brown corpus.
    genre_word_pairs = ((genre, word)
                        for genre in brown.categories()
                        for word in brown.words(categories=genre))
    cfd = nltk.ConditionalFreqDist(genre_word_pairs)
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Rows ordered by *genres*, columns by *days*.
    cfd.tabulate(conditions=genres, samples=days)
def main():
    """Run every exercise of the assignment, in order."""
    exercises = (ex4, ex8, ex12, ex13, ex18, ex25, ex26,
                 your_turn_p44, your_turn_p55)
    for exercise in exercises:
        exercise()
# Run all exercises when executed as a script (no side effects on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment