# theaspect/nltk.07.13.py

## nltk.07.13.py
#Pick one of the three chunk types in the CoNLL corpus.
#Write functions to do the following tasks for your chosen type:
#1. List all the tag sequences that occur with each instance of this chunk type.
#2. Count the frequency of each tag sequence, and produce a ranked list in order of decreasing
#   frequency; each line should consist of an integer (the frequency) and the tag sequence.
#3. Inspect the high-frequency tag sequences. Use these as the basis for developing a better chunker.

import nltk, sys
from nltk.corpus import conll2000
from nltk.probability import FreqDist

# Chunk type to analyse; the exercise lets us choose NP, VP or PP.
TYPE = "NP"
# Use the train set of conll2000, passing TYPE as the only requested chunk type.
data = conll2000.chunked_sents('train.txt', chunk_types=[TYPE])

# Calculate statistics
def get_seq(data, chunk_type=None):
    """Collect a three-symbol context window for every chunk of one type.

    Each sentence is scanned left to right while a sliding window of the
    last three symbols is maintained.  A symbol is the label of a chunk
    node or, for a plain ``(word, tag)`` item, the tag.  The window is reset
    to ``"<START>"`` padding at the beginning of every sentence.  Whenever
    the symbol just shifted in belongs to a chunk labelled ``chunk_type``,
    the current window (the two preceding symbols plus the chunk label) is
    recorded.

    Args:
        data: Iterable of sentences; each sentence is an iterable whose
            items are either chunk nodes (exposing ``label()`` as in NLTK 3,
            or a ``node`` attribute as in NLTK 2) or indexable
            ``(word, tag)`` pairs.
        chunk_type: Chunk label to collect.  Defaults to the module-level
            ``TYPE`` when omitted, which keeps old one-argument calls working.

    Returns:
        A list of 3-tuples, one per matching chunk, in input order.
    """
    if chunk_type is None:
        chunk_type = TYPE

    tuples = []
    for row in data:
        # Use a window of 3 symbols, padded at the start of each sentence.
        window = ("<START>", "<START>", "<START>")
        for item in row:
            # Decide explicitly what kind of item this is instead of relying
            # on a bare ``except``: on NLTK 3 ``Tree.node`` raises, and a
            # catch-all handler would silently treat every chunk as a token.
            label = getattr(item, "label", None)
            if callable(label):
                symbol, is_chunk = label(), True      # NLTK 3 chunk node
            elif hasattr(item, "node"):
                symbol, is_chunk = item.node, True    # NLTK 2 chunk node
            else:
                symbol, is_chunk = item[1], False     # plain (word, tag)

            # After each item shift the window one position to the left.
            window = (window[1], window[2], symbol)
            # Only chunk nodes are recorded; a token whose tag happens to
            # equal chunk_type must not count as a chunk.
            if is_chunk and symbol == chunk_type:
                tuples.append(window)
    return tuples

# Calculate the statistics using the function from chapter 2
def count_freq(tuples):
    """Print each distinct sequence with its count, most frequent first.

    Every output line consists of the integer frequency, a space, and the
    sequence, as the exercise requires.

    Args:
        tuples: Iterable of hashable sequences (e.g. the 3-tuples returned
            by ``get_seq``).

    Returns:
        None; the ranked list is written to standard output.
    """
    fdist = FreqDist(tuples)
    # Sort explicitly: only NLTK 2's FreqDist iterates in decreasing
    # frequency order, so relying on iteration order breaks the ranking on
    # newer releases.  The sort is stable, so ties keep their prior order.
    ranked = sorted(fdist.items(), key=lambda pair: pair[1], reverse=True)
    for seq, freq in ranked:
        # A single pre-formatted string prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (freq, seq))

# Tasks 1 and 2: collect the sequences from data, then print their frequencies.
count_freq(get_seq(data))
	#Pick one of the three chunk types in the CoNLL corpus.
	#Write functions to do the following tasks for your chosen type:
	#1. List all the tag sequences that occur with each instance of this chunk type.
	#2. Count the frequency of each tag sequence, and produce a ranked list in order of decreasing
	# frequency; each line should consist of an integer (the frequency) and the tag sequence.
	#3. Inspect the high-frequency tag sequences. Use these as the basis for developing a better chunker.

	import nltk, sys
	from nltk.corpus import conll2000
	from nltk.probability import FreqDist

	# Chunk type to analyse; the exercise lets us choose NP, VP or PP.
	TYPE = "NP"
	# Use the train set of conll2000, passing TYPE as the only requested chunk type.
	data = conll2000.chunked_sents('train.txt', chunk_types=[TYPE])

	# Calculate statistics
	def get_seq(data, chunk_type=None):
	    """Collect a three-symbol context window for every chunk of one type.

	    Each sentence is scanned left to right while a sliding window of the
	    last three symbols is maintained.  A symbol is the label of a chunk
	    node or, for a plain ``(word, tag)`` item, the tag.  The window is
	    reset to ``"<START>"`` padding at the beginning of every sentence.
	    Whenever the symbol just shifted in belongs to a chunk labelled
	    ``chunk_type``, the current window (the two preceding symbols plus
	    the chunk label) is recorded.

	    Args:
	        data: Iterable of sentences; each sentence is an iterable whose
	            items are either chunk nodes (exposing ``label()`` as in
	            NLTK 3, or a ``node`` attribute as in NLTK 2) or indexable
	            ``(word, tag)`` pairs.
	        chunk_type: Chunk label to collect.  Defaults to the module-level
	            ``TYPE`` when omitted, which keeps one-argument calls working.

	    Returns:
	        A list of 3-tuples, one per matching chunk, in input order.
	    """
	    if chunk_type is None:
	        chunk_type = TYPE

	    tuples = []
	    for row in data:
	        # Use a window of 3 symbols, padded at the start of each sentence.
	        window = ("<START>", "<START>", "<START>")
	        for item in row:
	            # Decide explicitly what kind of item this is instead of
	            # relying on a bare ``except``: on NLTK 3 ``Tree.node`` raises,
	            # and a catch-all handler would treat every chunk as a token.
	            label = getattr(item, "label", None)
	            if callable(label):
	                symbol, is_chunk = label(), True      # NLTK 3 chunk node
	            elif hasattr(item, "node"):
	                symbol, is_chunk = item.node, True    # NLTK 2 chunk node
	            else:
	                symbol, is_chunk = item[1], False     # plain (word, tag)

	            # After each item shift the window one position to the left.
	            window = (window[1], window[2], symbol)
	            # Only chunk nodes are recorded; a token whose tag happens to
	            # equal chunk_type must not count as a chunk.
	            if is_chunk and symbol == chunk_type:
	                tuples.append(window)
	    return tuples

	# Calculate the statistics using the function from chapter 2
	def count_freq(tuples):
	    """Print each distinct sequence with its count, most frequent first.

	    Every output line consists of the integer frequency, a space, and
	    the sequence, as the exercise requires.

	    Args:
	        tuples: Iterable of hashable sequences (e.g. the 3-tuples
	            returned by ``get_seq``).

	    Returns:
	        None; the ranked list is written to standard output.
	    """
	    fdist = FreqDist(tuples)
	    # Sort explicitly: only NLTK 2's FreqDist iterates in decreasing
	    # frequency order, so relying on iteration order breaks the ranking
	    # on newer releases.  The sort is stable, so ties keep their order.
	    ranked = sorted(fdist.items(), key=lambda pair: pair[1], reverse=True)
	    for seq, freq in ranked:
	        # A single pre-formatted string prints identically under the
	        # Python 2 print statement and the Python 3 print function.
	        print("%s %s" % (freq, seq))

	# Tasks 1 and 2: collect the sequences from data, then print their frequencies.
	count_freq(get_seq(data))