# bdewilde/basic_chunking.py

## basic_chunking.py
def chunk_tagged_sents(tagged_sents):
    """Chunk POS-tagged sentences with a regular-expression grammar.

    Every item of ``tagged_sents`` is handed to ``RegexpParser.parse``
    and the parse results are returned as a list, in input order.
    """
    from nltk.chunk import regexp

    # chunking rules; the later rules are built from earlier chunk labels
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+} # noun phrase
        PP: {<IN><NP>}               # prepositional phrase
        VP: {<MD>?<VB.*><NP|PP>}     # verb phrase
        CLAUSE: {<NP><VP>}           # full clause
	"""
    # loop=2 is forwarded to NLTK; per its docs this is the number of
    # passes made over the rule set
    parser = regexp.RegexpParser(grammar, loop=2)

    chunked_sents = []
    for tagged_sent in tagged_sents:
        chunked_sents.append(parser.parse(tagged_sent))
    return chunked_sents

def get_chunks(chunked_sents, chunk_type='NP'):
    """Collect the text of every chunk of one type from chunked sentences.

    chunked_sents: iterable of trees; each tree must provide
        ``subtrees()`` and ``leaves()``, and expose its label either
        through a ``label()`` method or a ``node`` attribute.
    chunk_type: label of the chunks to keep (default ``'NP'``).

    Returns a list with one entry per input tree; each entry is a list
    of strings, one per matching subtree, made of that subtree's leaf
    words joined by single spaces.
    """

    def label_of(subtree):
        # NLTK 3 replaced the ``node`` attribute with ``label()`` (reading
        # ``node`` there raises); older trees only have ``node``.
        label = getattr(subtree, 'label', None)
        return label() if callable(label) else subtree.node

    all_chunks = []
    # chunked sentences are in the form of nested trees
    for tree in chunked_sents:
        chunks = []
        for subtree in tree.subtrees():
            if label_of(subtree) != chunk_type:
                continue
            # drop POS tags, keep words (the word is item 0 of each leaf)
            chunks.append(' '.join(leaf[0] for leaf in subtree.leaves()))
        all_chunks.append(chunks)

    return all_chunks
	def chunk_tagged_sents(tagged_sents):
		"""Chunk POS-tagged sentences with a regular-expression grammar.

		Every item of ``tagged_sents`` is handed to ``RegexpParser.parse``
		and the parse results are returned as a list, in input order.
		"""
		from nltk.chunk import regexp

		# define a chunk "grammar", i.e. chunking rules.
		# The tag patterns use a bare "|" for alternation and "*" for
		# "zero or more"; an escaped "\|" would only match a literal pipe
		# inside a tag, and without "*" adjectives become mandatory and
		# plain "NN" tags are excluded.
		grammar = r"""
			NP: {<DT|PP\$>?<JJ>*<NN.*>+} # noun phrase
			PP: {<IN><NP>}               # prepositional phrase
			VP: {<MD>?<VB.*><NP|PP>}     # verb phrase
			CLAUSE: {<NP><VP>}           # full clause
		"""
		# loop=2 is forwarded to NLTK; per its docs this is the number of
		# passes made over the rule set
		chunker = regexp.RegexpParser(grammar, loop=2)
		chunked_sents = [chunker.parse(tagged_sent) for tagged_sent in tagged_sents]

		return chunked_sents

	def get_chunks(chunked_sents, chunk_type='NP'):
		"""Collect the text of every chunk of one type from chunked sentences.

		chunked_sents: iterable of trees; each tree must provide
			``subtrees()`` and ``leaves()``, and expose its label either
			through a ``label()`` method or a ``node`` attribute.
		chunk_type: label of the chunks to keep (default ``'NP'``).

		Returns a list with one entry per input tree; each entry is a list
		of strings, one per matching subtree, made of that subtree's leaf
		words joined by single spaces.
		"""

		def label_of(subtree):
			# NLTK 3 replaced the ``node`` attribute with ``label()`` (reading
			# ``node`` there raises); older trees only have ``node``.
			label = getattr(subtree, 'label', None)
			return label() if callable(label) else subtree.node

		all_chunks = []
		# chunked sentences are in the form of nested trees
		for tree in chunked_sents:
			chunks = []
			for subtree in tree.subtrees():
				if label_of(subtree) != chunk_type:
					continue
				# drop POS tags, keep words (the word is item 0 of each leaf)
				chunks.append(' '.join(leaf[0] for leaf in subtree.leaves()))
			all_chunks.append(chunks)

		return all_chunks