# kkoch986/extraction.py

## extraction.py
import nltk, re, pprint
import json
import sys

# Recursively turn a tree into plain dicts and lists so json can encode it.
def tree_to_dict(tree):
    """Return ``{label: children}`` for *tree*, converting subtrees recursively.

    Children that are themselves ``nltk.Tree`` instances become nested
    dictionaries of the same shape; every other child is kept unchanged.
    """
    children = []
    for child in tree:
        if isinstance(child, nltk.Tree):
            children.append(tree_to_dict(child))
        else:
            children.append(child)
    return {tree.label(): children}

# Ask NLTK to download each resource named below before any text is
# processed.
required_downloads = [
	"hmm_treebank_pos_tagger", "maxent_ne_chunker", "maxent_treebank_pos_tagger",
	"punkt", "treebank", "words",
]
for resource_name in required_downloads:
	nltk.download(resource_name)

# Read the whole input text into memory. The `with` block guarantees the
# file handle is closed even if the read raises.
file = "./a_game_of_thrones.txt"
with open(file, 'r') as fp:
	document = fp.read()

# Break the text into chapters. A chapter heading is a line made of one capital
# letter followed by one or more capitals/digits, optionally followed by a
# single space and/or a carriage return. Because the heading is a capture
# group, re.split() returns [preamble, heading, body, heading, body, ...];
# the [1:] drops whatever precedes the first heading.
# maxsplit/flags are passed by keyword because passing them positionally is
# deprecated since Python 3.13. re.MULTILINE is kept from the original even
# though the pattern has no ^/$ anchors for it to affect.
chapters = re.split("\n([A-Z][A-Z0-9]+)[ ]?[\r]?\n", document, maxsplit=0, flags=re.MULTILINE)[1:]
# Pair each heading with the text that follows it. list() keeps the result a
# real list on Python 3, where zip() is lazy.
chapters = list(zip(chapters[0::2], chapters[1::2]))

# prepare the parser for chunking
# The grammar string declares four chunk rules over part-of-speech tags
# (NP, PP, VP, CLAUSE). The text after each '#' inside the string is part of
# the grammar literal itself, not a Python comment. The chunker built from it
# is applied to every tagged sentence in the chapter loop further down.
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""
chunker = nltk.RegexpParser(grammar)

# Process each chapter: split it into sentences, tokenize, POS-tag and chunk
# them, then save the result as JSON under chapters/.
def _progress(message):
	"""Write *message* to stdout without a trailing newline and flush it."""
	sys.stdout.write(message)
	sys.stdout.flush()


# enumerate(..., 1) replaces the hand-maintained counter; chapters are numbered
# from 1 in both the console output and the output file names.
for c, chapter in enumerate(chapters, 1):
	title, text = chapter
	# print(...) with a single string argument behaves the same on Python 2
	# and Python 3, unlike the print statement.
	print("Chapter " + str(c) + ": " + title)

	sentences = nltk.sent_tokenize(text)
	_progress(str(len(sentences)) + " sentences extracted...")

	_progress("Tokenizing...")
	sentences = [nltk.word_tokenize(sent) for sent in sentences]

	_progress("Tagging...")
	sentences = [nltk.pos_tag(sent) for sent in sentences]

	_progress("Chunking...")
	sentences = [chunker.parse(sent) for sent in sentences]

	# Convert the parse trees into plain dicts/lists so json.dumps can encode them.
	sentences = [tree_to_dict(sent) for sent in sentences]

	# The chapter number is zero-padded to two digits, e.g. "chapters/03-BRAN.json".
	filename = "chapters/%02d-%s.json" % (c, title)
	_progress("Save to: " + filename + "...")
	# `with` closes the output file even if serialization or the write fails.
	with open(filename, "w") as fp:
		fp.write(json.dumps(sentences, sort_keys=True, indent=4, separators=(',', ': ')))

	print("Done.")
	# NOTE(review): everything from here through the final `print "Done."` below
	# repeats the script above, but with the nested indentation flattened to a
	# single tab (so the `def` and `for` bodies are no longer indented under
	# their headers) and with the `|` in the chunk grammar written as `\|`.
	# This looks like a paste/extraction artifact rather than intended code —
	# TODO confirm and remove.
	import nltk, re, pprint
	import json
	import sys

	# a function to convert a tree into an array for json encoding
	def tree_to_dict(tree):
	return {tree.label(): [tree_to_dict(t) if isinstance(t, nltk.Tree) else t for t in tree]}

	# download required corpora
	required_downloads = [
	"hmm_treebank_pos_tagger",
	"maxent_ne_chunker",
	"maxent_treebank_pos_tagger",
	"punkt",
	"treebank",
	"words"
	]
	for dl in required_downloads:
	nltk.download(dl)

	# open and read the text into memory
	file = "./a_game_of_thrones.txt"
	fp = open(file, 'r')
	document = fp.read()
	fp.close()

	# break it out by chapter
	chapters = re.split("\n([A-Z][A-Z0-9]+)[ ]?[\r]?\n", document, 0, re.MULTILINE)[1:]
	chapters = zip(chapters[0::2], chapters[1::2])

	# prepare the parser for chunking
	# NOTE(review): unlike the grammar earlier in the file, the alternations in
	# this copy are written with `\|` — TODO confirm which form is intended.
	grammar = r"""
	NP: {<DT\|JJ\|NN.*>+} # Chunk sequences of DT, JJ, NN
	PP: {<IN><NP>} # Chunk prepositions followed by NP
	VP: {<VB.*><NP\|PP\|CLAUSE>+$} # Chunk verbs and their arguments
	CLAUSE: {<NP><VP>} # Chunk NP, VP
	"""
	chunker = nltk.RegexpParser(grammar)

	# process each chapter
	c = 0
	for chapter in chapters:
	c = c + 1
	print "Chapter " + c.__str__() + ": " + chapter[0]

	sentences = nltk.sent_tokenize(chapter[1])
	sys.stdout.write(sentences.__len__().__str__() + " sentences extracted...")
	sys.stdout.flush()

	sys.stdout.write("Tokenizing...")
	sys.stdout.flush()
	sentences = [nltk.word_tokenize(sent) for sent in sentences]

	sys.stdout.write("Tagging...")
	sys.stdout.flush()
	sentences = [nltk.pos_tag(sent) for sent in sentences]

	sys.stdout.write("Chunking...")
	sys.stdout.flush()
	sentences = [chunker.parse(sent) for sent in sentences]

	sentences = [tree_to_dict(sent) for sent in sentences]
	# sentences[0].draw()

	filename = "chapters/" + ("0" if c < 10 else "") + c.__str__() + "-" + chapter[0] + ".json"
	sys.stdout.write("Save to: " + filename + "...")
	sys.stdout.flush()
	fp = open(filename, "w")
	fp.write(json.dumps(sentences, sort_keys=True, indent=4, separators=(',', ': ')))
	fp.close()

	print "Done."