Last active
April 12, 2018 05:33
-
-
Save kkoch986/dc2999488e0bb92cc8c3 to your computer and use it in GitHub Desktop.
NLTK Sentence extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk, re, pprint | |
import json | |
import sys | |
# a function to convert a tree into nested dicts/lists for json encoding
def tree_to_dict(tree):
    """Convert an nltk.Tree into a structure that json.dumps can encode.

    The result is a single-key dict mapping ``tree.label()`` to a list of the
    tree's children. Children that are themselves nltk.Tree instances are
    converted recursively; every other child (a leaf) is kept unchanged.
    """
    return {
        tree.label(): [
            tree_to_dict(child) if isinstance(child, nltk.Tree) else child
            for child in tree
        ]
    }
# download required corpora
# Each entry is an NLTK resource id; they are fetched one at a time with
# nltk.download() before any processing starts.
required_downloads = [
    "hmm_treebank_pos_tagger",
    "maxent_ne_chunker",
    "maxent_treebank_pos_tagger",
    "punkt",
    "treebank",
    "words",
]
for resource_id in required_downloads:
    nltk.download(resource_id)
# open and read the text into memory
# The whole book is loaded as one string. The with-block guarantees the
# handle is closed even if read() raises.
book_path = "./a_game_of_thrones.txt"
with open(book_path, 'r') as fp:
    document = fp.read()
# break it out by chapter
# A chapter heading is a line made of an uppercase letter followed by one or
# more uppercase letters/digits, optionally followed by a single space and/or
# a carriage return. Because the pattern has a capture group, re.split()
# returns [preamble, title1, text1, title2, text2, ...]; the [1:] slice drops
# the preamble before the first heading.
# maxsplit/flags are passed by keyword (positional use is deprecated in newer
# Python versions). re.MULTILINE is kept from the original although the
# pattern contains no ^ or $ anchors.
chapter_parts = re.split(
    "\n([A-Z][A-Z0-9]+)[ ]?[\r]?\n", document, maxsplit=0, flags=re.MULTILINE
)[1:]
# Pair every title with the text that follows it: [(title, text), ...].
# list() keeps this a concrete list on Python 3 as it already was on Python 2.
chapters = list(zip(chapter_parts[0::2], chapter_parts[1::2]))
# prepare the parser for chunking
# Each line of the grammar names a chunk type and gives a tag pattern for it;
# the text after '#' on a rule line is the grammar's own comment.
# NOTE(review): the VP rule refers to CLAUSE, which is only defined by the
# last rule — confirm whether the parser should be built with more than one
# pass (RegexpParser's `loop` argument) for nested clauses to be picked up.
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
chunker = nltk.RegexpParser(grammar)
# process each chapter
def _progress(message):
    """Write a progress message to stdout without a newline and flush it."""
    sys.stdout.write(message)
    sys.stdout.flush()


# For every (title, text) pair: split into sentences, tokenize, POS-tag,
# chunk with the regexp grammar, convert the trees to plain dicts, and write
# the result to chapters/<NN>-<TITLE>.json.
# NOTE: open() does not create directories, so chapters/ must already exist.
for c, chapter in enumerate(chapters, 1):
    title, text = chapter
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Chapter " + str(c) + ": " + title)

    sentences = nltk.sent_tokenize(text)
    _progress(str(len(sentences)) + " sentences extracted...")

    _progress("Tokenizing...")
    sentences = [nltk.word_tokenize(sent) for sent in sentences]

    _progress("Tagging...")
    sentences = [nltk.pos_tag(sent) for sent in sentences]

    _progress("Chunking...")
    sentences = [chunker.parse(sent) for sent in sentences]
    # Trees are not JSON-serializable; flatten them into dicts/lists first.
    sentences = [tree_to_dict(sent) for sent in sentences]

    # Chapter number is zero-padded to two digits (01, 02, ..., 10, 11, ...).
    filename = "chapters/" + ("%02d" % c) + "-" + title + ".json"
    _progress("Save to: " + filename + "...")
    with open(filename, "w") as fp:
        fp.write(json.dumps(sentences, sort_keys=True, indent=4, separators=(',', ': ')))
    print("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment