NLTK Sentence extraction
import nltk
import re
import json
import os
import sys

# a function to convert a tree into a nested dict for json encoding
def tree_to_dict(tree):
    return {tree.label(): [tree_to_dict(t) if isinstance(t, nltk.Tree) else t for t in tree]}
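# For example (hypothetical tree, not from the book text), a chunked NP
# becomes a nested {label: children} dict; json will render the leaf
# (word, tag) tuples as two-element lists:
#
#   >>> tree_to_dict(nltk.Tree("NP", [("the", "DT"), ("dog", "NN")]))
#   {'NP': [('the', 'DT'), ('dog', 'NN')]}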
# download the required corpora and models
required_downloads = [
    "hmm_treebank_pos_tagger",
    "maxent_ne_chunker",
    "maxent_treebank_pos_tagger",
    "punkt",
    "treebank",
    "words",
]
for dl in required_downloads:
    nltk.download(dl)
# open and read the text into memory
with open("./a_game_of_thrones.txt", "r") as fp:
    document = fp.read()

# break it out by chapter: chapter headings are all-caps lines on their own
chapters = re.split(r"\n([A-Z][A-Z0-9]+) ?\r?\n", document, flags=re.MULTILINE)[1:]
chapters = list(zip(chapters[0::2], chapters[1::2]))
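# re.split keeps the captured heading, so the result interleaves titles and
# bodies: [title1, body1, title2, body2, ...]; zipping the even- and odd-
# indexed slices pairs them back up. For example (hypothetical input):
#
#   >>> re.split(r"\n([A-Z][A-Z0-9]+) ?\r?\n", "\nPROLOGUE\nSome text.")[1:]
#   ['PROLOGUE', 'Some text.']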
# prepare the parser for chunking
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
chunker = nltk.RegexpParser(grammar)
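# A quick sanity check of the cascade on a tiny hand-tagged sentence
# (hypothetical, not from the book text). NPs are chunked first, then the
# trailing verb + NP becomes a VP, and the NP + VP pair becomes a CLAUSE:
#
#   >>> chunker.parse([("the", "DT"), ("dog", "NN"),
#   ...                ("saw", "VBD"), ("a", "DT"), ("cat", "NN")])
#   Tree('S', [Tree('CLAUSE', [
#       Tree('NP', [('the', 'DT'), ('dog', 'NN')]),
#       Tree('VP', [('saw', 'VBD'), Tree('NP', [('a', 'DT'), ('cat', 'NN')])])])])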
# make sure the output directory exists, then process each chapter
os.makedirs("chapters", exist_ok=True)
for c, (title, body) in enumerate(chapters, start=1):
    print("Chapter %d: %s" % (c, title))
    sentences = nltk.sent_tokenize(body)
    sys.stdout.write("%d sentences extracted..." % len(sentences))
    sys.stdout.flush()
    sys.stdout.write("Tokenizing...")
    sys.stdout.flush()
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sys.stdout.write("Tagging...")
    sys.stdout.flush()
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sys.stdout.write("Chunking...")
    sys.stdout.flush()
    sentences = [chunker.parse(sent) for sent in sentences]
    # convert the parse trees into plain dicts so they can be json-encoded
    sentences = [tree_to_dict(sent) for sent in sentences]
    # sentences[0].draw()
    filename = "chapters/%02d-%s.json" % (c, title)
    sys.stdout.write("Save to: " + filename + "...")
    sys.stdout.flush()
    with open(filename, "w") as fp:
        fp.write(json.dumps(sentences, sort_keys=True, indent=4, separators=(',', ': ')))
    print("Done.")