Last active
April 12, 2019 15:18
-
-
Save bdewilde/5393079 to your computer and use it in GitHub Desktop.
basic regular expression chunker and chunk-getter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def chunk_tagged_sents(tagged_sents):
    """Chunk POS-tagged sentences with a small regular-expression grammar.

    Each sentence is handed to an NLTK ``RegexpParser`` built from the
    grammar below, which defines NP, PP, VP and CLAUSE chunks.

    Args:
        tagged_sents: iterable of POS-tagged sentences; each one is passed
            unchanged to ``RegexpParser.parse`` (presumably a list of
            ``(word, tag)`` pairs -- confirm against the caller).

    Returns:
        A list with one parse result per input sentence, in input order.
    """
    from nltk.chunk import regexp

    # Chunk "grammar": one rule per line, a tag pattern inside the braces.
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN.*>+} # noun phrase
        PP: {<IN><NP>} # prepositional phrase
        VP: {<MD>?<VB.*><NP|PP>} # verb phrase
        CLAUSE: {<NP><VP>} # full clause
        """
    # loop=2 asks the parser to run through the whole rule set twice.
    parser = regexp.RegexpParser(grammar, loop=2)

    parsed_sents = []
    for sentence in tagged_sents:
        parsed_sents.append(parser.parse(sentence))
    return parsed_sents
def _node_label(subtree):
    """Return a tree node's label on both NLTK 3 (``label()``) and NLTK 2 (``.node``)."""
    label = getattr(subtree, 'label', None)
    if callable(label):
        return label()
    # Legacy NLTK 2 trees expose the label as a plain attribute.
    return subtree.node


def get_chunks(chunked_sents, chunk_type='NP'):
    """Extract the text of every chunk of a given type from chunked sentences.

    Args:
        chunked_sents: iterable of chunk trees; each must provide
            ``subtrees()`` and its subtrees must provide ``leaves()`` plus a
            label (``label()`` method, or a ``node`` attribute on old NLTK).
        chunk_type: label of the chunks to collect (default ``'NP'``).

    Returns:
        A list with one entry per input tree; each entry is a list of
        strings, one per matching subtree, made of that subtree's words
        joined by single spaces (POS tags are dropped).
    """
    all_chunks = []
    # Chunked sentences are nested trees; subtrees() also visits nested chunks.
    for tree in chunked_sents:
        chunks = [
            # Each leaf is indexed at [0] to keep the word and drop the tag.
            ' '.join(word_tag[0] for word_tag in subtree.leaves())
            for subtree in tree.subtrees()
            if _node_label(subtree) == chunk_type
        ]
        all_chunks.append(chunks)
    return all_chunks
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey, I appreciate your code. In my case, I want to write a grammar like this:
but I find it difficult to do so. Do you have any documentation that would help me?