Last active
July 24, 2020 19:06
-
-
Save EnkrateiaLucca/2472c878feff828a70234f72adf13d1e to your computer and use it in GitHub Desktop.
chunking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Chunk one English sentence with an NLTK regular-expression grammar.

The script tokenizes a sentence, POS-tags the tokens with NLTK, parses the
tagged tokens with a ``RegexpParser`` built from ``pattern`` and draws the
resulting tree.
"""

import nltk
import spacy
from nltk.tokenize import word_tokenize

# NOTE(review): the spaCy pipeline is loaded but nothing below uses it; it is
# kept only so the module-level name `sp` stays available.
sp = spacy.load("en_core_web_sm")

sentence = "Only accept truths that are absolutely evident"

# Tokenize the sentence.
# NOTE(review): presumably needs the NLTK tokenizer data installed -- confirm.
sentence_token = word_tokenize(sentence)

# Tag the tokens of the sentence with their parts of speech.
# NOTE(review): presumably needs the NLTK tagger data installed -- confirm.
sentence_tagged = nltk.pos_tag(sentence_token)
print(sentence_tagged)

# Establish the grammar handed to the chunker: one "LABEL: {tag pattern}"
# rule per line.
# NOTE(review): tag names such as ADJ_SIM, V_PRS, N_SING, DET and PRO do not
# look like the tags nltk.pos_tag emits for English (presumably Penn Treebank,
# e.g. NNS, VBP, JJ), so only the generic <V.*> / <N.*> rules are likely to
# match -- confirm the intended tagger/tagset.
pattern = r"""
VP: {<ADJ_SIM><V_PRS>}
VP: {<ADJ_INO><V.*>}
VP: {<V_PRS><N_SING><V_SUB>}
NP: {<N_SING><ADJ.*><N_SING>}
NP: {<N.*><PRO>}
VP: {<N_SING><V_.*>}
VP: {<V.*>+}
NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
DNP: {<DET><NP>}
PP: {<ADJ_CMPR><P>}
PP: {<ADJ_SIM><P>}
PP: {<P><N_SING>}
PP: {<P>*}
DDNP: {<NP><DNP>}
NPP: {<PP><NP>+}
"""

# Chunking: build the parser straight from the grammar string and parse once.
chunker = nltk.RegexpParser(pattern)
Output = chunker.parse(sentence_tagged)

# Render the chunk tree graphically (needs a display to be available).
Output.draw()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment