Last active
May 4, 2018 18:32
-
-
Save widiger-anna/7f8ff5098888691c7514de788861d2ea to your computer and use it in GitHub Desktop.
NLP recipe 1: processing tweets with spaCy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals, print_function
# Setup: pip install spacy && python -m spacy download en_core_web_sm
import spacy

'''
NLP recipe: processing a tweet with spaCy.

Demonstrated tasks:
1. tokenization
2. part-of-speech tagging
3. chunking (base noun phrases)
'''

# Load the small English pipeline (must be downloaded beforehand, see above).
nlp = spacy.load('en_core_web_sm')

# Tweet source: https://twitter.com/dataandme/status/989938791744987137
doc1 = nlp(u"⭐️ intro to a 🌟 tool! \"🕵️ RegExplain\" by @grrrck")

# Tokenization + POS tagging: print each non-punctuation token with its tag.
for token in doc1:
    if not token.is_punct:  # filter punctuation
        print(token.text, token.pos_)

# Noun chunking: print each base noun phrase found in the doc.
for chunk in doc1.noun_chunks:
    print(chunk.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.