Last active
May 4, 2018 18:32
-
-
Save widiger-anna/1641800362ac88ed44745e61198114c9 to your computer and use it in GitHub Desktop.
NLP recipe 2: processing Yelp reviews with spaCy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals, print_function | |
import spacy | |
from spacy.attrs import ORTH, LEMMA, NORM, TAG | |
import re | |
'''
NLP tasks:
1. expand contractions to improve tokenization
2. filter stopwords and punctuation
3. track named entities (locations, names, money, etc.)
'''
# Load the small English pipeline (tagger, parser, NER).
nlp = spacy.load('en_core_web_sm')
# Review source: https://www.yelp.com/biz/dandelion-chocolate-san-francisco?hrid=G24-l-IvBCdQ1JR5f6H-mg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct)
chocolate_neg = ("We drove 30min out of our way to get some dessert and ordered a ton to go... "
                 "Only to find that they didn't give us what we ordered. "
                 "What we did get was not good. We called and they refunded us $4.... "
                 "Ridiculous. Don't waste your time or money here. Sixth Course right around the corner is WAY better.")
# Expand "xn't" contractions to "x not" with a regex — noted as inefficient
# in the original; kept here for comparison with the tokenizer-exception approach.
chocolate_neg = re.sub(r'(\w+)n\'t', r'\g<1> not', chocolate_neg)
# Expand contractions using spaCy's tokenizer exceptions.
# Each key maps a contracted surface form to its component tokens; the
# "n't" piece is lemmatized/normalized to "not" and tagged RB (adverb).
# NOTE(review): this mapping is defined but never registered on the
# tokenizer (e.g. via nlp.tokenizer.add_special_case) — confirm intent.
TOKENIZER_EXCEPTIONS = {
    # do
    "don't": [
        {ORTH: "do", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "doesn't": [
        {ORTH: "does", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "didn't": [
        {ORTH: "did", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
}

# Parse the review text.
# Bug fix: this statement originally sat INSIDE the dict literal above
# (before the closing brace), which is a syntax error; moved after it.
doc2 = nlp(chocolate_neg)
# Test 1: filter out stopwords and punctuation, print lemma + coarse POS.
for token in doc2:
    # skip tokens on spaCy's built-in stopword list or flagged as punctuation
    if token.is_stop or token.is_punct:
        continue
    print(token.lemma_, token.pos_)
# Test 2: print surface text and label for the entity types we care about.
tracked_ne = ['PERSON', 'ORG', 'GPE', 'DATE', 'MONEY', 'CARDINAL']  # named-entity labels of interest
for ent in doc2.ents:
    if ent.label_ not in tracked_ne:
        continue
    print(ent.text, ent.label_)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment