Skip to content

Instantly share code, notes, and snippets.

@widiger-anna
Last active May 4, 2018 18:32
Show Gist options
  • Save widiger-anna/1641800362ac88ed44745e61198114c9 to your computer and use it in GitHub Desktop.
Save widiger-anna/1641800362ac88ed44745e61198114c9 to your computer and use it in GitHub Desktop.
NLP recipe 2: processing Yelp reviews with spaCy
from __future__ import unicode_literals, print_function
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
import re
"""
NLP tasks:
1. expand contractions to improve tokenization
2. filter stopwords and punctuation
3. track named entities (locations, names, money, etc.)
"""
# Load the English language model once; `nlp` is the callable pipeline that
# the rest of the script applies to the review text.
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)
# Review text used as the sample document. Link to review:
# https://www.yelp.com/biz/dandelion-chocolate-san-francisco?hrid=G24-l-IvBCdQ1JR5f6H-mg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct)
chocolate_neg = (
    "We drove 30min out of our way to get some dessert and ordered a ton to go... "
    "Only to find that they didn't give us what we ordered. "
    "What we did get was not good. We called and they refunded us $4.... "
    "Ridiculous. Don't waste your time or money here. "
    "Sixth Course right around the corner is WAY better."
)

# Expand contractions using regular expressions - not efficient!
# Captures the word stem in front of "n't" and rewrites it as "<stem> not".
_NT_CONTRACTION = re.compile(r'(\w+)n\'t')
chocolate_neg = _NT_CONTRACTION.sub(r'\g<1> not', chocolate_neg)
# Expand contractions using spaCy's tokenizer exceptions.
# Each key is a contracted surface form; its value lists the sub-tokens it is
# split into, given as {attribute ID: value} dicts built from spacy.attrs.
# NOTE(review): nothing in this script passes TOKENIZER_EXCEPTIONS to the
# tokenizer, so as written it has no effect on `doc2` -- presumably it is meant
# to be registered via the tokenizer's special-case API; confirm against the
# installed spaCy version before wiring it in.
TOKENIZER_EXCEPTIONS = {
    # do
    "don't": [
        {ORTH: "do", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "doesn't": [
        {ORTH: "does", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
    "didn't": [
        {ORTH: "did", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"},
    ],
}

# Run the pipeline over the review text; this must be a statement of its own,
# outside the dict literal, for the module to parse.
doc2 = nlp(chocolate_neg)
# Test 1: Filter stopwords and punctuation, look at POS
# Prints the lemma and part-of-speech tag of every remaining token in doc2.
for token in doc2:
    # is_stop / is_punct use the built-in lists of stopwords and punctuation
    if token.is_stop or token.is_punct:
        continue
    print(token.lemma_, token.pos_)
# Test 2: look at NE labels
# Prints the text and label of each entity in doc2 whose label is tracked.
tracked_ne = ['PERSON', 'ORG', 'GPE', 'DATE', 'MONEY', 'CARDINAL']  # List of named entity labels
for named_entity in doc2.ents:
    if named_entity.label_ in tracked_ne:
        print(named_entity.text, named_entity.label_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment