Last active
May 4, 2018 18:32
-
-
Save widiger-anna/1641800362ac88ed44745e61198114c9 to your computer and use it in GitHub Desktop.
NLP recipe 2: processing Yelp reviews with spaCy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import unicode_literals, print_function | |
import spacy | |
from spacy.attrs import ORTH, LEMMA, NORM, TAG | |
import re | |
'''
NLP tasks:
1. expand contractions to improve tokenization
2. filter stopwords and punctuation
3. track named entities (locations, names, money, etc.)
'''
# Load the small English pipeline (tagger, parser, NER).
nlp = spacy.load('en_core_web_sm')
# Review source: https://www.yelp.com/biz/dandelion-chocolate-san-francisco?hrid=G24-l-IvBCdQ1JR5f6H-mg&utm_campaign=www_review_share_popup&utm_medium=copy_link&utm_source=(direct)
chocolate_neg = ("We drove 30min out of our way to get some dessert and ordered a ton to go... "
                 "Only to find that they didn't give us what we ordered. "
                 "What we did get was not good. We called and they refunded us $4.... "
                 "Ridiculous. Don't waste your time or money here. Sixth Course right around the corner is WAY better.")
# Expand "xn't" contractions to "x not" with a regex — noted as inefficient
# in the original; kept here for comparison with the tokenizer-exception approach.
chocolate_neg = re.sub(r'(\w+)n\'t', r'\g<1> not', chocolate_neg)
# Expand contractions using spaCy's tokenizer exceptions.
# Each key maps a contracted surface form to its component tokens; the
# "n't" piece is lemmatized/normalized to "not" and tagged RB (adverb).
# NOTE(review): this mapping is defined but never registered on the
# tokenizer (e.g. via nlp.tokenizer.add_special_case) — confirm intent.
TOKENIZER_EXCEPTIONS = {
    # do
    "don't": [
        {ORTH: "do", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "doesn't": [
        {ORTH: "does", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
    "didn't": [
        {ORTH: "did", LEMMA: "do"},
        {ORTH: "n't", LEMMA: "not", NORM: "not", TAG: "RB"}],
}

# Parse the review text.
# Bug fix: this statement originally sat INSIDE the dict literal above
# (before the closing brace), which is a syntax error; moved after it.
doc2 = nlp(chocolate_neg)
# Test 1: filter out stopwords and punctuation, print lemma + coarse POS.
for token in doc2:
    # skip tokens on spaCy's built-in stopword list or flagged as punctuation
    if token.is_stop or token.is_punct:
        continue
    print(token.lemma_, token.pos_)
# Test 2: print surface text and label for the entity types we care about.
tracked_ne = ['PERSON', 'ORG', 'GPE', 'DATE', 'MONEY', 'CARDINAL']  # named-entity labels of interest
for ent in doc2.ents:
    if ent.label_ not in tracked_ne:
        continue
    print(ent.text, ent.label_)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment