Word / Rating Association
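This gist pairs a small D3 page with the R script that produces its input. The R script mines Yelp reviews of Starbucks, keeps only the nouns, scores each word by its TF-IDF weight ("interestingness") within each star rating, and writes the top-scoring words to data.csv; the D3 page then draws one circle pack per rating, sizing each word's circle by that score.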
word | interestingness | stars | count | total
---|---|---|---|---
urge | 0.000530717201025371 | 1 | 6 | 7
bonus | 0.00042863450610675 | 5 | 9 | 17
diva | 0.000368326157183909 | 2 | 2 | 2
frappucinos | 0.000368326157183909 | 2 | 2 | 2
urgency | 0.000467617762795816 | 2 | 8 | 21
soap | 0.000621459510173934 | 1 | 4 | 4
febreze | 0.000368326157183909 | 2 | 2 | 2
keys | 0.000442264334187809 | 1 | 5 | 6
cop | 0.000505316233925433 | 4 | 5 | 5
routine | 0.000418793018978061 | 3 | 3 | 3
doo | 0.000466094632630451 | 1 | 3 | 3
view | 0.000449075878926089 | 4 | 14 | 27
understanding | 0.000621459510173934 | 1 | 4 | 4
fish | 0.000419393431119087 | 2 | 4 | 5
crema | 0.000629090146678631 | 2 | 6 | 7
peeve | 0.000552489235775863 | 2 | 3 | 3
piadini | 0.000427144918859817 | 5 | 5 | 6
music | 0.000464514776486304 | 3 | 24 | 81
scence | 0.000466094632630451 | 1 | 3 | 3
outlet | 0.000545306424410251 | 4 | 17 | 27
earplugs | 0.00030318974035526 | 4 | 3 | 3
indoor | 0.000808505974280692 | 4 | 8 | 8
satisfaction | 0.000600214060976441 | 5 | 4 | 4
reading | 0.000476260562340834 | 5 | 10 | 18
butterbeer | 0.000600214060976441 | 5 | 4 | 4
bitterness | 0.000397381138365707 | 3 | 5 | 6
freak | 0.000427144918859817 | 5 | 5 | 6
aid | 0.000368326157183909 | 2 | 2 | 2
father | 0.000466094632630451 | 1 | 3 | 3
airports | 0.000397381138365707 | 3 | 5 | 9
towels | 0.000442264334187809 | 1 | 5 | 7
nugget | 0.000629090146678631 | 2 | 6 | 10
characters | 0.000345226053071781 | 4 | 6 | 8
urgency | 0.000493118497267451 | 1 | 10 | 21
vehicle | 0.000397381138365707 | 3 | 5 | 7
denial | 0.000466094632630451 | 1 | 3 | 3
toilet | 0.000409276266501163 | 1 | 19 | 35
precision | 0.000450160545732331 | 5 | 3 | 3
animals | 0.000552489235775863 | 2 | 3 | 3
habits | 0.000600214060976441 | 5 | 4 | 4
boca | 0.00030318974035526 | 4 | 3 | 3
teacher | 0.000600214060976441 | 5 | 4 | 4
busier | 0.000621459510173934 | 1 | 4 | 4
visa | 0.000552489235775863 | 2 | 3 | 3
elara | 0.000600214060976441 | 5 | 4 | 4
flamingo | 0.000352845333441927 | 4 | 11 | 14
leather | 0.00042863450610675 | 5 | 9 | 19
condescending | 0.000368326157183909 | 2 | 2 | 2
bloomfield | 0.000418793018978061 | 3 | 3 | 3
sticks | 0.000552489235775863 | 2 | 3 | 3
clouds | 0.000404252987140346 | 4 | 4 | 4
kudos | 0.000352845333441927 | 4 | 11 | 25
cheer | 0.000600214060976441 | 5 | 4 | 4
refund | 0.000542430346994197 | 1 | 11 | 15
dunkies | 0.000368326157183909 | 2 | 2 | 2
donuts | 0.000481152727420809 | 4 | 15 | 19
clover | 0.00042863450610675 | 5 | 9 | 13
letter | 0.000466094632630451 | 1 | 3 | 3
pet | 0.000419393431119087 | 2 | 4 | 5
mead | 0.000402763728583745 | 4 | 7 | 10
vicinity | 0.000345226053071781 | 4 | 6 | 7
frequents | 0.000418793018978061 | 3 | 3 | 3
lunches | 0.000450160545732331 | 5 | 3 | 3
sugary | 0.000450160545732331 | 5 | 3 | 3
kudos | 0.000571512674809 | 5 | 12 | 25
excuse | 0.000409165542446339 | 2 | 7 | 16
stocking | 0.000419393431119087 | 2 | 4 | 5
study | 0.000445159994132708 | 3 | 23 | 81
duck | 0.00055633359371199 | 3 | 7 | 8
study | 0.000518447856837101 | 4 | 37 | 81
convention | 0.000797533860096898 | 3 | 18 | 25
hike | 0.000352845333441927 | 4 | 11 | 14
tock | 0.000776824387717418 | 1 | 5 | 5
players | 0.000621459510173934 | 1 | 4 | 4
recommendations | 0.000427144918859817 | 5 | 5 | 8
sidewalk | 0.000404252987140346 | 4 | 4 | 4
douchebag | 0.000368326157183909 | 2 | 2 | 2
fridays | 0.000368326157183909 | 2 | 2 | 2
guide | 0.000368326157183909 | 2 | 2 | 2
cubes | 0.000368326157183909 | 2 | 2 | 2
garbage | 0.000493118497267451 | 1 | 10 | 17
satan | 0.000450160545732331 | 5 | 3 | 3
summary | 0.000418793018978061 | 3 | 3 | 3
headset | 0.000368326157183909 | 2 | 2 | 2
aliante | 0.00055633359371199 | 3 | 7 | 9
crush | 0.000418793018978061 | 3 | 3 | 3
ritual | 0.000450160545732331 | 5 | 3 | 3
waters | 0.000524241788898859 | 2 | 5 | 6
citycenter | 0.000404252987140346 | 4 | 4 | 4
chameleon | 0.000418793018978061 | 3 | 3 | 3
duckies | 0.000418793018978061 | 3 | 3 | 3
cap | 0.000418793018978061 | 3 | 3 | 3
woot | 0.000450160545732331 | 5 | 3 | 3
surf | 0.000418793018978061 | 3 | 3 | 3
greets | 0.000512573902631781 | 5 | 6 | 8
locks | 0.000466094632630451 | 1 | 3 | 3
treats | 0.000345226053071781 | 4 | 6 | 8
cat | 0.000558390691970748 | 3 | 4 | 4
trucks | 0.000450160545732331 | 5 | 3 | 3
apologize | 0.000621459510173934 | 1 | 4 | 4
resorts | 0.000552489235775863 | 2 | 3 | 3
comicon | 0.000466094632630451 | 1 | 3 | 3
insanity | 0.000466094632630451 | 1 | 3 | 3
cali | 0.000368326157183909 | 2 | 2 | 2
disgusting | 0.000707622934700495 | 1 | 8 | 11
pic | 0.000450160545732331 | 5 | 3 | 3
scam | 0.000621459510173934 | 1 | 4 | 4
mist | 0.000404252987140346 | 4 | 4 | 4
church | 0.000517839079607672 | 4 | 9 | 10
espanol | 0.000418793018978061 | 3 | 3 | 3
goldbar | 0.000418793018978061 | 3 | 3 | 3
slams | 0.000466094632630451 | 1 | 3 | 3
bulletin | 0.00030318974035526 | 4 | 3 | 3
tres | 0.000697988364963435 | 3 | 5 | 5
music | 0.000420363127165217 | 4 | 30 | 81
pineville | 0.000418793018978061 | 3 | 3 | 3
photos | 0.000418793018978061 | 3 | 3 | 3
dunkin | 0.000575376755119635 | 4 | 10 | 13
compassion | 0.000466094632630451 | 1 | 3 | 3
cocktail | 0.000450160545732331 | 5 | 3 | 3
verde | 0.000505316233925433 | 4 | 5 | 5
bonanza | 0.000558390691970748 | 3 | 4 | 4
pond | 0.000837586037956122 | 3 | 6 | 6
arrangements | 0.00030318974035526 | 4 | 3 | 3
cozy | 0.000598002886403744 | 5 | 7 | 12
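Reading the table: based on the R script at the bottom of this gist, interestingness is the word's TF-IDF weight within one star rating, count is how often the word appears in that rating's reviews, and total is how often it appears across all five ratings. A minimal sketch for inspecting the exported table (assuming data.csv from this gist is in the working directory):

# Load the exported table and list the top three TF-IDF words per rating
words <- read.csv("data.csv")
top <- do.call(rbind, lapply(split(words, words$stars), function(g) {
  head(g[order(-g$interestingness), ], 3)
}))
print(top[, c("word", "stars", "count", "total")])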
<!DOCTYPE html>
<meta charset="utf-8">
<style>
text {
  font: 24px "Helvetica Neue", Helvetica, Arial, sans-serif;
  text-anchor: middle;
  pointer-events: none;
}
circle {
  fill: #ccc;
}
rect {
  fill: white;
  /*stroke: #ccc;*/
}
.node:hover circle {
  fill: #d62728;
  opacity: 0.5;
}
.d3-tip {
  line-height: 1;
  font: 14px sans-serif;
  padding: 12px;
  background: rgba(0, 0, 0, 0.8);
  color: rgb(185, 185, 185);
  border-radius: 2px;
}
/* Creates a small triangle extender for the tooltip */
.d3-tip:after {
  box-sizing: border-box;
  display: inline;
  font-size: 10px;
  width: 100%;
  line-height: 1;
  color: rgba(0, 0, 0, 0.8);
  content: "\25BC";
  position: absolute;
  text-align: center;
}
/* Style northward tooltips differently */
.d3-tip.n:after {
  margin: -1px 0 0 0;
  top: 100%;
  left: 0;
}
</style>
<body>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
<script src="http://labratrevenge.com/d3-tip/javascripts/d3.tip.v0.6.3.js"></script>
<script>
// Modified from the Mike Bostock example here: http://bl.ocks.org/mbostock/1846692
// Utilized Jerome Cukier's tutorial here: http://www.jeromecukier.net/blog/2012/05/28/manipulating-data-like-a-boss-with-d3/
var margin = {top: 20, right: 20, bottom: 30, left: 50},
    width = 960 - margin.left - margin.right,
    height = 500 - margin.top - margin.bottom,
    bleed = 100;
var cwidth = 175, cheight = 210, cmargin = 5;
// One pack layout sized to a single cluster; circle area encodes interestingness.
// The children accessor treats the plain array of words as the root's children.
var pack = d3.layout.pack()
    .sort(null)
    .size([cwidth, cheight])
    .padding(0)
    .value(function(d) { return d.interestingness; })
    .children(function(d) { return d; });
var color = d3.scale.ordinal().range(["#d62728", "#ff9896", "#c7c7c7", "#98df8a", "#2ca02c"]);
var svg = d3.select("body")
  .append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
  .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
var tip = d3.tip()
    .attr('class', 'd3-tip')
    .offset([-10, 0])
    .html(function(d) {
      return "<div><span>Word:</span> <span style='color:white'>" + d.word + "</span></div>" +
             "<div><span>Frequency:</span> <span style='color:white'>" + d.count +
             " (" + d3.round(100 * d.count / d.total, 0) + "%)" + "</span></div>";
    });
svg.call(tip);
d3.csv("data.csv", function(error, data) {
  if (error) throw error;
  // Coerce the numeric columns, which d3.csv reads as strings
  data.forEach(function(d) {
    d.count = +d.count;
    d.interestingness = +d.interestingness;
    d.total = +d.total;
    d.stars = +d.stars;
  });
  data.sort(function(a, b) { return b.stars - a.stars; });
  // Group the words by star rating (1-5)
  var nested = d3.nest()
      .key(function(d) { return d.stars; })
      .sortKeys(d3.ascending)
      .entries(data);
  console.log("NESTED", JSON.stringify(nested, null, 2));
  // Convert the data into a format copacetic for the pack layout
  // data = { children: data };
  console.log("DATA", JSON.stringify(data, null, 2));
  // console.log(pack.nodes(data).filter(function(d) { return !d.children; }));
  // Create 5 group elements, one cluster per rating, laid out left to right
  var group = svg.selectAll(".cluster")
      .data(nested)
    .enter().append("g")
      .attr("class", "cluster")
      .attr("transform", function(d, i) { return "translate(" + (cwidth * i) + "," + height / 3 + ")"; });
  group.append("rect")
      .attr("x", cmargin)
      .attr("y", cmargin)
      .attr("width", cwidth - 2 * cmargin)
      .attr("height", cheight - 2 * cmargin);
  group.append("text")
      .attr("y", cheight + 20)
      .attr("x", cmargin + 10)
      .text(function(d) { return d.key; });
  console.log("GROUP", group);
  // Within each cluster, run the pack layout and keep only the leaf nodes
  var node = group.selectAll(".node")
      .data(function(d) { return pack.nodes(d.values).filter(function(d) { return !d.children; }); })
    .enter().append("g")
      .attr("class", "node")
      .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
      .on('mouseover', tip.show)
      .on('mouseout', tip.hide);
  // Append the circles
  node.append("circle")
      .attr("r", function(d) { return d.r; })
      .style("fill", function(d) { return color(d.stars); });
  // Append the text, scaled down to fit within each circle
  node.append("text")
      .text(function(d) { return d.word; })
      .style("font-size", function(d) { return Math.min(2 * d.r, (2 * d.r - 8) / this.getComputedTextLength() * 24) + "px"; })
      .attr("dy", ".35em");
});
</script>
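Note on running the page: d3.csv requests data.csv over HTTP, so the page and data.csv need to be served from a web server rather than opened directly via a file:// URL (which will typically fail with a cross-origin error); any static server, e.g. python -m http.server, works. The R script below is what generates data.csv.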
# ANALYSIS OF YELP DATA
# Start the timer
ptm <- proc.time()
# Clear the workspace
rm(list = ls())
gc()
# Load packages
library(NLP)
library(magrittr)
library(tm)
library(openNLP)
library(plyr)
library(RWeka)
library(RTextTools)
library(SnowballC)
library(reshape)
# Load the data
loc <- '/Users/josiahdavis/Documents/GitHub/earl/'
dr <- read.csv(paste(loc, 'yelp_review.csv', sep=""))
# Subset only for Starbucks records (also filter out non-English records)
dr <- dr[(dr$name == "Starbucks") & !(grepl(pattern = "das", x = dr$text)) &
         !(grepl(pattern = "haw", x = dr$text)) & !(grepl(pattern = "tres", x = dr$text)),]
# Collapse into five text blobs, one for each rating
texts <- list()
for (i in 1:5){
  texts[[i]] <- paste(as.character(dr[dr$stars == i,]$text), sep="", collapse="")
}
# =====================================
# Perform text mining
# transformations
# =====================================
# Convert back to a dataframe
d <- data.frame(reviews = unlist(texts))
# Replace newline characters with spaces
d$reviews <- gsub("\n", " ", d$reviews)
# Convert the relevant data into a corpus object with the tm package
d <- Corpus(VectorSource(d$reviews))
# Convert everything to lower case
d <- tm_map(d, content_transformer(tolower))
# Read in a list of 5000+ stopwords compiled by Matthew Jockers
fileStopwords <- paste(loc, 'stopwords.txt', sep="")
stopwords <- readChar(fileStopwords, file.info(fileStopwords)$size)
stopwords <- unlist(strsplit(stopwords, split=", "))
# Remove stopwords in chunks of roughly 1,000 at a time
# (removeWords builds a single large regex, so passing all 5,805 at once can fail)
stopwords <- c(stopwords("english"), stopwords)
for (i in 1:5){
  if(i == 1){
    start <- 1
  }else{
    start <- i * 1000
  }
  if(i < 5){
    end <- (i + 1) * 1000
  }else{
    end <- 5805
  }
  d <- tm_map(d, removeWords, stopwords[start:end])
}
# Remove punctuation
d <- tm_map(d, removePunctuation)
# Strip whitespace
d <- tm_map(d, stripWhitespace)
# Convert to a list of strings
texts <- lapply(d, as.String)
# =====================================
# Filter the reviews to only include nouns.
# This section modified from an excellent tutorial:
# http://rstudio-pubs-static.s3.amazonaws.com/34069_9ab9f30646474af89ba7849174cab6e9.html
# =====================================
# Define a function for performing the annotations
annotate_entities <- function(doc, annotation_pipeline) {
  annotations <- annotate(doc, annotation_pipeline)
  AnnotatedPlainTextDocument(doc, annotations)
}
# Define the types of annotations to perform
tagging_pipeline <- list(
  Maxent_Sent_Token_Annotator(),
  Maxent_Word_Token_Annotator(),
  Maxent_POS_Tag_Annotator()
)
# Annotate the texts (THIS STEP CAN TAKE A COUPLE OF MINUTES)
texts_annotated <- texts %>% lapply(annotate_entities, tagging_pipeline)
# Define the POS getter function: return the words whose POS tags are in `parts`
POSGetter <- function(doc, parts) {
  s <- doc$content
  a <- annotations(doc)[[1]]
  k <- sapply(a$features, `[[`, "POS")
  if(sum(k %in% parts) == 0){
    ""
  }else{
    s[a[k %in% parts]]
  }
}
# Identify the nouns (singular, plural, and proper)
nouns <- texts_annotated %>% lapply(POSGetter, parts = c("NN", "NNS", "NNP", "NNPS"))
# Turn each character vector into a single string
nouns <- nouns %>% lapply(as.String)
# =====================================
# Get the counts and relative
# frequencies
# =====================================
# Convert back to a corpus object
d <- Corpus(VectorSource(nouns))
# Convert to document-term matrices (rows are documents, columns are words):
# dtm1 holds raw counts, dtm2 holds TF-IDF weights
dtm1 <- as.matrix(DocumentTermMatrix(d))
dtm2 <- as.matrix(DocumentTermMatrix(d, control = list(weighting = weightTfIdf)))
# For each rating category, choose the top I most interesting (highest TF-IDF) words
# (N is defined here but not used below)
N <- 7975
I <- 30
# Create an empty dataframe to accumulate the results
words <- data.frame(word = as.character(),
                    interestingness = as.double(),
                    stars = as.integer(),
                    count = as.integer(),
                    total = as.integer())
# Loop through each rating to determine the most interesting words
for(i in 1:5){
  mostIntWords <- dtm2[i, order(dtm2[i,], decreasing = TRUE)[1:I]]
  words <- rbind(words, data.frame(word = names(mostIntWords),
                                   interestingness = unname(mostIntWords),
                                   stars = i,
                                   count = unname(dtm1[i, names(mostIntWords)]),
                                   total = unname(colSums(dtm1[, names(mostIntWords)]))
                                   )
                 )
}
# Shuffle the rows randomly
idx <- sample(1:nrow(words), nrow(words), replace=FALSE)
words <- words[idx,]
# Write to csv file
writeLoc <- "/Users/josiahdavis/Documents/d3/wordRating/"
write.csv(words, paste(writeLoc, "data.csv", sep=""), row.names=FALSE)
print(paste('Total Time (minutes): ', round((proc.time() - ptm)[3] / 60, 1)))
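For intuition about the interestingness score, here is a toy illustration (my own example, not part of the original analysis) of how tm's weightTfIdf rewards a word concentrated in one document and down-weights words spread across the corpus:

library(tm)
# Three tiny "documents": "latte" appears only in the first one, so it
# gets a relatively high TF-IDF weight there, while "coffee", which
# appears in two of the three documents, is weighted lower.
toy <- Corpus(VectorSource(c("coffee coffee latte",
                             "coffee tea",
                             "tea tea scone")))
round(as.matrix(DocumentTermMatrix(toy, control = list(weighting = weightTfIdf))), 3)

This is exactly the contrast between dtm2 (TF-IDF weights, used to pick the words) and dtm1 (raw counts, used to report count and total) in the script above.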