@josiahdavis
Last active October 7, 2015 06:51
Word / Rating Association

Analysis

Identifies the most distinctive nouns in Starbucks reviews from the Yelp dataset, one group per star rating: the R script at the end of this gist scores each noun by its TF-IDF weight ("interestingness"), and the D3 page draws one circle pack per rating, with circles sized by that weight.

Notes

The table below is data.csv, written by the R script and read by the D3 page. Columns: word, interestingness (TF-IDF weight), stars (rating group, 1-5), count (occurrences of the word in reviews with that rating), and total (occurrences across all five ratings).
word interestingness stars count total
urge 0.000530717201025371 1 6 7
bonus 0.00042863450610675 5 9 17
diva 0.000368326157183909 2 2 2
frappucinos 0.000368326157183909 2 2 2
urgency 0.000467617762795816 2 8 21
soap 0.000621459510173934 1 4 4
febreze 0.000368326157183909 2 2 2
keys 0.000442264334187809 1 5 6
cop 0.000505316233925433 4 5 5
routine 0.000418793018978061 3 3 3
doo 0.000466094632630451 1 3 3
view 0.000449075878926089 4 14 27
understanding 0.000621459510173934 1 4 4
fish 0.000419393431119087 2 4 5
crema 0.000629090146678631 2 6 7
peeve 0.000552489235775863 2 3 3
piadini 0.000427144918859817 5 5 6
music 0.000464514776486304 3 24 81
scence 0.000466094632630451 1 3 3
outlet 0.000545306424410251 4 17 27
earplugs 0.00030318974035526 4 3 3
indoor 0.000808505974280692 4 8 8
satisfaction 0.000600214060976441 5 4 4
reading 0.000476260562340834 5 10 18
butterbeer 0.000600214060976441 5 4 4
bitterness 0.000397381138365707 3 5 6
freak 0.000427144918859817 5 5 6
aid 0.000368326157183909 2 2 2
father 0.000466094632630451 1 3 3
airports 0.000397381138365707 3 5 9
towels 0.000442264334187809 1 5 7
nugget 0.000629090146678631 2 6 10
characters 0.000345226053071781 4 6 8
urgency 0.000493118497267451 1 10 21
vehicle 0.000397381138365707 3 5 7
denial 0.000466094632630451 1 3 3
toilet 0.000409276266501163 1 19 35
precision 0.000450160545732331 5 3 3
animals 0.000552489235775863 2 3 3
habits 0.000600214060976441 5 4 4
boca 0.00030318974035526 4 3 3
teacher 0.000600214060976441 5 4 4
busier 0.000621459510173934 1 4 4
visa 0.000552489235775863 2 3 3
elara 0.000600214060976441 5 4 4
flamingo 0.000352845333441927 4 11 14
leather 0.00042863450610675 5 9 19
condescending 0.000368326157183909 2 2 2
bloomfield 0.000418793018978061 3 3 3
sticks 0.000552489235775863 2 3 3
clouds 0.000404252987140346 4 4 4
kudos 0.000352845333441927 4 11 25
cheer 0.000600214060976441 5 4 4
refund 0.000542430346994197 1 11 15
dunkies 0.000368326157183909 2 2 2
donuts 0.000481152727420809 4 15 19
clover 0.00042863450610675 5 9 13
letter 0.000466094632630451 1 3 3
pet 0.000419393431119087 2 4 5
mead 0.000402763728583745 4 7 10
vicinity 0.000345226053071781 4 6 7
frequents 0.000418793018978061 3 3 3
lunches 0.000450160545732331 5 3 3
sugary 0.000450160545732331 5 3 3
kudos 0.000571512674809 5 12 25
excuse 0.000409165542446339 2 7 16
stocking 0.000419393431119087 2 4 5
study 0.000445159994132708 3 23 81
duck 0.00055633359371199 3 7 8
study 0.000518447856837101 4 37 81
convention 0.000797533860096898 3 18 25
hike 0.000352845333441927 4 11 14
tock 0.000776824387717418 1 5 5
players 0.000621459510173934 1 4 4
recommendations 0.000427144918859817 5 5 8
sidewalk 0.000404252987140346 4 4 4
douchebag 0.000368326157183909 2 2 2
fridays 0.000368326157183909 2 2 2
guide 0.000368326157183909 2 2 2
cubes 0.000368326157183909 2 2 2
garbage 0.000493118497267451 1 10 17
satan 0.000450160545732331 5 3 3
summary 0.000418793018978061 3 3 3
headset 0.000368326157183909 2 2 2
aliante 0.00055633359371199 3 7 9
crush 0.000418793018978061 3 3 3
ritual 0.000450160545732331 5 3 3
waters 0.000524241788898859 2 5 6
citycenter 0.000404252987140346 4 4 4
chameleon 0.000418793018978061 3 3 3
duckies 0.000418793018978061 3 3 3
cap 0.000418793018978061 3 3 3
woot 0.000450160545732331 5 3 3
surf 0.000418793018978061 3 3 3
greets 0.000512573902631781 5 6 8
locks 0.000466094632630451 1 3 3
treats 0.000345226053071781 4 6 8
cat 0.000558390691970748 3 4 4
trucks 0.000450160545732331 5 3 3
apologize 0.000621459510173934 1 4 4
resorts 0.000552489235775863 2 3 3
comicon 0.000466094632630451 1 3 3
insanity 0.000466094632630451 1 3 3
cali 0.000368326157183909 2 2 2
disgusting 0.000707622934700495 1 8 11
pic 0.000450160545732331 5 3 3
scam 0.000621459510173934 1 4 4
mist 0.000404252987140346 4 4 4
church 0.000517839079607672 4 9 10
espanol 0.000418793018978061 3 3 3
goldbar 0.000418793018978061 3 3 3
slams 0.000466094632630451 1 3 3
bulletin 0.00030318974035526 4 3 3
tres 0.000697988364963435 3 5 5
music 0.000420363127165217 4 30 81
pineville 0.000418793018978061 3 3 3
photos 0.000418793018978061 3 3 3
dunkin 0.000575376755119635 4 10 13
compassion 0.000466094632630451 1 3 3
cocktail 0.000450160545732331 5 3 3
verde 0.000505316233925433 4 5 5
bonanza 0.000558390691970748 3 4 4
pond 0.000837586037956122 3 6 6
arrangements 0.00030318974035526 4 3 3
cozy 0.000598002886403744 5 7 12
<!DOCTYPE html>
<meta charset="utf-8">
<style>
text {
font: 24px "Helvetica Neue", Helvetica, Arial, sans-serif;
text-anchor: middle;
pointer-events: none;
}
circle {
fill: #ccc;
}
rect {
fill: white;
/*stroke: #ccc;*/
}
.node:hover circle {
fill: #d62728;
opacity: 0.5;
}
.d3-tip {
line-height: 1;
font: 14px sans-serif;
padding: 12px;
background: rgba(0, 0, 0, 0.8);
color: rgb(185, 185, 185);
border-radius: 2px;
}
/* Creates a small triangle extender for the tooltip */
.d3-tip:after {
box-sizing: border-box;
display: inline;
font-size: 10px;
width: 100%;
line-height: 1;
color: rgba(0, 0, 0, 0.8);
content: "\25BC";
position: absolute;
text-align: center;
}
/* Style northward tooltips differently */
.d3-tip.n:after {
margin: -1px 0 0 0;
top: 100%;
left: 0;
}
</style>
<body>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
<script src="http://labratrevenge.com/d3-tip/javascripts/d3.tip.v0.6.3.js"></script>
<script>
// Modified from the Mike Bostock example here: http://bl.ocks.org/mbostock/1846692
// Utilized Jerome Cukier's tutorial here: http://www.jeromecukier.net/blog/2012/05/28/manipulating-data-like-a-boss-with-d3/
var margin = {top: 20, right: 20, bottom: 30, left: 50},
width = 960 - margin.left - margin.right,
height = 500 - margin.top - margin.bottom,
bleed = 100;
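// Each star rating gets its own cwidth x cheight cell; within each cell a circle
// pack sizes the word circles by their TF-IDF "interestingness" weight.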
var cwidth=175, cheight=210, cmargin=5;
var pack = d3.layout.pack()
.sort(null)
.size([cwidth, cheight])
.padding(0)
.value(function(d) { return d.interestingness; })
.children(function(d) { return d; });
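// Ordinal scale assigning each star rating one color from a red-to-green palette.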
var color = d3.scale.ordinal().range(["#d62728", "#ff9896", "#c7c7c7", "#98df8a", "#2ca02c"]);
var svg = d3.select("body")
.append("svg")
.attr("width", width + margin.left + margin.right)
.attr("height", height + margin.top + margin.bottom)
.append("g")
.attr("transform", "translate(" + margin.left + "," + margin.top + ")");
var tip = d3.tip()
.attr('class', 'd3-tip')
.offset([-10, 0])
.html(function(d) {
return "<div><span>Word:</span> <span style='color:white'>" + d.word + "</span></div>" +
"<div><span>Frequency:</span> <span style='color:white'>" + d.count +
" (" + d3.round(100 * d.count / d.total, 0)+ "%)" + "</span></div>";
})
svg.call(tip);
d3.csv("data.csv", function(error, data) {
if (error) throw error;
data.forEach(function(d){
d.count = + d.count;
d.interestingness = +d.interestingness;
d.total = +d.total;
d.stars = +d.stars;
return d
})
data.sort(function(a,b){ return b.stars - a.stars; })
var nested = d3.nest()
.key(function(d) {return d.stars;})
.sortKeys(d3.ascending)
.entries(data)
console.log("NESTED", JSON.stringify(nested, null, 2))
// Convert the data into a format copasetic for the pack layout
// data = { children: data };
console.log("DATA", JSON.stringify(data, null, 2));
// console.log(pack.nodes(data).filter(function(d) { return !d.children; }));
// Create 5 group variables
var group = svg.selectAll("cluster")
.data(nested)
.enter()
.append("g")
.attr("class", "cluster")
.attr("transform", function(d, i) { return "translate(" + (cwidth * i) + "," + height / 3 + ")"; });
group.append("rect")
.attr("x", cmargin)
.attr("y", cmargin)
.attr("width",cwidth-2*cmargin)
.attr("height",cheight-2*cmargin);
group.append("text")
.attr("y",cheight+20)
.attr("x",cmargin + 10)
.text(function(d) {return d.key;})
console.log("GROUP", group)
// Create the selection
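// pack.nodes() returns the root plus one node per word; filtering out nodes
// with children keeps only the word leaves.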
var node = group.selectAll(".node")
.data(function(d) { return pack.nodes(d.values).filter(function(d) { return !d.children; }); })
.enter().append("g")
.attr("class", "node")
.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
.on('mouseover', tip.show)
.on('mouseout', tip.hide);
// Append the circles
node.append("circle")
.attr("r", function(d) { return d.r; })
.style("fill", function(d) {return color(d.stars); });
// Append the text
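// Labels start at the 24px base size from the CSS and are rescaled so their
// width fits within the circle's diameter (capped at the diameter itself).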
node.append("text")
.text(function(d) { return d.word; })
.style("font-size", function(d) { return Math.min(2 * d.r, (2 * d.r - 8) / this.getComputedTextLength() * 24) + "px"; })
.attr("dy", ".35em");
});
</script>
# ANALYSIS OF YELP DATA
# Start the timer
ptm <- proc.time()
# Clear working space
rm(list = ls())
gc()
# Load packages
library(NLP)
library(magrittr)
library(tm)
library(openNLP)
library(plyr)
library(RWeka)
library(RTextTools)
library(SnowballC)
library(reshape)
# Load the Data
loc <- '/Users/josiahdavis/Documents/GitHub/earl/'
dr <- read.csv(paste(loc, 'yelp_review.csv', sep=""))
# Subset only for Starbucks records (also filter out non-English reviews by keyword)
dr <- dr[(dr$name == "Starbucks") & !(grepl(pattern = "das", x = dr$text)) &
!(grepl(pattern = "haw", x = dr$text)) & !(grepl(pattern = "tres", x = dr$text)),]
# Collapse into five text blobs, one for each rating
texts = list()
for (i in 1:5){
texts[[i]] <- paste(as.character(dr[dr$stars == i,]$text), sep="", collapse="")
}
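# texts is now a list of five strings; texts[[i]] concatenates every i-star review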
# =====================================
# Perform text mining
# transformations
# =====================================
# Convert back to a data frame
d <- data.frame(reviews = unlist(texts))
# Replace new line characters with spaces
d$reviews <- gsub("\n", " ", d$reviews)
# Convert the relevant data into a corpus object with the tm package
d <- Corpus(VectorSource(d$reviews))
# Convert everything to lower case
d <- tm_map(d, content_transformer(tolower))
# Read in list of 5000+ stopwords compiled by Matthew Jockers
fileStopwords <- paste(loc, 'stopwords.txt', sep="")
stopwords <- readChar(fileStopwords, file.info(fileStopwords)$size)
stopwords <- unlist(strsplit(stopwords, split=", "))
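# (the file is read as a single string and split on ", ", i.e. one comma-separated list)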
# Remove stopwords (loop through a set of stopwords at a time)
stopwords <- c(stopwords("english"), stopwords)
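# removeWords builds one regular expression from the supplied words, so the long
# stopword list is applied in chunks of roughly 1,000 words at a time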
for (i in 1:5){
if(i == 1){
start <- 1
}else{
start <- i * 1000
}
if(i < 5){
end <- (i + 1) * 1000
}else{
end <- length(stopwords)
}
d <- tm_map(d, removeWords, stopwords[start:end])
}
# Remove punctuation
d <- tm_map(d, removePunctuation)
# Strip whitespace
d <- tm_map(d, stripWhitespace)
# Convert to a list of strings
texts <- lapply(d, as.String)
# =====================================
# Filter the reviews to only include nouns.
# This section modified from an excellent tutorial:
# http://rstudio-pubs-static.s3.amazonaws.com/34069_9ab9f30646474af89ba7849174cab6e9.html
# =====================================
# Define function for performing the annotations
annotate_entities <- function(doc, annotation_pipeline) {
annotations <- annotate(doc, annotation_pipeline)
AnnotatedPlainTextDocument(doc, annotations)
}
# Define types of annotations to perform
tagging_pipeline <- list(
Maxent_Sent_Token_Annotator(),
Maxent_Word_Token_Annotator(),
Maxent_POS_Tag_Annotator()
)
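# (sentence and word token annotations are prerequisites for the POS tagger)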
# Annotate the texts (THIS STEP CAN TAKE A COUPLE OF MINUTES)
texts_annotated <- texts %>% lapply(annotate_entities, tagging_pipeline)
# Define the POS getter function
POSGetter <- function(doc, parts) {
s <- doc$content
a <- annotations(doc)[[1]]
k <- sapply(a$features, `[[`, "POS")
if(sum(k %in% parts) == 0){
""
}else{
s[a[k %in% parts]]
}
}
# Identify the nouns
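# (NN/NNS/NNP/NNPS are the Penn Treebank tags for common and proper nouns, singular and plural)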
nouns <- texts_annotated %>% lapply(POSGetter, parts = c("NN", "NNS", "NNP", "NNPS"))
# Turn each character vector into a single string
nouns <- nouns %>% lapply(as.String)
# =====================================
# Get the counts and relative
# frequencies
# =====================================
# Convert back to corpus object
d <- Corpus(VectorSource(nouns))
# Convert to a document term matrix (rows are documents, columns are words)
dtm1 <- as.matrix(DocumentTermMatrix(d))
dtm2 <- as.matrix(DocumentTermMatrix(d, control = list(weighting = weightTfIdf)))
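# dtm1 holds raw term counts; dtm2 holds TF-IDF weights, which become the
# "interestingness" score in the output. Roughly, assuming tm's default
# normalized TF-IDF, the weight of word w in rating-document i is
#   (count of w in i / total word count of i) * log2(nDocs / nDocsContaining(w))
# A hand check for a single (hypothetical) word might look like:
#   w <- "coffee"
#   (dtm1[1, w] / sum(dtm1[1, ])) * log2(nrow(dtm1) / sum(dtm1[, w] > 0))  # compare with dtm2[1, w]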
# For each rating category, choose the top N most common words
# For the top N words, choose the top I most interesting words
N <- 7975
I <- 30
# Create the dataframe
words <- data.frame(word = as.character(),
interestingness = as.double(),
stars = as.integer(),
count = as.integer(),
total = as.integer())
# Loop through each rating to determine the most interesting words
for(i in 1:5){
mostIntWords <- dtm2[i,order(dtm2[i,], decreasing = TRUE)[1:I]]
words <- rbind(words, data.frame(word = names(mostIntWords),
interestingness = unname(mostIntWords),
stars = i,
count = unname(dtm1[i,names(mostIntWords)]),
total = unname(colSums(dtm1[,names(mostIntWords)]))
)
)
}
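# words now holds 5 * I rows: the I highest-TF-IDF nouns for each star rating,
# with their raw counts within that rating and across all ratings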
# Shuffle the rows into a random order
idx <- sample(1:nrow(words), nrow(words), replace=FALSE)
words <- words[idx,]
# Write to csv file
writeLoc <- "/Users/josiahdavis/Documents/d3/wordRating/"
write.csv(words, paste(writeLoc, "data.csv", sep=""), row.names=FALSE)
print(paste('Total Time:', round((proc.time() - ptm)[3] / 60, 1), 'minutes'))