Skip to content

Instantly share code, notes, and snippets.

/yelp.R Secret

Created June 1, 2016 16:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/052c65ac01943dafe782415f2ce10e61 to your computer and use it in GitHub Desktop.
Save anonymous/052c65ac01943dafe782415f2ce10e61 to your computer and use it in GitHub Desktop.
Yelp Data Conversion and Cleaning Code
# Kevin Tse
# UChicago Economics
# May 2015
# Data Conversion and cleaning code for Yelp dataset
# install.packages("mosaic")
# install.packages("rjson")
require(rjson)
require(mosaic)
require(plyr)
# Converting the JSON files to .csv
#
# Each input file is read line by line; every line is parsed with fromJSON(),
# flattened with unlist() and transposed into a one-row matrix, and ldply()
# stacks the rows into a data.frame.
# This works but does not give proper column names; they can be edited manually.
json_lines_to_df <- function(path) {
  # Passing the path straight to readLines() lets it open and close its own
  # connection, so no handle is left open if reading fails part-way through.
  records <- readLines(path, -1L)
  ldply(lapply(records, function(x) t(unlist(fromJSON(x)))))
}

# Reviews
reviewfile <- "yelp_academic_dataset_review.json"
reviews <- json_lines_to_df(reviewfile)
save(reviews, file = 'reviews.rdata')
write.csv(reviews, file = "reviews.csv")

# Check-ins
checkinfile <- "yelp_academic_dataset_checkin.json"
checkins <- json_lines_to_df(checkinfile)
save(checkins, file = 'checkins.rdata')
write.csv(checkins, file = "checkins.csv")

# Business
businessfile <- "yelp_academic_dataset_business.json"
businesses <- json_lines_to_df(businessfile)
save(businesses, file = 'businesses.rdata')
write.csv(businesses, file = "businesses.csv")

# Tip
tipfile <- "yelp_academic_dataset_tip.json"
tips <- json_lines_to_df(tipfile)
save(tips, file = 'tips.rdata')
write.csv(tips, file = "tips.csv")

# User
userfile <- "yelp_academic_dataset_user.json"
users <- json_lines_to_df(userfile)
# Keep only the first seven columns (everything from column 8 onwards is
# dropped). Selecting the columns directly also works when the table has
# fewer than eight columns, where a reverse deletion loop would fail.
# NOTE(review): "wofdlist" presumably means "without friend list" - confirm.
users_wofdlist <- users[, seq_len(min(7L, ncol(users))), drop = FALSE]
save(users_wofdlist, file = 'users_wofdlist.rdata')
write.csv(users_wofdlist, file = "users_wofdlist.csv")
##############
# Filter by geographic location.
# For the initial trial we only wanted businesses in NC (the Charlotte
# metro area).
nc_businesses <- businesses[businesses[["state"]] == "NC", ]
write.csv(nc_businesses, file = "nc_businesses.csv")
# Filter by type of restaurant.
# From here on `businesses` holds the NC-only table read back from disk,
# not the full business list.
businesses <- read.csv("nc_businesses.csv")
# Add one flag column per cuisine: TRUE when any of the four category
# columns equals the cuisine label, NA otherwise (never FALSE).
cuisine_labels <- c(
  korea = "Korean",
  chinese = "Chinese",
  indian = "Indian",
  mexican = "Mexican",
  japanese = "Japanese",
  greek = "Greek",
  vietnamese = "Vietnamese",
  thai = "Thai",
  asianfusion = "Asian Fusion",
  mediterranean = "Mediterranean"
)
category_columns <- c("categories1", "categories2", "categories3", "categories4")
for (flag_name in names(cuisine_labels)) {
  cuisine <- cuisine_labels[[flag_name]]
  # Element-wise OR across the four category columns.
  in_any_category <- Reduce(
    `|`,
    lapply(category_columns, function(column) businesses[[column]] == cuisine)
  )
  # Rows where the comparison is FALSE or NA stay NA, exactly as a
  # subassignment into a not-yet-existing column would leave them.
  flag <- rep(NA, nrow(businesses))
  flag[which(in_any_category)] <- TRUE
  businesses[[flag_name]] <- flag
}
# Split the businesses by cuisine flag. The trailing numbers are the row
# counts noted by the original author.
flagged_businesses <- function(flag_name) {
  businesses[which(businesses[[flag_name]] == TRUE), ]
}
bizkor <- flagged_businesses("korea") #9 restaurants
bizchina <- flagged_businesses("chinese") #147
bizindia <- flagged_businesses("indian") #38
bizmex <- flagged_businesses("mexican") #179
bizjapan <- flagged_businesses("japanese") #75
bizgreek <- flagged_businesses("greek") #41
bizvietnam <- flagged_businesses("vietnamese") #27
bizthai <- flagged_businesses("thai") #24
bizafusion <- flagged_businesses("asianfusion") #49
bizmed <- flagged_businesses("mediterranean") #42
# Business IDs of the Asian fusion restaurants
ids <- bizafusion[["business_id"]]
# Total review count per cuisine, used to judge whether a cuisine has
# enough reviews to be a viable choice (original totals in the comments).
total_reviews <- function(biz) {
  sum(biz[["review_count"]])
}
revkor <- total_reviews(bizkor) #399
revchina <- total_reviews(bizchina) #3732
revindia <- total_reviews(bizindia) #2014
revmex <- total_reviews(bizmex) #8161
revjapan <- total_reviews(bizjapan) #3769
revgreek <- total_reviews(bizgreek) #1544
revvietnam <- total_reviews(bizvietnam) #2172
revthai <- total_reviews(bizthai) #2123
revafusion <- total_reviews(bizafusion) #4647
revmed <- total_reviews(bizmed) #2644
#afusion, greek, japan, mex, india, med
######################
# The following segment specifically targets Asian fusion restaurants
# within NC and parses the review text to look for first-time eaters of
# Asian fusion food.
# Subset to the useful columns for the Asian fusion restaurants
bizafusion1 <- subset(bizafusion, select = c(
  "business_id", "full_address",
  "categories1", "categories2", "categories3", "categories4",
  "review_count", "name", "stars",
  "attributes.Drive.Thru", "attributes.Ambience.hipster",
  "attributes.Ambience.touristy", "attributes.Accepts.Credit.Cards",
  "attributes.Price.Range", "attributes.Smoking", "attributes.Wi.Fi",
  "attributes.BYOB", "attributes.Music.live", "attributes.Music.karaoke"
))
write.csv(bizafusion1, "bizfusion.csv")
# List of business IDs
bizafuzid <- as.vector(bizafusion1$business_id)
bizafuzid2 <- iconv(bizafuzid, from = "UTF-8", to = "ASCII")
# Exact ID membership instead of one huge "id1|id2|..." regular expression:
# an empty ID list now selects no reviews (an empty pattern matches every
# row) and IDs are compared as whole strings.
afuzreviews <- reviews[reviews$business_id %in% bizafuzid, ]
write.csv(afuzreviews, "afuzreviews.csv")
# Reviews that mention a first visit
words <- c("first time", "never been", "maiden")
afuzmatchedreviews <- afuzreviews[grep(paste(words, collapse = "|"), afuzreviews$text, ignore.case = TRUE), ]
# Drop reviews mentioning cuisines that are not really "fusion"
badwords <- c("chinese", "japanese", "sushi") #according to lindsey
# A second copy of this statement misspelt `collapse` as `collaspe`; paste()
# then returned three separate patterns, grep() used only the first one, and
# the correct result was overwritten. Only the correct statement is kept.
afuzmatchedreviews2 <- afuzmatchedreviews[grep(paste(badwords, collapse = "|"), afuzmatchedreviews$text, ignore.case = TRUE, invert = TRUE), ]
######################
# Same extraction as for Asian fusion, but for Mediterranean restaurants
bizmed1 <- subset(bizmed, select = c(
  "business_id", "full_address",
  "categories1", "categories2", "categories3", "categories4",
  "review_count", "name", "stars",
  "attributes.Drive.Thru", "attributes.Ambience.hipster",
  "attributes.Ambience.touristy", "attributes.Accepts.Credit.Cards",
  "attributes.Price.Range", "attributes.Smoking", "attributes.Wi.Fi",
  "attributes.BYOB", "attributes.Music.live", "attributes.Music.karaoke"
))
write.csv(bizmed1, "bizmed.csv")
# List of business IDs
bizmedid <- as.vector(bizmed1$business_id)
bizmedid2 <- iconv(bizmedid, from = "UTF-8", to = "ASCII")
# Exact ID membership instead of one huge "id1|id2|..." regular expression
# (an empty ID list would otherwise match every review).
medreviews <- reviews[reviews$business_id %in% bizmedid, ]
# Written to its own file: this export previously went to "afuzreviews.csv"
# and overwrote the Asian fusion export.
write.csv(medreviews, "medreviews.csv")
words <- c("first time", "never been", "maiden", "new experience")
medmatchedreviews <- medreviews[grep(paste(words, collapse = "|"), medreviews$text, ignore.case = TRUE), ]
####################
# The following searches use the entire review dataset regardless of
# location. Pattern matching finds every review whose text contains one of
# the "first encounter" phrases for a given food term.

# Return the reviews whose text contains "never had <term>",
# "never eaten <term>" or "first time eating <term>" (case-insensitive) for
# any of `terms`, and write them to `out_file`.
#   terms    - character vector of food terms (e.g. "korean")
#   out_file - path of the CSV file to write
# Returns the matching rows of `reviews`.
first_time_reviews <- function(terms, out_file) {
  stems <- c("never had", "never eaten", "first time eating")
  # outer() pairs every stem with every term, e.g. "never had korean".
  phrases <- as.vector(outer(stems, terms, paste))
  matched <- reviews[grep(paste(phrases, collapse = "|"), reviews$text, ignore.case = TRUE), ]
  write.csv(matched, out_file)
  matched
}

kormatchedreviews <- first_time_reviews("korean", "korreviews.csv")
chnmatchedreviews <- first_time_reviews("chinese", "chnreviews.csv")
indmatchedreviews <- first_time_reviews("indian", "indreviews.csv")
vietmatchedreviews <- first_time_reviews("viet", "vietreviews.csv")
greekmatchedreviews <- first_time_reviews("greek", "greekreviews.csv")
jpmatchedreviews <- first_time_reviews("japanese", "jpreviews.csv")
thaimatchedreviews <- first_time_reviews("thai", "thaireviews.csv")
mongolianmatchedreviews <- first_time_reviews("mongolian", "mongolianreviews.csv")
turkishmatchedreviews <- first_time_reviews("turkish", "turkishreviews.csv")
ethiopianmatchedreviews <- first_time_reviews("ethiopian", "ethiopianreviews.csv")
brazilianmatchedreviews <- first_time_reviews("brazilian", "brazilianreviews.csv")
sushimatchedreviews <- first_time_reviews("sushi", "sushireviews.csv")
malaysianmatchedreviews <- first_time_reviews("malaysian", "malaysianreviews.csv")
tibetanmatchedreviews <- first_time_reviews("tibetan", "tibetanreviews.csv")
taiwanesematchedreviews <- first_time_reviews("taiwanese", "taiwanesereviews.csv")
# Both spellings of dim sum are searched together
dimsummatchedreviews <- first_time_reviews(c("dim sum", "dimsum"), "dimsumreviews.csv")
indonesianmatchedreviews <- first_time_reviews("indonesian", "indonesianreviews.csv")
falafelmatchedreviews <- first_time_reviews("falafel", "falafelreviews.csv")
currymatchedreviews <- first_time_reviews("curry", "curryreviews.csv")
kimchimatchedreviews <- first_time_reviews("kimchi", "kimchireviews.csv")
philippinematchedreviews <- first_time_reviews("philippine", "philippinereviews.csv")
#################
# We then manually inspected each of the textual reviews exported by the
# phrase search to find out which ones actually show the user's first
# encounter with that type of food. Only those entries were kept, in the
# hand-edited CSV files read below.

# Read one hand-edited review file and tag every row with its food type
# (`foodtype`) and its broad group (`foodtype2`).
read_tagged_reviews <- function(path, foodtype, foodtype2) {
  edited <- read.csv(path)
  edited$foodtype <- foodtype
  edited$foodtype2 <- foodtype2
  edited
}

# bra_reviews_edited <- read.csv("Brazilianfixed.csv")
# bra_reviews_edited$foodtype = "Brazilian"
ethio_reviews_edited <- read_tagged_reviews("ethiopianfixed.csv", "Ethiopian", "Non-Asian")
# The Greek group label used to be assigned to ethio_reviews_edited a second
# time (copy-paste slip), which left the Greek reviews without a foodtype2.
greek_reviews_edited <- read_tagged_reviews("greekfixed.csv", "Greek", "Non-Asian")
ind_reviews_edited <- read_tagged_reviews("indfixed.csv", "Indian", "Asian")
jp_reviews_edited <- read_tagged_reviews("jpfixed.csv", "Japanese", "Asian")
kor_reviews_edited <- read_tagged_reviews("korreviewsfixed.csv", "Korean", "Asian")
mongol_reviews_edited <- read_tagged_reviews("mongolianfixed.csv", "Mongolian", "Asian")
thai_reviews_edited <- read_tagged_reviews("thaifixed.csv", "Thai", "Asian")
# turk_reviews_edited <- read.csv("turkishfixed.csv")
# turk_reviews_edited$foodtype = "Turkish"
viet_reviews_edited <- read_tagged_reviews("vietfixed.csv", "Vietnamese", "Asian")
### Additional
# Several dish- or region-specific searches are folded into a broader
# food type (e.g. sushi -> Japanese, dim sum -> Chinese).
tibetan_reviews_edited <- read_tagged_reviews("editedtibetanreviews.csv", "Chinese", "Asian")
taiwanese_reviews_edited <- read_tagged_reviews("editedtaiwanesereviews.csv", "Chinese", "Asian")
sushi_reviews_edited <- read_tagged_reviews("editedsushireviews.csv", "Japanese", "Asian")
philippine_reviews_edited <- read_tagged_reviews("editedphilippinereviews.csv", "Philippine", "Asian")
malaysian_reviews_edited <- read_tagged_reviews("editedmalaysianreviews.csv", "Malaysian", "Asian")
indonesian_reviews_edited <- read_tagged_reviews("editedindonesianreviews.csv", "Indonesian", "Asian")
falafel_reviews_edited <- read_tagged_reviews("editedfalafelreviews.csv", "Middle Eastern", "Asian")
dimsum_reviews_edited <- read_tagged_reviews("editeddimsumreviews.csv", "Chinese", "Asian")
curry_reviews_edited <- read_tagged_reviews("editedcurryreviews.csv", "Indian", "Asian")
# Combine all of the hand-checked review tables into one data.frame by
# repeatedly outer-merging them (all = TRUE keeps unmatched rows).
# NOTE(review): mongol_reviews_edited is loaded earlier in the script but is
# not part of this list - confirm whether that is intentional.
edited_review_sets <- list(
  ethio_reviews_edited,
  greek_reviews_edited,
  ind_reviews_edited,
  jp_reviews_edited,
  kor_reviews_edited,
  thai_reviews_edited,
  viet_reviews_edited,
  tibetan_reviews_edited,
  taiwanese_reviews_edited,
  sushi_reviews_edited,
  philippine_reviews_edited,
  malaysian_reviews_edited,
  indonesian_reviews_edited,
  falafel_reviews_edited,
  dimsum_reviews_edited,
  curry_reviews_edited
)
agg_reviews <- Reduce(
  function(merged, next_set) merge(merged, next_set, all = TRUE),
  edited_review_sets
)
users_edited <- read.csv("users_wofdlist.csv")
# bra_ids <- bra_reviews_edited$user_id
# ethio_ids <- ethio_reviews_edited$user_id
# greek_ids <- greek_reviews_edited$user_id
# ind_ids <- ind_reviews_edited$user_id
# jp_ids <- jp_reviews_edited$user_id
# kor_ids <- kor_reviews_edited$user_id
# mongol_ids <- mongol_reviews_edited$user_id
# thai_ids <- thai_reviews_edited$user_id
# turk_ids <- turk_reviews_edited$user_id
# viet_ids <- viet_reviews_edited$user_id
# IDs of every user who wrote one of the aggregated reviews
agg_ids <- agg_reviews[["user_id"]]
# We now extract the account information for all the users who left a
# review after an initial encounter with a food type.
# IDs are matched exactly with %in% rather than through one huge
# "id1|id2|..." regular expression: with the regex approach an NA id becomes
# the pattern "NA" and an empty id becomes an empty alternative, either of
# which selects unrelated rows.
known_agg_ids <- agg_ids[!is.na(agg_ids)]
users_of_interest <- users_edited[users_edited$user_id %in% known_agg_ids, ]
# Filter down the list to solely contain users with > 30 reviews.
# which() drops rows whose review_count is NA instead of turning them into
# all-NA rows.
users_30 <- users_of_interest[which(users_of_interest$review_count > 30), ]
ids_30 <- users_30$user_id[!is.na(users_30$user_id)]
# Using the user_ids of the above list, we filter our list of
# initial-encounter reviews again
agg_reviews_30 <- agg_reviews[agg_reviews$user_id %in% ids_30, ]
agg_reviews_30 <- unique(agg_reviews_30)
# We need to check for duplicates of ids in the above data
# There are 3 as of now
# At this point,
# we have the list of users and their account info in users_30
# and their first review and food type in agg_reviews_30
# And we back them up
# write.csv(users_30, "users_30.csv")
# write.csv(agg_reviews_30, "agg_reviews_30.csv")
# Now we want to extract all the reviews of each user
reviews_30 <- reviews[reviews$user_id %in% ids_30, ]
backup <- reviews_30
# This cut us down to over 11k reviews
#write.csv(reviews_30, "reviews_30.csv")
# Next we only want the reviews that occur after each person's
# initial encounter with the food type. Conceptually:
#   if (userid == reviews_30$user_id) && (review is not before the initial date)
#   then keep the review
# Preparation: sort the reviews by user and date, then build copies of both
# tables whose key columns are converted from UTF-8 to ASCII.
reviews_30 <- reviews_30[order(reviews_30$user_id, reviews_30$date), ]
reviews_30_ascii <- reviews_30
for (column in c("user_id", "review_id", "date", "business_id", "stars")) {
  reviews_30_ascii[[column]] <- iconv(reviews_30[[column]], from = "UTF-8", to = "ASCII")
}
# Snapshot of every review of the users of interest, taken before any
# date filtering is applied to reviews_30_ascii
all_reviews_30 <- reviews_30_ascii
agg_reviews_30_ascii <- agg_reviews_30
for (column in c("user_id", "review_id", "business_id", "foodtype")) {
  agg_reviews_30_ascii[[column]] <- iconv(agg_reviews_30[[column]], from = "UTF-8", to = "ASCII")
}
# Filter out reviews written before the user's initial-encounter review.
# For every review, look up the first row of agg_reviews_30_ascii with the
# same user_id and compare the two dates. This is done for the whole table
# at once instead of deleting rows one by one inside a loop, which copied
# the data frame on every deletion and stopped filtering altogether at the
# first user without an initial review.
initial_row <- match(reviews_30_ascii$user_id, agg_reviews_30_ascii$user_id)
review_day <- as.Date(as.character(reviews_30_ascii$date))
initial_day <- as.Date(as.character(agg_reviews_30_ascii$date[initial_row]))
# A review is dropped only when both dates are known and it is strictly
# earlier than the initial review; rows with an unknown user or date are kept.
before_initial <- !is.na(review_day) & !is.na(initial_day) & review_day < initial_day
reviews_30_ascii <- reviews_30_ascii[!before_initial, ]
# write.csv(reviews_30_ascii,"date_filtered_reviews3.csv")
# write.csv(agg_reviews_30_ascii, "users_initial_reviews3.csv")
# if (!is.null(reviews_30_ascii$date[i])) {
# ....
# }
# if (is.null(reviews_30_ascii$date[i])) {
# reviews_30_ascii[i,] = NULL
# i = i - 1
# }
# A good solution is to build a new data.frame so that later steps do not
# have to reference multiple frames. Planned contents:
#   user_id, initial_review_id, initial date, food type, initial rating, text,
#   list of total reviews, list of subsequent reviews, number of reviews of type
# Consider dividing reviews up into time periods.
users_and_reviews <- data.frame(
  user_id = agg_reviews_30[["user_id"]],
  review_id = agg_reviews_30[["review_id"]],
  initial_rating = agg_reviews_30[["stars"]],
  initial_date = agg_reviews_30[["date"]],
  food_type = agg_reviews_30[["foodtype"]],
  text = agg_reviews_30[["text"]]
)
# After this, we want to count the reviews and get a sense of how many were
# written after the initial one. First annotate every remaining review with
# the user's initial food type and with the categories of the reviewed
# business.

# Look up `values[rows]` as text; a missing value becomes the string "NA",
# which is what toString() yields for a single NA.
label_at <- function(values, rows) {
  labels <- as.character(values[rows])
  labels[is.na(labels)] <- "NA"
  labels
}

# First initial-review row for each review's user
user_row <- match(reviews_30_ascii$user_id, agg_reviews_30_ascii$user_id)
reviews_30_ascii$initial <- label_at(agg_reviews_30_ascii$foodtype, user_row)
reviews_30_ascii$initial2 <- label_at(agg_reviews_30_ascii$foodtype2, user_row)

# Business row for each review.
# NOTE(review): at this point `businesses` is the table read from
# "nc_businesses.csv", so reviews of businesses missing from it get "NA"
# categories - confirm that this is intended.
business_row <- match(reviews_30_ascii$business_id, businesses$business_id)
for (column in c("categories1", "categories2", "categories3", "categories4")) {
  reviews_30_ascii[[column]] <- label_at(businesses[[column]], business_row)
}
# write.csv(reviews_30_ascii, "date_filtered_reviews_with_types3.csv")
# Flag (1/0) every review whose business lists the user's initial food type
# in any of its four category columns. Values are compared as text, with a
# missing value rendered as the string "NA" (as toString() does).
# NOTE(review): an "NA" initial type therefore matches an "NA" category.
as_label <- function(values) {
  labels <- as.character(values)
  labels[is.na(labels)] <- "NA"
  labels
}
initial_type <- as_label(reviews_30_ascii$initial)
same_type <- Reduce(
  `|`,
  lapply(
    c("categories1", "categories2", "categories3", "categories4"),
    function(column) initial_type == as_label(reviews_30_ascii[[column]])
  )
)
reviews_30_ascii$match <- as.numeric(same_type)
# Count, for every user in users_and_reviews, how many of their reviews in
# reviews_30_ascii are flagged as matching the initial food type.
# The flags are summed once per user with tapply() instead of comparing
# every user against every review in a nested loop; users without any
# review keep a count of 0, and NA ids no longer abort the computation.
users_and_reviews$number <- 0
matches_by_user <- tapply(
  as.numeric(reviews_30_ascii$match),
  as.character(reviews_30_ascii$user_id),
  sum
)
total_slot <- match(as.character(users_and_reviews$user_id), names(matches_by_user))
has_reviews <- !is.na(total_slot)
users_and_reviews$number[has_reviews] <- as.vector(matches_by_user)[total_slot[has_reviews]]
# Find the average rating of each user and save it into the users'
# initial-review table.
# The star ratings in reviews_30 are stored as text (the script converts
# them with iconv() elsewhere), and mean() of text or a factor is NA, so
# they are converted to numbers first. Users with no reviews get NA.
star_values <- as.numeric(as.character(reviews_30$stars))
mean_stars_by_user <- tapply(star_values, as.character(reviews_30$user_id), mean)
mean_slot <- match(as.character(agg_reviews_30$user_id), names(mean_stars_by_user))
agg_reviews_30$averagestars <- as.vector(mean_stars_by_user)[mean_slot]
# Export the per-user summary and the annotated, date-filtered reviews.
# NOTE(review): the summary is written before the "- 1" adjustment below,
# so the CSV holds the unadjusted counts - confirm this is intended.
write.csv(users_and_reviews, "users_and_reviews2.csv")
write.csv(reviews_30_ascii, "date_filtered_reviews_with_types2.csv")
# Subtract one from every count (presumably to exclude the initial review
# itself - confirm), then regress the count on the initial rating.
users_and_reviews$number <- users_and_reviews$number - 1
rating_model <- lm(users_and_reviews$number ~ users_and_reviews$initial_rating)
summary(rating_model)
###### Model 2
# Keep the first 308107 users after sorting by `yelping_since`.
# NOTE(review): the original comment read "Filter user joined after 2012/12",
# but an ascending sort keeps the earliest joiners, and the hard-coded row
# count was presumably read off the data - confirm both.
users_edited <- users_wofdlist
users_edited2 <- users_edited[order(users_edited$yelping_since), ]
# head() does not pad with NA rows when the table is shorter than the cut-off
users_edited2 <- head(users_edited2, 308107)
# We only want users with more than 30 reviews.
# users_wofdlist is built from unlist()-flattened JSON records, so
# review_count is expected to be text; convert it before comparing, otherwise
# the comparison is alphabetical ("5" > "30") or NA for a factor.
review_totals <- as.numeric(as.character(users_edited2$review_count))
users_edited30 <- users_edited2[which(review_totals > 30), ]
# We only care about reviews from 2013 - 2015.
# Dates are compared as day numbers since 1970-01-01 (15706 = 2013-01-01).
reviews2 <- reviews[order(reviews$date), ]
review_day <- as.integer(as.Date(as.character(reviews2$date)))
in_window <- which(review_day > 15705)
reviews2 <- reviews2[in_window, ]
review_day <- review_day[in_window]
# We want to count the number of reviews each user has made in 2013, 2014
# and 2015. (The column names keep their original spelling "reivews_*".)
x2013 <- 15706
x2014 <- 16071
x2015 <- 16436
# Each review is mapped to its user once and tallied per period. The earlier
# per-user loop took length() of a data frame (its number of columns, not
# rows) and compared text dates with day numbers, so its counts were wrong.
user_ids <- as.character(users_edited30$user_id)
distinct_ids <- unique(user_ids)
reviewer <- match(as.character(reviews2$user_id), distinct_ids)
count_reviews <- function(in_period) {
  counted <- which(in_period & !is.na(reviewer))
  per_user <- tabulate(reviewer[counted], nbins = length(distinct_ids))
  per_user[match(user_ids, distinct_ids)]
}
users_edited30$reivews_2013 <- count_reviews(review_day < x2014)
users_edited30$reivews_2014 <- count_reviews(review_day >= x2014 & review_day < x2015)
users_edited30$reivews_2015 <- count_reviews(review_day >= x2015)
##### All reviews of the users of interest are saved in all_reviews_30
# Now we want to record, for every one of those reviews, the user's initial
# food type and the categories of the restaurant that was visited.

# Look up `values[rows]` as text; a missing value becomes the string "NA",
# which is what toString() yields for a single NA.
type_label <- function(values, rows) {
  labels <- as.character(values[rows])
  labels[is.na(labels)] <- "NA"
  labels
}

# First initial-review row for each review's user
initial_review_row <- match(all_reviews_30$user_id, agg_reviews_30_ascii$user_id)
all_reviews_30$initial <- type_label(agg_reviews_30_ascii$foodtype, initial_review_row)
all_reviews_30$initial2 <- type_label(agg_reviews_30_ascii$foodtype2, initial_review_row)

# Business row for each review.
# NOTE(review): `businesses` is the table read from "nc_businesses.csv" at
# this point, so businesses missing from it get "NA" categories - confirm.
reviewed_business_row <- match(all_reviews_30$business_id, businesses$business_id)
for (column in c("categories1", "categories2", "categories3", "categories4")) {
  all_reviews_30[[column]] <- type_label(businesses[[column]], reviewed_business_row)
}

write.csv(users_30, "users_30.csv")
write.csv(all_reviews_30, "all_users_30_reviews_with_types.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment