-
-
Save anonymous/052c65ac01943dafe782415f2ce10e61 to your computer and use it in GitHub Desktop.
Yelp Data Conversion and Cleaning Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Kevin Tse
# UChicago Economics
# May 2015
# Data conversion and cleaning code for Yelp dataset
# install.packages("mosaic")
# install.packages("rjson")
require(rjson) | |
require(mosaic) | |
require(plyr) | |
# Converting the JSON files to .csv ----

# Read a JSON-lines file (one JSON object per line) into a data frame with one
# row per line.
#
# Each parsed record is flattened with unlist() and turned into a one-row
# matrix; ldply() then stacks those rows. Column names therefore come from
# unlist()'s naming of the flattened fields rather than from a fixed schema.
#
# path: path of the JSON-lines file to read.
# Returns the data frame built by ldply().
read_json_lines <- function(path) {
  con <- file(path, "r")
  # Close the connection even when readLines() or the parsing below fails.
  on.exit(close(con), add = TRUE)
  json_lines <- readLines(con, -1L)
  # Drop blank lines (e.g. a trailing newline) so they are not sent to the
  # JSON parser.
  json_lines <- json_lines[nzchar(trimws(json_lines))]
  ldply(lapply(json_lines, function(x) t(unlist(fromJSON(x)))))
}

# Reviews
reviewfile <- "yelp_academic_dataset_review.json"
reviews <- read_json_lines(reviewfile)
save(reviews, file = "reviews.rdata")
write.csv(reviews, file = "reviews.csv")
# This works but does not give proper column names.
# They can be edited manually afterwards.

# Check-ins
checkinfile <- "yelp_academic_dataset_checkin.json"
checkins <- read_json_lines(checkinfile)
save(checkins, file = "checkins.rdata")
write.csv(checkins, file = "checkins.csv")

# Business
businessfile <- "yelp_academic_dataset_business.json"
businesses <- read_json_lines(businessfile)
save(businesses, file = "businesses.rdata")
write.csv(businesses, file = "businesses.csv")

# Tip
tipfile <- "yelp_academic_dataset_tip.json"
tips <- read_json_lines(tipfile)
save(tips, file = "tips.rdata")
write.csv(tips, file = "tips.csv")

# User
userfile <- "yelp_academic_dataset_user.json"
users <- read_json_lines(userfile)
# Keep only the first seven columns of the user table and discard the rest.
# Selecting the columns directly (instead of deleting from the last column
# down to the 8th) also works when the table has seven or fewer columns.
users_wofdlist <- users[, seq_len(min(7L, ncol(users))), drop = FALSE]
save(users_wofdlist, file = "users_wofdlist.rdata")
write.csv(users_wofdlist, file = "users_wofdlist.csv")
##############
# Geographic filter ----
# The initial trial only uses businesses in NC (Charlotte metro area).
nc_businesses <- businesses[businesses[["state"]] == "NC", ]
write.csv(nc_businesses, file = "nc_businesses.csv")

# Filter by type of restaurant.
# From here on `businesses` is the NC subset as re-read from the CSV just
# written, no longer the full business table.
businesses <- read.csv("nc_businesses.csv")
# Add one flag column per cuisine: TRUE when any of the business's first four
# category fields equals the cuisine label. Rows that do not match are not set
# to FALSE; they are simply left unassigned (NA) in the flag column.
cuisine_flags <- c(
  korea = "Korean",
  chinese = "Chinese",
  indian = "Indian",
  mexican = "Mexican",
  japanese = "Japanese",
  greek = "Greek",
  vietnamese = "Vietnamese",
  thai = "Thai",
  asianfusion = "Asian Fusion",
  mediterranean = "Mediterranean"
)
category_cols <- c("categories1", "categories2", "categories3", "categories4")
for (flag in names(cuisine_flags)) {
  label <- cuisine_flags[[flag]]
  # Element-wise OR over the four category columns.
  is_cuisine <- Reduce(
    `|`,
    lapply(category_cols, function(col) businesses[[col]] == label)
  )
  businesses[[flag]][is_cuisine] <- TRUE
}
# One subset of `businesses` per cuisine flag. The trailing numbers are the
# restaurant counts noted in the original run.
bizkor <- businesses[businesses$korea %in% TRUE, ] # 9 restaurants
bizchina <- businesses[businesses$chinese %in% TRUE, ] # 147
bizindia <- businesses[businesses$indian %in% TRUE, ] # 38
bizmex <- businesses[businesses$mexican %in% TRUE, ] # 179
bizjapan <- businesses[businesses$japanese %in% TRUE, ] # 75
bizgreek <- businesses[businesses$greek %in% TRUE, ] # 41
bizvietnam <- businesses[businesses$vietnamese %in% TRUE, ] # 27
bizthai <- businesses[businesses$thai %in% TRUE, ] # 24
bizafusion <- businesses[businesses$asianfusion %in% TRUE, ] # 49
bizmed <- businesses[businesses$mediterranean %in% TRUE, ] # 42

# Business IDs of the Asian fusion subset.
ids <- bizafusion[["business_id"]]

# Total review count per cuisine, used to judge whether a cuisine has enough
# reviews to be a viable choice. The trailing numbers are the totals noted in
# the original run.
revkor <- sum(bizkor[["review_count"]]) # 399
revchina <- sum(bizchina[["review_count"]]) # 3732
revindia <- sum(bizindia[["review_count"]]) # 2014
revmex <- sum(bizmex[["review_count"]]) # 8161
revjapan <- sum(bizjapan[["review_count"]]) # 3769
revgreek <- sum(bizgreek[["review_count"]]) # 1544
revvietnam <- sum(bizvietnam[["review_count"]]) # 2172
revthai <- sum(bizthai[["review_count"]]) # 2123
revafusion <- sum(bizafusion[["review_count"]]) # 4647
revmed <- sum(bizmed[["review_count"]]) # 2644
# afusion, greek, japan, mex, india, med
######################
# Asian fusion restaurants within NC ----
# Pull the reviews of these restaurants and search their text for phrases that
# suggest a first-time eater of Asian fusion food.

# Columns kept as the useful information on each Asian fusion restaurant.
business_info_cols <- c(
  "business_id", "full_address", "categories1", "categories2", "categories3",
  "categories4", "review_count", "name", "stars", "attributes.Drive.Thru",
  "attributes.Ambience.hipster", "attributes.Ambience.touristy",
  "attributes.Accepts.Credit.Cards", "attributes.Price.Range",
  "attributes.Smoking", "attributes.Wi.Fi", "attributes.BYOB",
  "attributes.Music.live", "attributes.Music.karaoke"
)
bizafusion1 <- bizafusion[, business_info_cols, drop = FALSE]
write.csv(bizafusion1, "bizfusion.csv")

# List of business IDs (plain character vector, plus an ASCII-converted copy).
bizafuzid <- as.vector(bizafusion1$business_id)
bizafuzid2 <- iconv(bizafuzid, from = "UTF-8", to = "ASCII")

# Exact ID membership instead of a regex pasted together from the IDs: with an
# empty ID list the pasted pattern is "" and would match every review, and an
# ID would otherwise be interpreted as a regular expression.
afuzreviews <- reviews[reviews$business_id %in% bizafuzid, ]
write.csv(afuzreviews, "afuzreviews.csv")

# Keep reviews whose text contains one of the "first time" phrases.
words <- c("first time", "never been", "maiden")
afuzmatchedreviews <- afuzreviews[
  grep(paste(words, collapse = "|"), afuzreviews$text, ignore.case = TRUE),
]

# Then drop reviews that mention any of the excluded words.
badwords <- c("chinese", "japanese", "sushi") # according to lindsey
# The original repeated this statement with `collapse` misspelt, so the words
# were never joined into one pattern and the correct result was overwritten;
# only the correct form is kept.
afuzmatchedreviews2 <- afuzmatchedreviews[
  grep(
    paste(badwords, collapse = "|"),
    afuzmatchedreviews$text,
    ignore.case = TRUE,
    invert = TRUE
  ),
]
######################
# Mediterranean restaurants within NC ----
# Same procedure as for Asian fusion, applied to the Mediterranean subset.

# Columns kept as the useful information on each Mediterranean restaurant.
med_info_cols <- c(
  "business_id", "full_address", "categories1", "categories2", "categories3",
  "categories4", "review_count", "name", "stars", "attributes.Drive.Thru",
  "attributes.Ambience.hipster", "attributes.Ambience.touristy",
  "attributes.Accepts.Credit.Cards", "attributes.Price.Range",
  "attributes.Smoking", "attributes.Wi.Fi", "attributes.BYOB",
  "attributes.Music.live", "attributes.Music.karaoke"
)
bizmed1 <- bizmed[, med_info_cols, drop = FALSE]
write.csv(bizmed1, "bizmed.csv")

# List of business IDs (plain character vector, plus an ASCII-converted copy).
bizmedid <- as.vector(bizmed1$business_id)
bizmedid2 <- iconv(bizmedid, from = "UTF-8", to = "ASCII")

# Exact ID membership instead of a regex pasted together from the IDs (an
# empty ID list would otherwise produce the pattern "" and match every review).
medreviews <- reviews[reviews$business_id %in% bizmedid, ]
# Written to its own file: the original wrote to "afuzreviews.csv", which
# overwrote the Asian fusion export.
write.csv(medreviews, "medreviews.csv")

# Keep reviews whose text contains one of the "first time" phrases.
words <- c("first time", "never been", "maiden", "new experience")
medmatchedreviews <- medreviews[
  grep(paste(words, collapse = "|"), medreviews$text, ignore.case = TRUE),
]
####################
# Phrase matching over the entire review set, regardless of location ----
# For every food term below, collect the reviews whose text contains
# "never had <term>", "never eaten <term>" or "first time eating <term>"
# (case-insensitive). For a prefix P the matches are stored in the variable
# P"matchedreviews" and written to the file P"reviews.csv".
first_time_terms <- list(
  kor = "korean",
  chn = "chinese",
  ind = "indian",
  viet = "viet",
  greek = "greek",
  jp = "japanese",
  thai = "thai",
  mongolian = "mongolian",
  turkish = "turkish",
  ethiopian = "ethiopian",
  brazilian = "brazilian",
  sushi = "sushi",
  malaysian = "malaysian",
  tibetan = "tibetan",
  taiwanese = "taiwanese",
  dimsum = c("dim sum", "dimsum"),
  indonesian = "indonesian",
  falafel = "falafel",
  curry = "curry",
  kimchi = "kimchi",
  philippine = "philippine"
)
phrase_stems <- c("never had", "never eaten", "first time eating")

for (prefix in names(first_time_terms)) {
  # All three stems for the first spelling, then for the next one, and so on.
  words <- unlist(lapply(
    first_time_terms[[prefix]],
    function(term) paste(phrase_stems, term)
  ))
  matched <- reviews[
    grep(paste(words, collapse = "|"), reviews$text, ignore.case = TRUE),
  ]
  assign(paste0(prefix, "matchedreviews"), matched)
  write.csv(matched, paste0(prefix, "reviews.csv"))
}
#################
# Hand-checked first-encounter reviews ----
# Each textual match from the previous step was inspected manually to find the
# reviews that really describe the user's first encounter with that type of
# food. Only those entries were kept, in the CSV files read below.

# Read one hand-checked review file and label every row with its food type.
#
# path:      CSV file of hand-checked reviews.
# foodtype:  value for the `foodtype` column.
# foodtype2: value for the `foodtype2` column ("Asian" / "Non-Asian").
# Returns the data frame with both label columns added.
read_edited_reviews <- function(path, foodtype, foodtype2) {
  edited <- read.csv(path)
  # rep() keeps the assignment valid for a file with zero data rows.
  edited$foodtype <- rep(foodtype, nrow(edited))
  edited$foodtype2 <- rep(foodtype2, nrow(edited))
  edited
}

# bra_reviews_edited <- read.csv("Brazilianfixed.csv")
# bra_reviews_edited$foodtype = "Brazilian"
ethio_reviews_edited <- read_edited_reviews("ethiopianfixed.csv", "Ethiopian", "Non-Asian")
# The original set foodtype2 on the Ethiopian frame a second time here, so the
# Greek rows never received a foodtype2 value.
greek_reviews_edited <- read_edited_reviews("greekfixed.csv", "Greek", "Non-Asian")
ind_reviews_edited <- read_edited_reviews("indfixed.csv", "Indian", "Asian")
jp_reviews_edited <- read_edited_reviews("jpfixed.csv", "Japanese", "Asian")
kor_reviews_edited <- read_edited_reviews("korreviewsfixed.csv", "Korean", "Asian")
# NOTE(review): the Mongolian reviews are loaded but are not part of the
# aggregate built below - confirm whether that exclusion is intended.
mongol_reviews_edited <- read_edited_reviews("mongolianfixed.csv", "Mongolian", "Asian")
thai_reviews_edited <- read_edited_reviews("thaifixed.csv", "Thai", "Asian")
# turk_reviews_edited <- read.csv("turkishfixed.csv")
# turk_reviews_edited$foodtype = "Turkish"
viet_reviews_edited <- read_edited_reviews("vietfixed.csv", "Vietnamese", "Asian")

### Additional
# Tibetan, Taiwanese and dim sum reviews are labelled "Chinese"; sushi is
# labelled "Japanese", curry "Indian" and falafel "Middle Eastern".
tibetan_reviews_edited <- read_edited_reviews("editedtibetanreviews.csv", "Chinese", "Asian")
taiwanese_reviews_edited <- read_edited_reviews("editedtaiwanesereviews.csv", "Chinese", "Asian")
sushi_reviews_edited <- read_edited_reviews("editedsushireviews.csv", "Japanese", "Asian")
philippine_reviews_edited <- read_edited_reviews("editedphilippinereviews.csv", "Philippine", "Asian")
malaysian_reviews_edited <- read_edited_reviews("editedmalaysianreviews.csv", "Malaysian", "Asian")
indonesian_reviews_edited <- read_edited_reviews("editedindonesianreviews.csv", "Indonesian", "Asian")
falafel_reviews_edited <- read_edited_reviews("editedfalafelreviews.csv", "Middle Eastern", "Asian")
dimsum_reviews_edited <- read_edited_reviews("editeddimsumreviews.csv", "Chinese", "Asian")
curry_reviews_edited <- read_edited_reviews("editedcurryreviews.csv", "Indian", "Asian")

# Aggregate all of the above into one data.frame. merge(all = TRUE) joins on
# the columns the frames have in common and keeps the rows of both sides.
agg_reviews <- Reduce(
  function(x, y) merge(x, y, all = TRUE),
  list(
    ethio_reviews_edited, greek_reviews_edited, ind_reviews_edited,
    jp_reviews_edited, kor_reviews_edited, thai_reviews_edited,
    viet_reviews_edited, tibetan_reviews_edited, taiwanese_reviews_edited,
    sushi_reviews_edited, philippine_reviews_edited, malaysian_reviews_edited,
    indonesian_reviews_edited, falafel_reviews_edited, dimsum_reviews_edited,
    curry_reviews_edited
  )
)
users_edited <- read.csv("users_wofdlist.csv")
# bra_ids <- bra_reviews_edited$user_id
# ethio_ids <- ethio_reviews_edited$user_id
# greek_ids <- greek_reviews_edited$user_id
# ind_ids <- ind_reviews_edited$user_id
# jp_ids <- jp_reviews_edited$user_id
# kor_ids <- kor_reviews_edited$user_id
# mongol_ids <- mongol_reviews_edited$user_id
# thai_ids <- thai_reviews_edited$user_id
# turk_ids <- turk_reviews_edited$user_id
# viet_ids <- viet_reviews_edited$user_id
agg_ids <- agg_reviews$user_id

# Extract the account information of every user who left a review after an
# initial encounter with a food type.
# IDs are compared by exact membership rather than through a regex pasted
# together from the IDs: a missing ID would become the substring "NA", an empty
# ID list would become the pattern "" (matching every row), and the pattern
# grows with the number of IDs.
users_of_interest <- users_edited[users_edited$user_id %in% na.omit(agg_ids), ]

# Narrow the list to users with more than 30 reviews. which() drops rows whose
# review_count is missing instead of turning them into all-NA rows.
users_30 <- users_of_interest[which(users_of_interest$review_count > 30), ]

# Using the user IDs of that list, filter the first-encounter reviews again.
agg_reviews_30 <- agg_reviews[agg_reviews$user_id %in% na.omit(users_30$user_id), ]
agg_reviews_30 <- unique(agg_reviews_30)
# Duplicate user IDs in the above data still need checking
# (there were 3 at the time of the original run).

# At this point the users and their account info are in users_30, and their
# first review and food type are in agg_reviews_30.
# Back-ups:
# write.csv(users_30, "users_30.csv")
# write.csv(agg_reviews_30, "agg_reviews_30.csv")

# Now extract all the reviews written by each of those users.
reviews_30 <- reviews[reviews$user_id %in% na.omit(users_30$user_id), ]
backup <- reviews_30
# This cut us down to over 11k reviews (original run).
# write.csv(reviews_30, "reviews_30.csv")
# Next step: keep only the reviews written after each person's initial
# encounter with the food type, i.e. for a review of the same user,
# compare its date with the date of the initial review.
# Preparation: sort by user and date, then make ASCII copies of the key columns.
reviews_30 <- reviews_30[order(reviews_30$user_id, reviews_30$date), ]

review_key_cols <- c("user_id", "review_id", "date", "business_id", "stars")
reviews_30_ascii <- reviews_30
reviews_30_ascii[review_key_cols] <- lapply(
  reviews_30[review_key_cols],
  iconv,
  from = "UTF-8",
  to = "ASCII"
)
# Snapshot of every review of the users of interest, taken before any date
# filtering is applied to reviews_30_ascii.
all_reviews_30 <- reviews_30_ascii

initial_key_cols <- c("user_id", "review_id", "business_id", "foodtype")
agg_reviews_30_ascii <- agg_reviews_30
agg_reviews_30_ascii[initial_key_cols] <- lapply(
  agg_reviews_30[initial_key_cols],
  iconv,
  from = "UTF-8",
  to = "ASCII"
)
# Filtering out reviews that are before the initial date ----
# For each review, look up the (first) initial-encounter review of the same
# user and drop the review when it is dated earlier than that initial review.
#
# Done in one vectorised pass. A review whose user has no initial review, or
# whose date (or initial date) is missing, cannot be judged and is kept; it no
# longer stops the filtering of the rows after it.
initial_idx <- match(reviews_30_ascii$user_id, agg_reviews_30_ascii$user_id)
review_date <- as.Date(reviews_30_ascii$date)
initial_date <- as.Date(agg_reviews_30_ascii$date)[initial_idx]
predates_initial <- !is.na(review_date) & !is.na(initial_date) &
  review_date < initial_date
reviews_30_ascii <- reviews_30_ascii[!predates_initial, ]
# write.csv(reviews_30_ascii,"date_filtered_reviews3.csv") | |
# write.csv(agg_reviews_30_ascii, "users_initial_reviews3.csv") | |
# if (!is.null(reviews_30_ascii$date[i])) { | |
# .... | |
# } | |
# if (is.null(reviews_30_ascii$date[i])) { | |
# reviews_30_ascii[i,] = NULL | |
# i = i - 1 | |
# } | |
# Build one data.frame per initial review so that later steps do not have to
# reference several frames. Planned contents: user_id, initial review id,
# initial date, food type, initial rating, text, list of all reviews, list of
# subsequent reviews, number of reviews of the type.
# Idea for later: consider dividing the reviews up into time periods.
users_and_reviews <- data.frame(
  user_id = agg_reviews_30[["user_id"]],
  review_id = agg_reviews_30[["review_id"]],
  initial_rating = agg_reviews_30[["stars"]],
  initial_date = agg_reviews_30[["date"]],
  food_type = agg_reviews_30[["foodtype"]],
  text = agg_reviews_30[["text"]]
)
# After this, we want to count the reviews and get a sense of how many were
# written after the initial one. First annotate every remaining review with
# the user's initial food type and the categories of the reviewed business.

# Look up values[idx] as text, one result per element of idx.
# A failed lookup (idx is NA, or the value found is NA) gives the literal
# string "NA", and an absent column (values is NULL) gives "", which is what
# the previous per-row toString() calls produced.
lookup_text <- function(values, idx) {
  if (is.null(values)) {
    return(rep("", length(idx)))
  }
  found <- as.character(values[idx])
  found[is.na(found)] <- "NA"
  found
}

# One match() per table instead of one per row; this also works when
# reviews_30_ascii has no rows.
reviewer_idx <- match(reviews_30_ascii$user_id, agg_reviews_30_ascii$user_id)
reviews_30_ascii$initial <- lookup_text(agg_reviews_30_ascii$foodtype, reviewer_idx)
reviews_30_ascii$initial2 <- lookup_text(agg_reviews_30_ascii$foodtype2, reviewer_idx)

# NOTE(review): earlier in this script `businesses` is replaced by the NC
# subset re-read from nc_businesses.csv, so reviews of businesses outside that
# subset get "NA" categories here - confirm that this is intended.
business_idx <- match(reviews_30_ascii$business_id, businesses$business_id)
for (category_col in c("categories1", "categories2", "categories3", "categories4")) {
  reviews_30_ascii[[category_col]] <- lookup_text(businesses[[category_col]], business_idx)
}
# write.csv(reviews_30_ascii, "date_filtered_reviews_with_types3.csv")

# match = 1 when one of the four business categories equals the user's initial
# food type, otherwise 0.
initial_type <- as.character(reviews_30_ascii$initial)
# A missing initial type is stored as the text "NA" (or ""), and so is a
# missing category; such placeholders must not count as a match with each other.
has_initial <- !is.na(initial_type) & !(initial_type %in% c("NA", ""))
same_type <- Reduce(
  `|`,
  lapply(
    c("categories1", "categories2", "categories3", "categories4"),
    function(category_col) {
      category <- as.character(reviews_30_ascii[[category_col]])
      !is.na(category) & category == initial_type
    }
  )
)
reviews_30_ascii$match <- as.numeric(has_initial & same_type)
# number = for each initial review, the sum of `match` over all remaining
# reviews written by the same user.
# The per-user sums are computed once and then looked up by user id, replacing
# a loop over every (user, review) pair. Reviews with a missing user id are
# ignored, and users without any review get 0.
match_per_user <- tapply(
  as.numeric(reviews_30_ascii$match),
  as.character(reviews_30_ascii$user_id),
  sum
)
match_count <- as.numeric(
  unname(match_per_user[as.character(users_and_reviews$user_id)])
)
match_count[is.na(match_count)] <- 0
users_and_reviews$number <- match_count
# Find the average rating of each user and store it next to the user's initial
# review in agg_reviews_30.
# `stars` is converted to numbers first, going through as.character() so that
# a factor yields its labels rather than its level codes. The review table is
# built from flattened JSON records, so the column may hold text, and mean()
# on text or a factor only returns NA.
review_stars <- as.numeric(as.character(reviews_30$stars))
stars_per_user <- tapply(review_stars, as.character(reviews_30$user_id), mean)
# Users without any review in reviews_30 get NA.
agg_reviews_30$averagestars <- as.numeric(
  unname(stars_per_user[as.character(agg_reviews_30$user_id)])
)

write.csv(users_and_reviews, "users_and_reviews2.csv")
write.csv(reviews_30_ascii, "date_filtered_reviews_with_types2.csv")

# NOTE(review): the count is reduced by one only after the CSV above has been
# written, so the file holds the unadjusted count - presumably the subtraction
# discounts the initial review itself; confirm.
users_and_reviews$number <- users_and_reviews$number - 1
summary(lm(users_and_reviews$number ~ users_and_reviews$initial_rating))
###### Model 2
# Filter user joined after 2012/12
# NOTE(review): the code below keeps the FIRST 308107 users in ascending
# yelping_since order; the row count is hard-coded for the original dataset -
# confirm that this selects the intended join-date range.
users_edited <- users_wofdlist
users_edited2 <- users_edited[order(users_edited$yelping_since), ]
# head() does not create all-NA rows when the table is shorter than the cut-off.
users_edited2 <- head(users_edited2, 308107)

# We only want users with more than 30 reviews. The in-memory user table is
# built from flattened JSON records, so review_count may be text; compare it as
# a number ("5" > 30 is TRUE when compared as text).
user_review_count <- as.numeric(as.character(users_edited2$review_count))
users_edited30 <- users_edited2[which(user_review_count > 30), ]

# Day numbers (days since 1970-01-01) of 1 January 2013, 2014 and 2015.
x2013 <- 15706
x2014 <- 16071
x2015 <- 16436

# We only care about reviews 2013 - 2015: sort by date and keep the reviews
# dated on or after 1 January 2013. which() drops reviews with a missing date.
reviews2 <- reviews[order(reviews$date), ]
reviews2 <- reviews2[which(as.integer(as.Date(reviews2$date)) >= x2013), ]
review_day <- as.integer(as.Date(reviews2$date))

# Count the reviews each user made in 2013, 2014 and 2015.
# (The `reivews_*` column names are kept as spelt in the original.)
# Dates are compared as day numbers, and rows are counted: length() of a
# data frame would count its columns.
reviewer_idx <- match(reviews2$user_id, users_edited30$user_id)

# Number of reviews per row of users_edited30 among the reviews selected by
# the logical vector `in_period`.
count_reviews <- function(in_period) {
  tabulate(
    reviewer_idx[in_period & !is.na(reviewer_idx)],
    nbins = nrow(users_edited30)
  )
}
users_edited30$reivews_2013 <- count_reviews(review_day < x2014)
users_edited30$reivews_2014 <- count_reviews(review_day >= x2014 & review_day < x2015)
users_edited30$reivews_2015 <- count_reviews(review_day >= x2015)
##### All reviews of the users of interest are saved in all_reviews_30
# Record the user's initial food type and the categories of every restaurant
# the user has reviewed.

# Look up values[idx] as text, one result per element of idx.
# A failed lookup (idx is NA, or the value found is NA) gives the literal
# string "NA", and an absent column (values is NULL) gives "", which is what
# the previous per-row toString() calls produced.
lookup_text <- function(values, idx) {
  if (is.null(values)) {
    return(rep("", length(idx)))
  }
  found <- as.character(values[idx])
  found[is.na(found)] <- "NA"
  found
}

# One match() per table instead of one per row; this also works when
# all_reviews_30 has no rows.
all_reviewer_idx <- match(all_reviews_30$user_id, agg_reviews_30_ascii$user_id)
all_reviews_30$initial <- lookup_text(agg_reviews_30_ascii$foodtype, all_reviewer_idx)
all_reviews_30$initial2 <- lookup_text(agg_reviews_30_ascii$foodtype2, all_reviewer_idx)

# NOTE(review): earlier in this script `businesses` is replaced by the NC
# subset re-read from nc_businesses.csv, so reviews of businesses outside that
# subset get "NA" categories here - confirm that this is intended.
all_business_idx <- match(all_reviews_30$business_id, businesses$business_id)
for (category_col in c("categories1", "categories2", "categories3", "categories4")) {
  all_reviews_30[[category_col]] <- lookup_text(businesses[[category_col]], all_business_idx)
}

write.csv(users_30, "users_30.csv")
write.csv(all_reviews_30, "all_users_30_reviews_with_types.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment