Skip to content

Instantly share code, notes, and snippets.

Created Jun 3, 2016
Embed
What would you like to do?
Yelp Regression
# Johnny Ma
# Yelp Food Taste Formation Project
# 6-1-2016
##################
# Section 1: Reading Data and Instantiation
##################
# Workspace setup.
# The bare `rm()` call that used to open this section named no objects and
# therefore removed nothing, so it has been dropped.
library(stringr)
# Install stargazer only when it is missing: an unconditional
# install.packages() re-downloads the package on every run and fails offline.
if (!requireNamespace("stargazer", quietly = TRUE)) {
  install.packages("stargazer")
}
library(stargazer)
# Alternative project locations that have been used with this script:
# setwd("~/Desktop")
# setwd("~/Downloads")
# setwd("D:/Dropbox (Personal)/UChicago/Research/Yelp Addiction")
# NOTE(review): machine-specific path; the CSV files read by this script are
# resolved relative to it.
setwd("C:/Users/johnn/Dropbox/UChicago/Research/Yelp Addiction")
# Input tables (all read with strings kept as character vectors).
data <- read.csv("date_filtered_reviews_with_types2.csv", stringsAsFactors = FALSE) # reviews written after the initial period
biz <- read.csv("businesses.csv", stringsAsFactors = FALSE) # all businesses
initial <- read.csv("users_initial_reviews.csv", stringsAsFactors = FALSE) # initial reviews
user <- read.csv("all_users_30_reviews_with_types.csv", stringsAsFactors = FALSE) # all reviews of the >30-review users

# Review-level placeholder columns, filled in later:
#   eated        - 1 if the review is for the same type as the initial one
#   asian        - 1 if the reviewed business is Asian
#   initialasian - 1 if the initial place was Asian
#   againasian   - 1 for an Asian return visit that is not the initial review
for (flag in c("eated", "asian", "initialasian", "againasian")) {
  data[[flag]] <- 0
}
data$howmany <- 1 # how many places did they eat at

# User-level placeholder columns stored on `initial`:
#   howmany        - number of later reviews
#   avgrev         - the user's average review score
#   count12        - NOTE(review): not filled in anywhere in the visible script
#   count_bef      - Asian reviews dated before the initial review
#   count14/15     - Asian reviews in 2014 / 2015
#   location       - state of the initial business
#   user_avg_asian - average rating of earlier Asian restaurants
for (field in c("howmany", "avgrev", "count12", "count_bef", "count14",
                "count15", "location", "user_avg_asian")) {
  initial[[field]] <- 0
}

# Category labels that count as "Asian".
asian <- c("Korean", "Chinese", "Japanese", "Indian", "Vietnamese", "Thai", "Sushi", "Asian")
# Element-wise equality that treats two missing values as equal.
#   v1, v2: vectors to compare (recycled by the usual R rules).
# Returns a logical vector with no NAs: TRUE where both values are equal or
# both are NA, FALSE everywhere else (including NA versus a non-NA value).
compareNA <- function(v1, v2) {
  both_missing <- is.na(v1) & is.na(v2)
  result <- both_missing | (v1 == v2)
  # Whatever is still NA had exactly one side missing: count it as a mismatch.
  result[is.na(result)] <- FALSE
  result
}
# Attach the price range, average star rating and state of the business behind
# each initial review, looked up by business_id. Ids that do not occur in
# `biz` produce NA in all three columns.
biz_row <- match(initial$business_id, biz$business_id)
initial$price <- biz$attributes.Price.Range[biz_row]
initial$avgstar <- biz$stars[biz_row]
initial$location <- biz$state[biz_row]
# Mean star rating and mean price range across Asian businesses. They are used
# later to replace the 0s left in price15 / avgstar15 (and in user_avg_asian).
is_asian_biz <- biz$categories1 %in% asian | biz$categories2 %in% asian |
  biz$categories3 %in% asian | biz$categories4 %in% asian
count <- as.numeric(sum(is_asian_biz))
avg2015asian <- sum(biz$stars[is_asian_biz]) / count

# The price average only uses businesses that report a price range.
bizz <- biz[complete.cases(biz$attributes.Price.Range), ]
is_asian_priced <- bizz$categories1 %in% asian | bizz$categories2 %in% asian |
  bizz$categories3 %in% asian | bizz$categories4 %in% asian
count <- as.numeric(sum(is_asian_priced))
avg2015price <- sum(bizz$attributes.Price.Range[is_asian_priced]) / count
#######################
# State Level Characteristics
#######################
# One 0/1 indicator column per state of the initial business.
# `%in%` is used instead of `==` because it never returns NA: a missing
# location (business id not found in biz) now gives 0 in every indicator
# instead of aborting with "missing value where TRUE/FALSE needed".
initial$isPA <- as.numeric(initial$location %in% "PA")
initial$isNC <- as.numeric(initial$location %in% "NC")
initial$isAZ <- as.numeric(initial$location %in% "AZ")
initial$isNV <- as.numeric(initial$location %in% "NV")
initial$isWI <- as.numeric(initial$location %in% "WI")
initial$isQC <- as.numeric(initial$location %in% "QC")
# NOTE(review): isburg is created here but never set to 1 in this section.
initial$isburg <- 0
# Number of distinct Asian businesses reviewed in 2013, 2014 and 2015.
# The Asian mask is derived from the category columns right here: data$asian
# is still the all-zero placeholder at this point in the script, so filtering
# on `data$asian == 1` made all three counts 0.
n_asian_businesses_in <- function(year) {
  is_asian_review <- data$categories1 %in% asian | data$categories2 %in% asian |
    data$categories3 %in% asian | data$categories4 %in% asian
  # which() drops reviews whose date is missing instead of indexing with NA.
  hits <- which(substring(data$date, 1, 4) == year & is_asian_review)
  length(unique(data$business_id[hits]))
}
three <- n_asian_businesses_in("2013")
four <- n_asian_businesses_in("2014")
five <- n_asian_businesses_in("2015")
# Per-state gdpcap value attached to each initial review.
# NOTE(review): the units and source of these figures are not stated in the
# script; the values are copied unchanged.
state_gdpcap <- c(PA = 0, NC = 1.1, AZ = 0.7, NV = 1.3, WI = 0.3, QC = 1.45)

# Table lookup instead of a chain of `if (location == ...)` tests: a missing
# or unlisted location keeps the default 0 rather than aborting on NA.
initial$gdpcap <- 0
state_slot <- match(as.character(initial$location), names(state_gdpcap))
has_state <- !is.na(state_slot)
initial$gdpcap[has_state] <- unname(state_gdpcap[state_slot[has_state]])
# years stuff
#######################
# Adding to Data
#######################
# Flag reviews of Asian restaurants: any of the four category columns carries
# one of the labels in `asian`.
review_is_asian <- data$categories1 %in% asian | data$categories2 %in% asian |
  data$categories3 %in% asian | data$categories4 %in% asian
data$asian[review_is_asian] <- 1

# Flag reviews whose recorded initial type is Asian.
data$initialasian[data$initial %in% asian] <- 1
#######################
# Adding to Initial
#######################
# Average star rating each user gave across their rows in `data`
# (NaN for users with no rows there).
initial$avgrev <- vapply(
  initial$user_id,
  function(uid) mean(data$stars[data$user_id == uid]),
  numeric(1),
  USE.NAMES = FALSE
)

# The initial review counts as "liked" when it is above 3 stars.
# NOTE(review): `$star` presumably resolves to the `stars` column through `$`
# partial matching - confirm against the CSV header.
initial$like <- 0
initial$like[which(initial$star > 3)] <- 1
# Number of later reviews (rows of `data`) written by each initial reviewer.
# Ids are compared for exact equality. The previous str_count() call treated
# every id as a regular expression and counted substring hits, and it
# rescanned all of `data` once per user.
review_counts <- table(data$user_id)
count_slot <- match(as.character(initial$user_id), names(review_counts))
initial$howmany <- as.vector(review_counts)[count_slot]
# Users with no later reviews (or a missing id) have no table entry.
initial$howmany[is.na(count_slot)] <- 0L
# 1 when the food type of the initial restaurant is one of the Asian labels.
initial$isasian <- 0
initial$isasian[initial$foodtype %in% asian] <- 1
# Mark later reviews whose business shares the type of the initial review:
# the initial type is compared with each of the four category columns using
# compareNA(), so the comparison itself never yields NA.
same_type_as_initial <- compareNA(data$initial, data$categories1) |
  compareNA(data$initial, data$categories2) |
  compareNA(data$initial, data$categories3) |
  compareNA(data$initial, data$categories4)
data$eated[same_type_as_initial] <- 1
# Flag Asian reviews by users whose initial place was Asian, excluding the
# initial review itself (identified by identical review text).
# Each review is compared with the initial review of the SAME user. The
# previous inner loop ran over every row of `initial` regardless of user, so
# the text check passed as soon as any other user's review text differed.
# Assumes one row per user in `initial` (match() takes the first) - TODO confirm.
own_initial_row <- match(data$user_id, initial$user_id)
own_initial_text <- initial$text[own_initial_row]
asian_after_asian <- data$initialasian == 1 & data$asian == 1
# With no initial review on record (or a missing text) there is nothing for
# the review to coincide with, so it counts as a different review.
not_initial_review <- is.na(own_initial_text) | is.na(data$text) |
  own_initial_text != data$text
data$againasian[which(asian_after_asian & not_initial_review)] <- 1
initial$countreturnasian <- 0 # how many times they return to asian places

# goback (on initial): 1 if the user later reviews the same type of place.
# secondtime (on data): 1 if this same-type review is not the initial review.
# Rows of `initial` are grouped by user once, so each same-type review is only
# compared with its own user's initial review(s) rather than with every row of
# `initial`; results are unchanged, the full pairwise scan is gone.
goback_flag <- numeric(nrow(initial))
second_flag <- numeric(nrow(data))
initial_rows_by_user <- split(seq_len(nrow(initial)), initial$user_id)

for (i in which(data$eated == 1)) {
  uid <- as.character(data$user_id[i])
  if (is.na(uid)) {
    next
  }
  own_rows <- initial_rows_by_user[[uid]]
  # Keep the user's initial reviews whose text differs from this review;
  # which() drops comparisons that are NA because a text is missing.
  different <- own_rows[which(initial$text[own_rows] != data$text[i])]
  if (length(different) > 0) {
    goback_flag[different] <- 1
    second_flag[i] <- 1 # it is the second time, not the initial
  }
}

initial$goback <- goback_flag
data$secondtime <- second_flag
# Placeholder columns for the return indicator and the per-period aggregates.
initial$returnasian <- 0
initial$is_bef <- 0
initial$is2014 <- 0
initial$is2015 <- 0
initial$price_bef <- 0
initial$avgstar_bef <- 0
initial$price14 <- 0
initial$avgstar14 <- 0
initial$price15 <- 0
initial$avgstar15 <- 0

# Count each user's return visits to Asian restaurants (rows of `data` with
# againasian == 1). A per-user tally replaces the pairwise scan of data against
# initial; every row of `initial` with a given user id gets that user's total.
return_counts <- table(data$user_id[which(data$againasian == 1)])
return_slot <- match(as.character(initial$user_id), names(return_counts))
n_returns <- as.vector(return_counts)[return_slot]
n_returns[is.na(n_returns)] <- 0
initial$returnasian[n_returns > 0] <- 1
initial$countreturnasian <- initial$countreturnasian + n_returns
###########################################
# Controlling for User Characteristics
###########################################
# Flag Asian-restaurant reviews in the full per-user review table: any of the
# four category columns carries one of the labels in `asian`.
user$asian <- 0
user_review_is_asian <- user$categories1 %in% asian | user$categories2 %in% asian |
  user$categories3 %in% asian | user$categories4 %in% asian
user$asian[user_review_is_asian] <- 1
#
# for(i in 1:nrow(user))
# {
# for(j in 1:nrow(initial))
# {
# if(user$asian[i] == 1)
# {
# if((substring(user$date[i], 1, 4) == "2014"))
# {
# initial$is2014[j] <- 1
# initial$count14[j] = initial$count14[j] + 1
# index <- match(user$business_id[i], biz$business_id)
# initial$price14[j] <- initial$price14[j] + biz$attributes.Price.Range[index]
# initial$avgstar14[j] <- initial$avgstar14[j] + biz$stars[index]
# }
# if((substring(user$date[i], 1, 4) == "2015"))
# {
# initial$is2015[j] <- 1
# initial$count15[j] = initial$count15[j] + 1
# index <- match(user$business_id[j], biz$business_id)
# initial$price15[j] <- initial$price15[j] + biz$attributes.Price.Range[index]
# initial$avgstar15[j] <- initial$avgstar15[j] + biz$stars[index]
# }
# if((substring(user$date[i], 1, 4) != "2014") & (substring(user$date[i], 1, 4) != "2015"))
# {
# initial$is_bef[j] <- 1
# initial$count_bef[j] = initial$count_bef[j] + 1
# index <- match(user$business_id[i], biz$business_id)
# initial$price_bef[j] <- initial$price_bef[j] + biz$attributes.Price.Range[index]
# initial$avgstar_bef[j] <- initial$avgstar_bef[j] + biz$stars[index]
# }
# }
# }
# }
# Per-user history of Asian-restaurant reviews, taken from `user`:
#   2015 reviews            -> is2015, count15, price15, avgstar15
#   2014 reviews            -> is2014, count14, price14, avgstar14
#   reviews dated before the
#   user's initial review   -> is_bef, count_bef, price_bef, avgstar_bef,
#                              user_avg_asian
# price*/avgstar*/user_avg_asian hold SUMS at this point; they are divided by
# the matching counts further down the script.
#
# Businesses are looked up once and reviews are grouped by user once. This
# replaces a scan of every (initial row, user review) pair that repeated
# substring() and match() for each pair. The redundant `<- 0` placeholders for
# numdate are gone as well.
# NOTE(review): as before, a single review whose business has no price range
# turns the whole price sum into NA - confirm this is intended.
user$numdate <- as.numeric(as.Date(user$date))
initial$numdate <- as.numeric(as.Date(initial$date))

visit_biz_row <- match(user$business_id, biz$business_id)
visit_price <- biz$attributes.Price.Range[visit_biz_row]
visit_bizstars <- biz$stars[visit_biz_row]
visit_year <- substring(user$date, 1, 4)

# Row numbers of Asian-restaurant reviews, grouped by reviewer id.
asian_visits <- which(user$asian == 1)
visits_by_user <- split(asian_visits, user$user_id[asian_visits])

for (j in seq_len(nrow(initial))) {
  uid <- as.character(initial$user_id[j])
  if (is.na(uid)) {
    next
  }
  visits <- visits_by_user[[uid]]
  if (length(visits) == 0) {
    next
  }

  # which() drops reviews whose date is missing rather than aborting on NA.
  in_2015 <- visits[which(visit_year[visits] == "2015")]
  if (length(in_2015) > 0) {
    initial$is2015[j] <- 1
    initial$count15[j] <- initial$count15[j] + length(in_2015)
    initial$price15[j] <- initial$price15[j] + sum(visit_price[in_2015])
    initial$avgstar15[j] <- initial$avgstar15[j] + sum(visit_bizstars[in_2015])
  }

  in_2014 <- visits[which(visit_year[visits] == "2014")]
  if (length(in_2014) > 0) {
    initial$is2014[j] <- 1
    initial$count14[j] <- initial$count14[j] + length(in_2014)
    initial$price14[j] <- initial$price14[j] + sum(visit_price[in_2014])
    initial$avgstar14[j] <- initial$avgstar14[j] + sum(visit_bizstars[in_2014])
  }

  earlier <- visits[which(user$numdate[visits] < initial$numdate[j])]
  if (length(earlier) > 0) {
    initial$user_avg_asian[j] <- initial$user_avg_asian[j] + sum(user$stars[earlier])
    initial$is_bef[j] <- 1
    initial$count_bef[j] <- initial$count_bef[j] + length(earlier)
    initial$price_bef[j] <- initial$price_bef[j] + sum(visit_price[earlier])
    initial$avgstar_bef[j] <- initial$avgstar_bef[j] + sum(visit_bizstars[earlier])
  }
}
# Replace every missing value in `initial` with 0 before averaging.
initial[is.na(initial)] <- 0

# Turn the accumulated sums into per-visit averages wherever at least one
# visit was counted; rows with a zero count keep their current values.
had_2014 <- initial$count14 > 0
initial$price14[had_2014] <- initial$price14[had_2014] / initial$count14[had_2014]
initial$avgstar14[had_2014] <- initial$avgstar14[had_2014] / initial$count14[had_2014]

had_2015 <- initial$count15 > 0
initial$price15[had_2015] <- initial$price15[had_2015] / initial$count15[had_2015]
initial$avgstar15[had_2015] <- initial$avgstar15[had_2015] / initial$count15[had_2015]

had_before <- initial$count_bef > 0
initial$price_bef[had_before] <- initial$price_bef[had_before] / initial$count_bef[had_before]
initial$avgstar_bef[had_before] <- initial$avgstar_bef[had_before] / initial$count_bef[had_before]
initial$user_avg_asian[had_before] <- initial$user_avg_asian[had_before] / initial$count_bef[had_before]
# Share of a user's later reviews that are Asian return visits.
initial$porpreturn <- initial$countreturnasian / initial$howmany

# A prior Asian average below 1 is replaced with the overall Asian-restaurant
# average rating.
no_prior_average <- initial$user_avg_asian < 1
initial$user_avg_asian[no_prior_average] <- avg2015asian

# Initial rating minus the user's (possibly substituted) prior Asian average.
# NOTE(review): `$star` presumably resolves to the `stars` column through `$`
# partial matching - confirm against the CSV header.
initial$diffstar <- initial$star - initial$user_avg_asian

# shock = 1 when the initial rating is above that average.
initial$shock <- 0
initial$shock[which(initial$diffstar > 0)] <- 1
# Price range and average star rating of the business behind every later
# review, looked up by business_id (NA when the id is not in biz).
review_biz_row <- match(data$business_id, biz$business_id)
data$price2 <- biz$attributes.Price.Range[review_biz_row]
data$avgstar2 <- biz$stars[review_biz_row]
# Characteristics of the same-type return visit, stored on the user's row of
# `initial`: price2 / avgstar2 describe the business, rating2 is the rating.
initial$price2 <- 0
initial$avgstar2 <- 0
initial$rating2 <- 0

# TODO: need to consider more than one revisit. For now, when a user has
# several qualifying reviews, the one latest in `data` row order is stored.
revisit_rows <- which(data$secondtime == 1)
is_last_for_user <- !duplicated(data$user_id[revisit_rows], fromLast = TRUE)
revisit_rows <- revisit_rows[is_last_for_user]

# Only the first row of `initial` with a matching user id is written.
target_row <- match(data$user_id[revisit_rows], initial$user_id)
found <- !is.na(target_row)
initial$price2[target_row[found]] <- data$price2[revisit_rows[found]]
initial$avgstar2[target_row[found]] <- data$avgstar2[revisit_rows[found]]
initial$rating2[target_row[found]] <- data$stars[revisit_rows[found]]
# 2015 price and rating figures below 1 are replaced with the overall
# Asian-restaurant averages.
initial$price15[initial$price15 < 1] <- avg2015price
initial$avgstar15[initial$avgstar15 < 1] <- avg2015asian
########################################
# TIME TO EXPLAIN ALL THE VARIABLES
########################################
# returnasian = binary, 1 if they return to an asian restaurant after initial exposure
# stars = initial rating
# price = initial price
# avgstar = initial place's average rating
# avgrev = the reviewer's average review
# goback = binary, 1 if they return to the SAME TYPE of restaurant as initial. Eg go back to Thai if Thai was initial exposure
# countreturnasian = how many times you return to asian place
# rating2 = IF they go back to the same type of restaurant (goback == 1), the rating of the SECOND restaurant
# avgstar2, price2 = the characteristics of the SECOND restaurant
# count15 = how many times did they go to an asian restaurant in 2015
# count14 = how many times did they go to an asian restaurant in 2014
# is___ = if the reviewer is from said state.
# NOTE lm(rating2 ~ stars + price) is the regression: RATING2 = beta0 + beta1*stars + beta2*price + error
# ---- Analysis samples ----
# Users with a recorded same-type second visit.
newinitial <- subset(initial, rating2 > 0)
# Users with at least one Asian review in 2014 or 2015. An earlier
# `returnasian > 0` definition of newasian was overwritten straight away and
# never used, so it has been removed.
newasian <- subset(initial, ((initial$count14 > 0) | (initial$count15 > 0)))
initialbef2015 <- subset(initial, (substring(initial$date, 1, 4) != "2015"))
# Filter on initialbef2015's own dates. The previous mask was built from
# initial$date, which has a different length and row alignment than the
# already-subsetted frame.
initialbef2014 <- subset(initialbef2015, substring(initialbef2015$date, 1, 4) != "2014")
# Initial reviews dated 2013 or 2014.
initial2014 <- subset(initial, (substring(initial$date, 1, 4) == "2014") | (substring(initial$date, 1, 4) == "2013"))
# A dangling `initialbef2014NEW <-` used to chain onto the line above and bind
# this name to the same subset; the binding is kept, now stated explicitly.
initialbef2014NEW <- initial2014

# ---- Exploratory regressions ----
probitmodel <- glm(returnasian ~ stars + user_avg_asian + price + avgstar + isPA + isNC + isAZ, family = binomial(link = "probit"), data = initial2014) # regressing on if they return to asian restaurants
summary(probitmodel)
probitmodel2 <- glm(goback ~ stars + avgrev + isPA + isNC + isAZ + isNV + isWI + isQC, family = binomial(link = "probit"), data = initial2014) # regressing on if they go back
summary(probitmodel2)
summary(lm(porpreturn ~ stars + avgstar + price + avgrev, data = newasian)) # include
summary(lm(countreturnasian ~ stars + price + avgstar + avgrev, data = initial))
summary(lm(rating2 ~ stars + avgstar + avgstar2 + avgrev + isPA + isNC + isAZ + isNV + isWI + isQC, data = newinitial))
summary(lm(avgstar2 ~ rating2 + avgrev, data = newinitial))
summary(lm(avgstar ~ stars + price + avgrev, data = initial))
summary(lm(stars ~ avgstar + price + avgrev + isPA + isNC + isAZ + isNV + isWI + isQC, data = initial)) # regression on initial rating
summary(lm(count14 ~ price14 + avgstar14 + isPA + isNC + isAZ + isNV + isWI + isQC, data = initialbef2014))
# The script never creates a count13 column, so these two models used to stop
# with "object 'count13' not found". They now run only if the column exists.
if ("count13" %in% names(initial)) {
  print(summary(lm(count14 ~ count13 + isPA + isNC + isAZ + isNV + isWI + isQC, data = initial)))
  print(summary(lm(count13 ~ count12 + isPA + isNC + isAZ + isNV + isWI + isQC, data = initial)))
} else {
  message("Skipping the count13 regressions: `initial` has no count13 column.")
}

# ---- Summary-statistics table (2013/2014 initial reviews) ----
sumstat <- data.frame(c(initial2014$howmany))
sumstat$avg_review <- initial2014$avgrev
# NOTE(review): avg_price is a free-standing scalar and is not added to sumstat.
avg_price <- mean(initial2014$price)
sumstat$avg_asian_review <- initial2014$user_avg_asian
sumstat$first_rating <- initial2014$stars
sumstat$visits_2014 <- initial2014$count14
sumstat$visits_2015 <- initial2014$count15
################################
# Output Tables: Stargazer
################################
# Save the analysis frames next to the input files.
write.csv(initial2014, file = "FINALasiandata.csv")
write.csv(initial, file = "FINALdata.csv")

# Probit: going back to the same type of place, on the rating gap and state
# dummies (2013/2014 initial reviews).
# TODO: should add 2015 quality control for each type of restaurant.
probitmodel <- glm(
  goback ~ diffstar + isPA + isNC + isAZ,
  family = binomial(link = "probit"),
  data = initial2014
)
summary(probitmodel)

# Probit: returning to any Asian restaurant, on the same regressors (all users).
probitmodel2 <- glm(
  returnasian ~ diffstar + isPA + isNC + isAZ,
  family = binomial(link = "probit"),
  data = initial
)
summary(probitmodel2)

# OLS: the rating gap of the initial review on price, the restaurant's average
# rating and state dummies.
badfirstreview <- lm(
  diffstar ~ price + avgstar + isPA + isNC + isAZ + isNV + isWI + isQC,
  data = initial
)
summary(badfirstreview)

# OLS: 2015 Asian visits on 2014 Asian visits, controlling for 2015 price and
# rating and for states.
habit <- lm(
  count15 ~ count14 + price15 + avgstar15 + isPA + isNC + isAZ,
  data = initial2014
)
summary(habit)

# OLS: 2015 Asian visits on the rating gap, with the same controls.
consumption <- lm(
  count15 ~ diffstar + price15 + avgstar15 + isPA + isNC + isAZ,
  data = initial2014
)
summary(consumption)

# Stargazer tables.
stargazer(sumstat)
stargazer(probitmodel, probitmodel2, title = "Two Probit Models", style = "qje")
stargazer(badfirstreview, title = "First Review is Trash")
stargazer(habit, title = "Habit Model")
stargazer(consumption, title = "Diff Star model")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment