### Mexican food capital of the USA #####
### Zhen Fu (zhen.fu@tamu.edu)
### ====== load R libraries, set up directory ======
library(ggplot2)
library(magrittr)
library(plyr)
library(usmap)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## NOTE(review): setwd() ties the script to one machine's directory layout. It
## is kept here because other steps may rely on the working directory --
## confirm before replacing it with a project-relative path.
setwd("~/Others/TAMU_related/datathon/Taco")
list.files()
## [1] "figure_out_taco.html" "just_tacos_burritos.csv"
## [3] "Mexican_food.R" "Mexican_food.spin.R"
## [5] "Mexican_food.spin.Rmd" "Rplot.pdf"
## [7] "tacos_burritos.csv"
## Read the raw data (one row per menu item -- TODO confirm against the CSV).
## TRUE is spelled out because T is an ordinary variable that can be reassigned.
## stringsAsFactors is pinned because its default changed to FALSE in R 4.0;
## the droplevels() call and the <fct> state column in the recorded output
## further down both rely on text columns being factors.
df <- read.csv("./tacos_burritos.csv", header = TRUE, stringsAsFactors = TRUE)
#head(df)
#str(df)
### ====== cleaning the data: keep only rows whose `province` is a US state code
## The raw `province` column also contains non-state values (for example
## "La Habra Hts" and "Nyc"), so it has to be filtered before any per-state work.
## Rank every province value by its number of rows, most frequent first.
## `state_ex` is reused further down to pick the top 5 states.
state_ex <- data.frame(unclass(sort(table(df$province), decreasing = TRUE)))
## Stay within the 55 most frequent values, as before, but keep only valid
## two-letter codes (50 states + DC). This replaces deleting hard-coded
## positions (41, 43, 53:55) from the ranking, which left non-state values in
## the data and would silently pick the wrong rows if the ranking ever shifted.
valid_codes <- c(state.abb, "DC")
states <- intersect(row.names(head(state_ex, n = 55)), valid_codes)
df_sub <- df[df$province %in% states, ] %>% droplevels()
#########
#str(df_sub)
table(df_sub$province)
## NOTE: the output below was recorded with the earlier positional filter; the
## "La Habra Hts" and "Nyc" entries it shows are no longer retained.
##
## AL AR AZ CA CO
## 576 427 2355 23664 1485
## CT DC DE FL GA
## 593 328 157 2977 1816
## HI IA ID IL IN
## 139 492 438 2725 780
## KS KY LA La Habra Hts MA
## 520 417 613 74 1407
## MD ME MI MN MO
## 798 114 1108 691 680
## MS MT NC NE NH
## 327 147 1396 271 348
## NJ NM NV NY Nyc
## 913 769 994 2290 508
## OH OK OR PA RI
## 1024 845 1229 970 239
## SC SD TN TX UT
## 515 118 1005 6099 547
## VA WA WI WV WY
## 1226 1008 746 145 144
## 1.1 which state has the most Mexican food items on average
## Collapse df_sub to one row per restaurant id, with N = number of rows for
## that id, then copy latitude, longitude, zip code and state from the first
## df_sub row that carries the same id.
by_rest <- ddply(df_sub, "id", summarise, N = length(id))
first_row <- match(by_rest$id, df_sub$id)
by_rest$lat <- df_sub$latitude[first_row]
by_rest$lon <- df_sub$longitude[first_row]
by_rest$zip <- df_sub$postalCode[first_row]
by_rest$state <- df_sub$province[first_row]
## plot this data
## Keep restaurants with no missing fields, then keep only points with
## lat > 22 and -130 < lon < -50.
## NOTE(review): the box presumably targets the contiguous US drawn by
## map_data("state") -- confirm.
by_rest_clean <- by_rest[complete.cases(by_rest), ]
by_rest_clean <- by_rest_clean[
  by_rest_clean$lat > 22 &
    by_rest_clean$lon < -50 &
    by_rest_clean$lon > -130,
]
all_states <- map_data("state")
## State polygons as the base layer, one translucent point per restaurant on
## top, with point size mapped to N.
ggplot(data = all_states) +
  geom_polygon(aes(x = long, y = lat, fill = region), color = "white") +
  ## "none" replaces the deprecated guides(fill = FALSE)
  guides(fill = "none") +
  geom_point(
    data = by_rest_clean,
    aes(x = lon, y = lat, size = N),
    alpha = 1 / 8
  ) +
  ## legend title fixed: the original read "# of items \nthe menu"
  scale_size("# of items on\nthe menu", limits = c(1, 30)) +
  ggtitle("Mexican food capital of the USA") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    plot.title = element_text(size = 26, face = "bold"),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )
## The warning below presumably comes from points whose N falls outside the
## size limits c(1, 30) -- TODO confirm.
## Warning: Removed 28 rows containing missing values (geom_point).
## [figure: rendered map plot, width = 672]
## 2.where is the taco capital in the USA. Texas, CA, FL, or something else?
## pull the data from the top 5 states
## `state_ex` is sorted by row count (descending), so its first five row names
## are the five most frequent province values.
top5 <- row.names(head(state_ex, n = 5))
top5_by_rest <- by_rest_clean[by_rest_clean$state %in% top5, ]
## Boxplot of items per restaurant (N) for each of the five states.
ggplot(top5_by_rest, aes(x = state, y = N)) +
  geom_boxplot() +
  ggtitle("Mexican food diversity of the top 5 states") +
  theme_classic(base_size = 14) +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  ylab("# of Mexican food items on the menu") +
  ## Zoom with coord_cartesian() instead of ylim(): ylim(0, 15) discards every
  ## restaurant with N > 15 *before* the boxplot statistics are computed, which
  ## biases the medians and quartiles. The original run reported
  ## "Removed 273 rows containing non-finite values (stat_boxplot)".
  coord_cartesian(ylim = c(0, 15))
## [figure: rendered boxplot, width = 672]
######
## one way ANOVA, Tukey multiple pairwise-comparisons for means
## Descriptive statistics first: number of restaurants, mean and standard
## deviation of N within each of the five states.
top5_by_rest %>%
  group_by(state) %>%
  summarise(
    count = n(),
    mean = mean(N, na.rm = TRUE),
    sd = sd(N, na.rm = TRUE)
  )
## # A tibble: 5 x 4
## state count mean sd
## <fct> <int> <dbl> <dbl>
## 1 AZ 405 3.75 4.77
## 2 CA 3062 4.82 5.75
## 3 FL 566 3.89 4.97
## 4 IL 574 3.30 4.17
## 5 TX 1219 3.63 4.47
## Fit a one-way ANOVA of items per restaurant (N) on state.
res.aov <- aov(N ~ state, data = top5_by_rest)
## Print the ANOVA table (F statistic and p-value) so that the conclusion
## stated below is backed by the omnibus test rather than only asserted.
summary(res.aov)
## there is an effect of state on Mexican food diversity among the five states,
## meaning location makes a difference
## Tukey HSD: all pairwise differences in mean N with family-wise 95% intervals.
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = N ~ state, data = top5_by_rest)
##
## $state
## diff lwr upr p adj
## CA-AZ 1.0786188 0.3254078 1.8318298 0.0008958
## FL-AZ 0.1430136 -0.7841230 1.0701502 0.9934483
## IL-AZ -0.4477696 -1.3722075 0.4766683 0.6776973
## TX-AZ -0.1107323 -0.9277538 0.7062892 0.9960219
## FL-CA -0.9356053 -1.5873727 -0.2838378 0.0008621
## IL-CA -1.5263884 -2.1743112 -0.8784657 0.0000000
## TX-CA -1.1893512 -1.6717853 -0.7069171 0.0000000
## IL-FL -0.5907832 -1.4346193 0.2530529 0.3117896
## TX-FL -0.2537459 -0.9783135 0.4708216 0.8748528
## TX-IL 0.3370373 -0.3840738 1.0581484 0.7064697
## California has the highest variety of Mexican food: every pairwise comparison
## involving CA above is significant, with CA having the larger mean