stenius/Mexican_food.md

## Mexican_food.md

      
    Raw
  

              Mexican_food.md
            
          
Mexican_food.R {#mexican_food.r .title .toc-ignore}

18646 {#section .author}

2019-10-20 {#section-1 .date}


### Mexican food capital of the USA #####
### Zhen Fu (zhen.fu@tamu.edu)
### ====== load R libraries, set up directory ======
library(ggplot2)
library(magrittr)
library(plyr)
library(usmap)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## NOTE(review): hard-coded, machine-specific working directory. The relative
## path given to read.csv() further down only resolves after this call, so the
## script is not portable as written -- consider running it from the project
## root instead of calling setwd().
setwd("~/Others/TAMU_related/datathon/Taco")
## Show what is in the working directory (output printed for reference only).
list.files()

## [1] "figure_out_taco.html"    "just_tacos_burritos.csv"
## [3] "Mexican_food.R"          "Mexican_food.spin.R"    
## [5] "Mexican_food.spin.Rmd"   "Rplot.pdf"              
## [7] "tacos_burritos.csv"

## Read the taco/burrito data set from the working directory.
## `header = TRUE` is spelled out: `T` is an ordinary variable that can be
## reassigned, whereas `TRUE` is a reserved word.
df <- read.csv("./tacos_burritos.csv", header = TRUE)
#head(df)
#str(df)

### ====== cleaning the data: keep only the most frequent province/state values
## NOTE(review): this header used to say that provinces with fewer than 300
## locations are excluded, but no count threshold is applied (the table printed
## below still contains values with far fewer rows).
## What the code does: rank the `province` values by row count, take the 55
## most frequent, then drop the entries at ranks 41, 43 and 53-55.
## NOTE(review): the dropped positions are hard-coded and presumably point at
## non-state values in this particular file -- re-check them if the data changes.

state_ex <- data.frame(unclass(sort(table(df$province), decreasing = TRUE)))
states <- row.names(head(state_ex, n = 55))
states <- states[-c(41, 43, 53:55)]

## Subset to the retained provinces and drop the now-unused factor levels.
df_sub <- df[df$province %in% states, ] %>% droplevels()


######### 
#str(df_sub)
table(df_sub$province)

## 
##           AL           AR           AZ           CA           CO 
##          576          427         2355        23664         1485 
##           CT           DC           DE           FL           GA 
##          593          328          157         2977         1816 
##           HI           IA           ID           IL           IN 
##          139          492          438         2725          780 
##           KS           KY           LA La Habra Hts           MA 
##          520          417          613           74         1407 
##           MD           ME           MI           MN           MO 
##          798          114         1108          691          680 
##           MS           MT           NC           NE           NH 
##          327          147         1396          271          348 
##           NJ           NM           NV           NY          Nyc 
##          913          769          994         2290          508 
##           OH           OK           OR           PA           RI 
##         1024          845         1229          970          239 
##           SC           SD           TN           TX           UT 
##          515          118         1005         6099          547 
##           VA           WA           WI           WV           WY 
##         1226         1008          746          145          144

## 1.1 which state has the most Mexican food items on average
## Build one row per `id` holding the number of df_sub rows with that id (N),
## then attach latitude, longitude, postal code and province taken from the
## first df_sub row carrying the same id.
by_rest <- local({
  per_id <- ddply(df_sub, "id", summarise, N = length(id))

  # Position of the first df_sub row for each id; reused for every lookup.
  first_row <- match(per_id$id, df_sub$id)

  per_id$lat <- df_sub$latitude[first_row]
  per_id$lon <- df_sub$longitude[first_row]
  per_id$zip <- df_sub$postalCode[first_row]
  per_id$state <- df_sub$province[first_row]
  per_id
})

## plot this data
## First discard rows with any missing value, then keep only points whose
## latitude is above 22 and whose longitude lies strictly between -130 and -50.

by_rest_clean <- by_rest[complete.cases(by_rest), ]
by_rest_clean <- by_rest_clean[
  with(by_rest_clean, lat > 22 & lon > -130 & lon < -50),
]

                                 
## State boundary polygons from the "state" map, as a data frame.
all_states <- map_data("state")

## Draw the states (one fill colour per region, legend hidden) and overlay one
## semi-transparent point per row of by_rest_clean, sized by N.
ggplot(data = all_states) +
  ## `group = group` keeps each polygon piece separate; without it, states made
  ## of several pieces are drawn as a single path with stray connecting edges.
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = region),
    color = "white"
  ) +
  ## `guides(fill = FALSE)` is deprecated in current ggplot2; use "none".
  guides(fill = "none") +
  geom_point(
    data = by_rest_clean,
    aes(x = lon, y = lat, size = N),
    alpha = 1 / 8
  ) +
  ## Rows with N outside 1..30 are dropped by these limits; ggplot reports
  ## them in a "Removed ... rows" warning.
  scale_size("# of items on\nthe menu", limits = c(1, 30)) +
  ggtitle("Mexican food capital of the USA") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    plot.title = element_text(size = 26, face = "bold"),
    axis.ticks.y = element_blank(),
    legend.position = "bottom"
  )

## Warning: Removed 28 rows containing missing values (geom_point).

{width="672"}
## 2. Where is the taco capital of the USA: Texas, CA, FL, or somewhere else?
## Take the first five row names of state_ex and keep only the by_rest_clean
## rows whose state is one of them.
top5 <- head(row.names(state_ex), n = 5)
top5_by_rest <- by_rest_clean[is.element(by_rest_clean$state, top5), ]

## Boxplot of N (items per restaurant) for each of the top-5 states.
## The visible range is restricted to 0-15 with coord_cartesian(), which only
## zooms the axes. The earlier ylim(0, 15) set scale limits instead, so rows
## with N above 15 were discarded before the boxplot statistics were computed
## (ggplot warned: "Removed 273 rows containing non-finite values
## (stat_boxplot)"), biasing the boxes and whiskers.
ggplot(top5_by_rest, aes(x = state, y = N)) +
  geom_boxplot() +
  ggtitle("Mexican food diversity of the top 5 states") +
  theme_classic(base_size = 14) +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  ylab("# of Mexican food items on the menu") +
  coord_cartesian(ylim = c(0, 15))

{width="672"}
######
## Per-state descriptive statistics (row count, mean and sd of N), printed
## ahead of the one-way ANOVA and the Tukey pairwise comparisons of means.
top5_by_rest %>%
  group_by(state) %>%
  summarise(
    count = n(),
    mean = mean(N, na.rm = TRUE),
    sd = sd(N, na.rm = TRUE)
  )

## # A tibble: 5 x 4
##   state count  mean    sd
##   <fct> <int> <dbl> <dbl>
## 1 AZ      405  3.75  4.77
## 2 CA     3062  4.82  5.75
## 3 FL      566  3.89  4.97
## 4 IL      574  3.30  4.17
## 5 TX     1219  3.63  4.47

## Fit a one-way ANOVA of N on state using the top-5 subset.
res.aov <- aov(N ~ state, data = top5_by_rest)
## there is effect of state on Mexican food diversity among five states, meaning
## location makes a difference
## NOTE(review): the ANOVA table itself (e.g. summary(res.aov)) is never printed
## in this script, so the statement above is not demonstrated directly here --
## confirm it against the fitted model.

## Tukey honest significant differences for every pairwise state comparison.
TukeyHSD(res.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = N ~ state, data = top5_by_rest)
## 
## $state
##             diff        lwr        upr     p adj
## CA-AZ  1.0786188  0.3254078  1.8318298 0.0008958
## FL-AZ  0.1430136 -0.7841230  1.0701502 0.9934483
## IL-AZ -0.4477696 -1.3722075  0.4766683 0.6776973
## TX-AZ -0.1107323 -0.9277538  0.7062892 0.9960219
## FL-CA -0.9356053 -1.5873727 -0.2838378 0.0008621
## IL-CA -1.5263884 -2.1743112 -0.8784657 0.0000000
## TX-CA -1.1893512 -1.6717853 -0.7069171 0.0000000
## IL-FL -0.5907832 -1.4346193  0.2530529 0.3117896
## TX-FL -0.2537459 -0.9783135  0.4708216 0.8748528
## TX-IL  0.3370373 -0.3840738  1.0581484 0.7064697

## California has the highest variety of Mexican food