Skip to content

Instantly share code, notes, and snippets.

@kaskarn
Last active October 20, 2020 11:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaskarn/7e2447f00499f8ecc3e6231c8dd36b13 to your computer and use it in GitHub Desktop.
Save kaskarn/7e2447f00499f8ecc3e6231c8dd36b13 to your computer and use it in GitHub Desktop.
Cluster baked goods (page proposal)
library(data.table)
library(igraph)
library(ggplot2)
library(RColorBrewer)
library(plotly)
#get usda data from https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_csv_2020-04-29.zip
#change those paths
mydir <- "C:/Users/balda/Dropbox/usda_cluster/"
datadir <- "C:/Users/balda/Dropbox/usda_cluster/data/"
#read in usda files
nutrient <- fread(paste0(datadir, "nutrient.csv"))
food_nutrient <- fread(paste0(datadir, "food_nutrient.csv"),
select = c("id", "fdc_id", "nutrient_id", "amount", "derivation_id"))
food <- fread(paste0(datadir, "food.csv"))
foodcats <- fread(paste0(datadir, "food_category.csv"))
#sort by id for merges
setkey(nutrient, id)
setkey(food, fdc_id)
setkey(foodcats, id)
nutr_names <- nutrient[food_nutrient[, .(nutrient_id)], 2, with = F]
#add food descriptions to food_nutrient
food_nutrient[, c("desc", "cat_id") := food[food_nutrient[, .(fdc_id)], .(description, food_category_id) ]]
food_nutrient[, nutr_name := nutr_names]
#add categories to food_nutrient
food_nutrient[, cat := foodcats[food_nutrient[, .(cat_id)], description ]]
#function creating nodes and edges from food category
create_foodgraph <- function(food, food_nutrient, mycat, min_nut = 3){
food_sel <- food_nutrient[cat_id %in% mycat]
food_sel[, nutrient_id := paste0("id_", nutrient_id)]
food_sel[, amount_std := (amount - mean(amount)) / sd(amount), .(nutrient_id, cat_id) ]
food_sel[, nutr_n := .N, .(nutrient_id, cat_id) ]
all_nuts <- unique(food_sel[, nutrient_id])
food_sel_wide <- dcast(food_sel, fdc_id + desc + cat + cat_id ~ nutrient_id, value.var = "amount_std")
food_sel_wide[, nnut := length(.SD) - Reduce('+', lapply(.SD, is.na)), .SDcols = all_nuts]
food_sel_wide <- food_sel_wide[nnut > min_nut]
food_nmat <- food_sel_wide[, lapply(.SD, function(i) !is.na(i)), .SDcols = all_nuts] %>% as.matrix
nnut <- dim(food_nmat)[2]
# We make sure to compute covariance among nonzero-pairs, since most nutrients are 0's
# For most items
# Math: not(A) NAND not(B) = A OR B
food_pairs_nut_n <- nnut - (1- food_nmat) %*% t(1 - food_nmat)
food_nutrient_mat <- food_sel_wide[, (all_nuts), with = F] %>% as.matrix
food_nutrient_mat[is.na(food_nutrient_mat)] <- 0
food_xxt <- food_nutrient_mat %*% t(food_nutrient_mat)
food_mu_xy <- food_xxt / food_pairs_nut_n
food_mu_x <- apply(food_nutrient_mat, 1, sum, na.rm = T) / diag(food_pairs_nut_n)
#Cov(X,Y) = Var(XY) - E(X)E(Y)
food_cov <- food_mu_xy - food_mu_x %*% t(food_mu_x)
food_var <- diag(food_cov)
food_cor <- t(food_cov / sqrt(food_var)) / sqrt(food_var)
colnames(food_cor) <- food_sel_wide$fdc_id
rownames(food_cor) <- food_sel_wide$fdc_id
ndist <- as.data.table(food_cor) %>%
.[, from := colnames(food_cor) ] %>%
melt(id.vars = "from", variable.name = "to") %>%
.[, keep := 1:.N > .GRP, from] %>%
.[(keep)]
ndist[, from := as.integer(from)]
ndist[, to := as.character(to) %>% as.integer]
ndist[, unique(c(from,to))]
setkey(food, fdc_id)
id_dt <- food[.(food_sel_wide[, fdc_id]), .(id = fdc_id, name = description, cat_id = food_category_id)]
edges <- ndist[, .(from, to, weight = value)]
list(edges = unique(edges), nodes = unique(id_dt))
}
#function making all plotting items from edges and nodes
make_graph_items <- function(graphobj, edgecor = 0.5){
net <- graph_from_data_frame(d = graphobj$edges[weight > edgecor], vertices = graphobj$nodes, directed = F)
deg <- degree(net)
g <- delete_vertices(net, deg < 1)
V(g)$size <- degree(g) %>% sqrt
V(g)$label <- NA
#make cluster and assign colors based on membership
clp <- cluster_louvain(g, weights = E(g)$weight^2)
V(g)$comm <- clp$membership
V(g)$color <- sample(rainbow(length(clp$membership)))[clp$membership]
#create data.table and write each community to a file, sorted by size
g_dt <- get.vertex.attribute(g) %>% setDT
g_dt[, group := 1 + NROW(g_dt[,.N,comm]) - rank(g_dt[,.N,comm]$N, ties.method = "random")[.GRP], comm]
g_dt[, n := .N, group]
g_dt[group %in% 1:5, fwrite(.SD, paste0(mydir, "usda_grp_", .BY[[1]], ".txt"), sep = "\t"), group]
V(g)$group <- g_dt$group
#set plotting parameters
V(g)$size <- 3
lw <- layout_with_fr(g, weights = E(g)$weight^2)
#look at the largest five communities and plot
interesting_bakes <- c(1:5)
#highlight only largest 5 communities
g2 <- delete_vertices(g, V(g)[! group %in% c(1, 2, 3, 4, 5)])
V(g2)$color <- brewer.pal(length(V(g2)$color %>% unique), "Dark2")[V(g2)$group]
lw2 <- layout_with_fr(g2, weights = E(g2)$weight^2)
g_dt2 <- g_dt[group %in% c(1, 2, 3, 4, 5)]
g_dt2[, c("x", "y") := as.data.table(lw2)]
g_nodup <- g_dt2[, .SD[1], name]
a <- attributes(E(g2))$vnames %>% as.data.table %>% setnames("name")
a[, c("from", "to") := tstrsplit(name,"\\|")]
setkey(g_nodup, name)
a[, c("x", "xend", "y", "yend", "from", "to") :=
.(g_nodup[a$from, x], g_nodup[a$to, x], g_nodup[a$from, y], g_nodup[a$to, y], g_nodup[a$from, name], g_nodup[a$to, name])]
list(full_graph = g, sub_graph = g2, sub_edges = a, sub_nodes = g_nodup, full_layout = lw, sub_layout = lw2)
}
#function to highlight out one or two groups from graph plot
graph_printone <- function(x, layout, num){
allv <- get.vertex.attribute(graph = x) %>% setDT
lab <- allv[group %in% num[1], name]
col <- V(x)$color
col[!V(x)$group %in% num] <- "grey"
plot(x, layout=layout, vertex.frame.color = "grey40", vertex.color = col)
text(1.1, 1 - 1:length(lab)*0.04, lab, adj = 0, col = allv[group == num[1], color])
if(length(num) > 1){
lab2 <- allv[group %in% num[2], name]
text(-2, 1 - 1:length(lab2)*0.04, lab2, adj = 0, col = allv[group == num[2], color])
}
}
#function to convert graph objects from make_graph_items into a ggplot object
subgraph_viz <- function(graphing){
gg_edges <- ggplot(graphing$sub_nodes, aes(x,y,name = name, cat = group)) +
theme_bw() +
# theme(panel.grid = element_blank(), panel.border = element_blank(), axis.line = element_blank())+
theme(axis.line=element_blank(),axis.text.x=element_blank(),
axis.text.y=element_blank(),axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),legend.position="none",
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),plot.background=element_blank()) +
geom_segment(data=graphing$sub_edges, aes(x=x, y=y, xend=xend, yend=yend), inherit.aes = FALSE, color = "grey80")
gg_edges + geom_point(aes(fill=as.factor(group)), size = 4, pch = 21, color = "black")
}
#look at all categories
#fast food: 21 (works)
#baking: 18 (works)
food_nutrient[!is.na(cat_id), .(length(unique(fdc_id)), cat[1]), cat_id]
#baking!
bakeobj <- create_foodgraph(food, food_nutrient, 18, min_nut = 3)
baking <- make_graph_items(bakeobj, edgecor = 0.7)
#plot full graph
plot(baking$full_graph, layout = baking$full_layout)
#plot 5 largest communities
plot(baking$sub_graph, layout = baking$sub_layout)
#highlight groups 1 and 2
graph_printone(baking$sub_graph, baking$sub_layout, c(4,3))
#create ggplot object for interactive viz with ggplotly
gg <- subgraph_viz(baking)
ggplotly(gg, tooltip = "name")
#same with fast food
ffobj <- create_foodgraph(food, food_nutrient, 21)
fastfood <- make_graph_items(ffobj, edgecor = 0.4)
plot(fastfood$full_graph, layout = fastfood$full_layout)
plot(fastfood$sub_graph, layout = fastfood$sub_layout)
graph_printone(fastfood$sub_graph, fastfood$sub_layout, c(1,5))
gg <- subgraph_viz(fastfood)
ggplotly(gg, tooltip = c("name", "cat"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment