Last active
October 20, 2020 11:09
-
-
Save kaskarn/7e2447f00499f8ecc3e6231c8dd36b13 to your computer and use it in GitHub Desktop.
Cluster baked goods (page proposal)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(igraph) | |
library(ggplot2) | |
library(RColorBrewer) | |
library(plotly) | |
#get usda data from https://fdc.nal.usda.gov/fdc-datasets/FoodData_Central_csv_2020-04-29.zip | |
#change those paths | |
mydir <- "C:/Users/balda/Dropbox/usda_cluster/" | |
datadir <- "C:/Users/balda/Dropbox/usda_cluster/data/" | |
#read in usda files | |
nutrient <- fread(paste0(datadir, "nutrient.csv")) | |
food_nutrient <- fread(paste0(datadir, "food_nutrient.csv"), | |
select = c("id", "fdc_id", "nutrient_id", "amount", "derivation_id")) | |
food <- fread(paste0(datadir, "food.csv")) | |
foodcats <- fread(paste0(datadir, "food_category.csv")) | |
#sort by id for merges | |
setkey(nutrient, id) | |
setkey(food, fdc_id) | |
setkey(foodcats, id) | |
nutr_names <- nutrient[food_nutrient[, .(nutrient_id)], 2, with = F] | |
#add food descriptions to food_nutrient | |
food_nutrient[, c("desc", "cat_id") := food[food_nutrient[, .(fdc_id)], .(description, food_category_id) ]] | |
food_nutrient[, nutr_name := nutr_names] | |
#add categories to food_nutrient | |
food_nutrient[, cat := foodcats[food_nutrient[, .(cat_id)], description ]] | |
#function creating nodes and edges from food category | |
create_foodgraph <- function(food, food_nutrient, mycat, min_nut = 3){ | |
food_sel <- food_nutrient[cat_id %in% mycat] | |
food_sel[, nutrient_id := paste0("id_", nutrient_id)] | |
food_sel[, amount_std := (amount - mean(amount)) / sd(amount), .(nutrient_id, cat_id) ] | |
food_sel[, nutr_n := .N, .(nutrient_id, cat_id) ] | |
all_nuts <- unique(food_sel[, nutrient_id]) | |
food_sel_wide <- dcast(food_sel, fdc_id + desc + cat + cat_id ~ nutrient_id, value.var = "amount_std") | |
food_sel_wide[, nnut := length(.SD) - Reduce('+', lapply(.SD, is.na)), .SDcols = all_nuts] | |
food_sel_wide <- food_sel_wide[nnut > min_nut] | |
food_nmat <- food_sel_wide[, lapply(.SD, function(i) !is.na(i)), .SDcols = all_nuts] %>% as.matrix | |
nnut <- dim(food_nmat)[2] | |
# We make sure to compute covariance among nonzero-pairs, since most nutrients are 0's | |
# For most items | |
# Math: not(A) NAND not(B) = A OR B | |
food_pairs_nut_n <- nnut - (1- food_nmat) %*% t(1 - food_nmat) | |
food_nutrient_mat <- food_sel_wide[, (all_nuts), with = F] %>% as.matrix | |
food_nutrient_mat[is.na(food_nutrient_mat)] <- 0 | |
food_xxt <- food_nutrient_mat %*% t(food_nutrient_mat) | |
food_mu_xy <- food_xxt / food_pairs_nut_n | |
food_mu_x <- apply(food_nutrient_mat, 1, sum, na.rm = T) / diag(food_pairs_nut_n) | |
#Cov(X,Y) = Var(XY) - E(X)E(Y) | |
food_cov <- food_mu_xy - food_mu_x %*% t(food_mu_x) | |
food_var <- diag(food_cov) | |
food_cor <- t(food_cov / sqrt(food_var)) / sqrt(food_var) | |
colnames(food_cor) <- food_sel_wide$fdc_id | |
rownames(food_cor) <- food_sel_wide$fdc_id | |
ndist <- as.data.table(food_cor) %>% | |
.[, from := colnames(food_cor) ] %>% | |
melt(id.vars = "from", variable.name = "to") %>% | |
.[, keep := 1:.N > .GRP, from] %>% | |
.[(keep)] | |
ndist[, from := as.integer(from)] | |
ndist[, to := as.character(to) %>% as.integer] | |
ndist[, unique(c(from,to))] | |
setkey(food, fdc_id) | |
id_dt <- food[.(food_sel_wide[, fdc_id]), .(id = fdc_id, name = description, cat_id = food_category_id)] | |
edges <- ndist[, .(from, to, weight = value)] | |
list(edges = unique(edges), nodes = unique(id_dt)) | |
} | |
#function making all plotting items from edges and nodes | |
make_graph_items <- function(graphobj, edgecor = 0.5){ | |
net <- graph_from_data_frame(d = graphobj$edges[weight > edgecor], vertices = graphobj$nodes, directed = F) | |
deg <- degree(net) | |
g <- delete_vertices(net, deg < 1) | |
V(g)$size <- degree(g) %>% sqrt | |
V(g)$label <- NA | |
#make cluster and assign colors based on membership | |
clp <- cluster_louvain(g, weights = E(g)$weight^2) | |
V(g)$comm <- clp$membership | |
V(g)$color <- sample(rainbow(length(clp$membership)))[clp$membership] | |
#create data.table and write each community to a file, sorted by size | |
g_dt <- get.vertex.attribute(g) %>% setDT | |
g_dt[, group := 1 + NROW(g_dt[,.N,comm]) - rank(g_dt[,.N,comm]$N, ties.method = "random")[.GRP], comm] | |
g_dt[, n := .N, group] | |
g_dt[group %in% 1:5, fwrite(.SD, paste0(mydir, "usda_grp_", .BY[[1]], ".txt"), sep = "\t"), group] | |
V(g)$group <- g_dt$group | |
#set plotting parameters | |
V(g)$size <- 3 | |
lw <- layout_with_fr(g, weights = E(g)$weight^2) | |
#look at the largest five communities and plot | |
interesting_bakes <- c(1:5) | |
#highlight only largest 5 communities | |
g2 <- delete_vertices(g, V(g)[! group %in% c(1, 2, 3, 4, 5)]) | |
V(g2)$color <- brewer.pal(length(V(g2)$color %>% unique), "Dark2")[V(g2)$group] | |
lw2 <- layout_with_fr(g2, weights = E(g2)$weight^2) | |
g_dt2 <- g_dt[group %in% c(1, 2, 3, 4, 5)] | |
g_dt2[, c("x", "y") := as.data.table(lw2)] | |
g_nodup <- g_dt2[, .SD[1], name] | |
a <- attributes(E(g2))$vnames %>% as.data.table %>% setnames("name") | |
a[, c("from", "to") := tstrsplit(name,"\\|")] | |
setkey(g_nodup, name) | |
a[, c("x", "xend", "y", "yend", "from", "to") := | |
.(g_nodup[a$from, x], g_nodup[a$to, x], g_nodup[a$from, y], g_nodup[a$to, y], g_nodup[a$from, name], g_nodup[a$to, name])] | |
list(full_graph = g, sub_graph = g2, sub_edges = a, sub_nodes = g_nodup, full_layout = lw, sub_layout = lw2) | |
} | |
#function to highlight out one or two groups from graph plot | |
graph_printone <- function(x, layout, num){ | |
allv <- get.vertex.attribute(graph = x) %>% setDT | |
lab <- allv[group %in% num[1], name] | |
col <- V(x)$color | |
col[!V(x)$group %in% num] <- "grey" | |
plot(x, layout=layout, vertex.frame.color = "grey40", vertex.color = col) | |
text(1.1, 1 - 1:length(lab)*0.04, lab, adj = 0, col = allv[group == num[1], color]) | |
if(length(num) > 1){ | |
lab2 <- allv[group %in% num[2], name] | |
text(-2, 1 - 1:length(lab2)*0.04, lab2, adj = 0, col = allv[group == num[2], color]) | |
} | |
} | |
#function to convert graph objects from make_graph_items into a ggplot object | |
subgraph_viz <- function(graphing){ | |
gg_edges <- ggplot(graphing$sub_nodes, aes(x,y,name = name, cat = group)) + | |
theme_bw() + | |
# theme(panel.grid = element_blank(), panel.border = element_blank(), axis.line = element_blank())+ | |
theme(axis.line=element_blank(),axis.text.x=element_blank(), | |
axis.text.y=element_blank(),axis.ticks=element_blank(), | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(),legend.position="none", | |
panel.background=element_blank(),panel.border=element_blank(),panel.grid.major=element_blank(), | |
panel.grid.minor=element_blank(),plot.background=element_blank()) + | |
geom_segment(data=graphing$sub_edges, aes(x=x, y=y, xend=xend, yend=yend), inherit.aes = FALSE, color = "grey80") | |
gg_edges + geom_point(aes(fill=as.factor(group)), size = 4, pch = 21, color = "black") | |
} | |
#look at all categories | |
#fast food: 21 (works) | |
#baking: 18 (works) | |
food_nutrient[!is.na(cat_id), .(length(unique(fdc_id)), cat[1]), cat_id] | |
#baking! | |
bakeobj <- create_foodgraph(food, food_nutrient, 18, min_nut = 3) | |
baking <- make_graph_items(bakeobj, edgecor = 0.7) | |
#plot full graph | |
plot(baking$full_graph, layout = baking$full_layout) | |
#plot 5 largest communities | |
plot(baking$sub_graph, layout = baking$sub_layout) | |
#highlight groups 1 and 2 | |
graph_printone(baking$sub_graph, baking$sub_layout, c(4,3)) | |
#create ggplot object for interactive viz with ggplotly | |
gg <- subgraph_viz(baking) | |
ggplotly(gg, tooltip = "name") | |
#same with fast food | |
ffobj <- create_foodgraph(food, food_nutrient, 21) | |
fastfood <- make_graph_items(ffobj, edgecor = 0.4) | |
plot(fastfood$full_graph, layout = fastfood$full_layout) | |
plot(fastfood$sub_graph, layout = fastfood$sub_layout) | |
graph_printone(fastfood$sub_graph, fastfood$sub_layout, c(1,5)) | |
gg <- subgraph_viz(fastfood) | |
ggplotly(gg, tooltip = c("name", "cat")) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment