Skip to content

Instantly share code, notes, and snippets.

@diegovalle
Created March 9, 2010 17:18
  • Star 2 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save diegovalle/326839 to your computer and use it in GitHub Desktop.
Cluster analysis of what the world eats
########################################################
##### Author: Diego Valle Jones
##### Website: www.diegovalle.net
##### Date Created: Mon Mar 01 18:51:27 2010
########################################################
#1. For what foods are Americans and Mexicans outliers
#2. Partition the data around medoids to classify the
#countries of the world according to what they eat
library(ggplot2)
library(maps)
library(maptools)
library(RColorBrewer)
library(Cairo)
library(cluster)
library(reshape)
require(RCurl)
require(stringr)
gpclibPermit()
########################################################
#Outliers
########################################################
#uncomment the following line if you have a local copy of the data:
#diet<-read.csv("diet-forcsv-partial.csv")
#read the Google Spreadsheet
myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0AjjLwVYbDGx7dGR6cWZwLV95cERxVU5ZbkpFcThzZ2c&output=csv")
diet <- read.csv(textConnection(myCsv))
#Plot the distributions of all the different foods
plotDensities <- function(df, name, color) {
cntrynum <- which(df$Countries == name)
#Only food columns
clmns <- 4:52
##Rename the columns to get nice plot labels
names(df) <- str_replace(names(df), "\\.\\..*$", "")
names(df) <- str_replace(names(df), "\\.", " ")
mdiet <- melt(df[, c(1,clmns)], id = "Countries")
#mdiet$variable <- sub(".day...2003", "", mdiet$variable)
us <- data.frame(value = unlist(df[cntrynum, clmns]),
variable = unique(mdiet$variable))
ggplot(mdiet, aes(value)) +
geom_density() +
geom_vline(data =us, aes(xintercept = value), color = color) +
facet_wrap(~variable, scale = "free") +
xlab("") + opts(title = name)
}
plotDensities(diet, "United States of America", "blue")
#dev.print(png, "usa.png", width = 960, height = 600)
plotDensities(diet, "Mexico", "darkgreen")
#dev.print(png, "mx.png", width = 960, height = 600)
#Now get the outliers
diet.st <- diet
clmns <- 2:(ncol(diet.st)-1)
diet.st[ ,clmns] <- sapply(diet.st[ ,clmns],
function(x) (x - mean(x, na.action=na.omit)) / sd(x))
diet.mx <- subset(diet.st, Countries=="Mexico" |
Countries=="United States of America")
diet.mx <- data.frame(t(diet.mx[2:(ncol(diet.mx)-1)]))
colnames(diet.mx) <- c("Mexico","US")
diet.mx$dif <- diet.mx$US - diet.mx$Mexico
outliers <- subset(diet.mx, US >= 2 |
Mexico >= 2 |
dif >= 2 |
US <= -2 |
Mexico <= -2 |
dif <= -2)
########################################################
#CLuster Analysis
########################################################
cleanMap <- function(df) {
world.map <- map_data("world")
world.map <- subset(world.map, region != "Antarctica")
world.map <- world.map[-grep("Sea|Lake", world.map$region),]
world.map <- world.map[-grep("Sea|Lake", world.map$subregion),]
df$country <- as.character(df$country)
df[which(df$country == "United States of America"),]$country <- "USA"
df[which(df$country == "United Kingdom"),]$country <- "UK"
df[which(df$country == "Russian Federation"),]$country <- "USSR"
p.map <- merge(world.map, df[c("country", "cluster")],
by.x = "region",
by.y ="country",
all.x = TRUE)
p.map <- p.map[order(p.map$order), ]
p.map
}
onlyFood <- function(vec){
#col 52 is the last having to do with food
vec <- vec[ , 4:52]
vec
}
plotMap <- function(df, nclust){
df.food <- onlyFood(df)
df.food$cluster <- pam(df.food, nclust)$cluster
df.food$country <- df$Countries
p.map <- cleanMap(df.food)
clr.map <- brewer.pal(nclust, "Set2")
ggplot(p.map, aes(long, lat)) +
geom_polygon(aes(group = group, fill = factor(cluster)),
color = "#C4C4C4", size = .2) +
scale_y_continuous(breaks = NA) +
scale_x_continuous(breaks = NA) + xlab("") + ylab("") +
scale_fill_manual("Cluster", values = c(clr.map, "#FEFEFE")) +
opts(panel.background = theme_rect(fill = "#e0f2ff",
colour = "white")) +
opts(title = "Cluster analysis of what the world eats")
}
nclust <- 6
plotMap(diet.st, nclust)
#dev.print(png, "worlddiet.png", width = 900, height = 550)
cl <- pam(onlyFood(diet.st), nclust)
clusplot(pam(onlyFood(diet.st), nclust))
med <- data.frame(t(cl$medoids))
#write.csv(med, "centers.csv")
#Cairo(file="worlddiet.png", width = 900, height = 550)
#
#dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment