diegovalle/diet.r

## diet.r
########################################################
#####       Author: Diego Valle Jones
#####       Website: www.diegovalle.net
#####       Date Created: Mon Mar 01 18:51:27 2010
########################################################
#1. For what foods are Americans and Mexicans outliers
#2. Partition the data around medoids to classify the
#countries of the world according to what they eat

library(ggplot2)
library(maps)
library(maptools)
library(RColorBrewer)
library(Cairo)
library(cluster)
library(reshape)
require(RCurl)
require(stringr)
gpclibPermit()

########################################################
#Outliers
########################################################
#uncomment the following line if you have a local copy of the data:
#diet<-read.csv("diet-forcsv-partial.csv")

#read the Google Spreadsheet
myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0AjjLwVYbDGx7dGR6cWZwLV95cERxVU5ZbkpFcThzZ2c&output=csv")
diet <- read.csv(textConnection(myCsv))

#Plot the distributions of all the different foods
plotDensities <- function(df, name, color) {
  cntrynum <- which(df$Countries == name)
  #Only food columns
  clmns <- 4:52
  ##Rename the columns to get nice plot labels
  names(df) <- str_replace(names(df), "\\.\\..*$", "")
  names(df) <- str_replace(names(df), "\\.", " ")

  mdiet <- melt(df[, c(1,clmns)], id = "Countries")
  #mdiet$variable <- sub(".day...2003", "", mdiet$variable)

  us <- data.frame(value = unlist(df[cntrynum, clmns]),
                   variable = unique(mdiet$variable))
  ggplot(mdiet, aes(value)) +
    geom_density() +
    geom_vline(data =us, aes(xintercept = value), color = color) +
    facet_wrap(~variable, scale = "free") +
    xlab("") + opts(title = name)
}


plotDensities(diet, "United States of America", "blue")
#dev.print(png, "usa.png", width = 960, height = 600)
plotDensities(diet, "Mexico", "darkgreen")
#dev.print(png, "mx.png", width = 960, height = 600)

#Now get the outliers
diet.st <- diet
clmns <- 2:(ncol(diet.st)-1)
diet.st[ ,clmns] <- sapply(diet.st[ ,clmns],
                           function(x) (x - mean(x, na.action=na.omit)) / sd(x))

diet.mx <- subset(diet.st, Countries=="Mexico" |
  Countries=="United States of America")
diet.mx <- data.frame(t(diet.mx[2:(ncol(diet.mx)-1)]))
colnames(diet.mx) <- c("Mexico","US")
diet.mx$dif <- diet.mx$US - diet.mx$Mexico
outliers <- subset(diet.mx, US >= 2 |
  Mexico >= 2 |
  dif >= 2 |
  US <= -2 |
  Mexico <= -2 |
  dif <= -2)


########################################################
#CLuster Analysis
########################################################
cleanMap <- function(df) {
  world.map <- map_data("world")
  world.map <- subset(world.map, region != "Antarctica")
  world.map <- world.map[-grep("Sea|Lake", world.map$region),]
  world.map <- world.map[-grep("Sea|Lake", world.map$subregion),]
  df$country <- as.character(df$country)
  df[which(df$country == "United States of America"),]$country <- "USA"
  df[which(df$country == "United Kingdom"),]$country <- "UK"
  df[which(df$country == "Russian Federation"),]$country <- "USSR"
  p.map <- merge(world.map, df[c("country", "cluster")],
                 by.x = "region",
                 by.y ="country",
                 all.x = TRUE)
  p.map <- p.map[order(p.map$order), ]
  p.map
}

onlyFood <- function(vec){
  #col 52 is the last having to do with food
  vec <- vec[ , 4:52]
  vec
}

plotMap <- function(df, nclust){
  df.food <- onlyFood(df)
  df.food$cluster <- pam(df.food, nclust)$cluster
  df.food$country <- df$Countries
  p.map <- cleanMap(df.food)
  clr.map <- brewer.pal(nclust, "Set2")
  ggplot(p.map, aes(long, lat)) +
    geom_polygon(aes(group = group, fill = factor(cluster)),
                 color = "#C4C4C4", size = .2) +
                   scale_y_continuous(breaks = NA) +
                   scale_x_continuous(breaks = NA) + xlab("") + ylab("") +
                   scale_fill_manual("Cluster", values = c(clr.map, "#FEFEFE")) +
                   opts(panel.background = theme_rect(fill = "#e0f2ff",
                                                      colour = "white")) +
                                                        opts(title = "Cluster analysis of what the world eats")
}

nclust <- 6
plotMap(diet.st, nclust)
#dev.print(png, "worlddiet.png", width = 900, height = 550)

cl <- pam(onlyFood(diet.st), nclust)
clusplot(pam(onlyFood(diet.st), nclust))
med <- data.frame(t(cl$medoids))


#write.csv(med, "centers.csv")

#Cairo(file="worlddiet.png", width = 900, height = 550)
#
#dev.off()
	########################################################
	##### Author: Diego Valle Jones
	##### Website: www.diegovalle.net
	##### Date Created: Mon Mar 01 18:51:27 2010
	########################################################
	#1. For what foods are Americans and Mexicans outliers
	#2. Partition the data around medoids to classify the
	#countries of the world according to what they eat

	library(ggplot2)
	library(maps)
	library(maptools)
	library(RColorBrewer)
	library(Cairo)
	library(cluster)
	library(reshape)
	require(RCurl)
	require(stringr)
	gpclibPermit()

	########################################################
	#Outliers
	########################################################
	#uncomment the following line if you have a local copy of the data:
	#diet<-read.csv("diet-forcsv-partial.csv")

	#read the Google Spreadsheet
	myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0AjjLwVYbDGx7dGR6cWZwLV95cERxVU5ZbkpFcThzZ2c&output=csv")
	diet <- read.csv(textConnection(myCsv))

	#Plot the distributions of all the different foods
	plotDensities <- function(df, name, color) {
	cntrynum <- which(df$Countries == name)
	#Only food columns
	clmns <- 4:52
	##Rename the columns to get nice plot labels
	names(df) <- str_replace(names(df), "\\.\\..*$", "")
	names(df) <- str_replace(names(df), "\\.", " ")

	mdiet <- melt(df[, c(1,clmns)], id = "Countries")
	#mdiet$variable <- sub(".day...2003", "", mdiet$variable)

	us <- data.frame(value = unlist(df[cntrynum, clmns]),
	variable = unique(mdiet$variable))
	ggplot(mdiet, aes(value)) +
	geom_density() +
	geom_vline(data =us, aes(xintercept = value), color = color) +
	facet_wrap(~variable, scale = "free") +
	xlab("") + opts(title = name)
	}


	plotDensities(diet, "United States of America", "blue")
	#dev.print(png, "usa.png", width = 960, height = 600)
	plotDensities(diet, "Mexico", "darkgreen")
	#dev.print(png, "mx.png", width = 960, height = 600)

	#Now get the outliers
	diet.st <- diet
	clmns <- 2:(ncol(diet.st)-1)
	diet.st[ ,clmns] <- sapply(diet.st[ ,clmns],
	function(x) (x - mean(x, na.action=na.omit)) / sd(x))

	diet.mx <- subset(diet.st, Countries=="Mexico" \|
	Countries=="United States of America")
	diet.mx <- data.frame(t(diet.mx[2:(ncol(diet.mx)-1)]))
	colnames(diet.mx) <- c("Mexico","US")
	diet.mx$dif <- diet.mx$US - diet.mx$Mexico
	outliers <- subset(diet.mx, US >= 2 \|
	Mexico >= 2 \|
	dif >= 2 \|
	US <= -2 \|
	Mexico <= -2 \|
	dif <= -2)


	########################################################
	#CLuster Analysis
	########################################################
	cleanMap <- function(df) {
	world.map <- map_data("world")
	world.map <- subset(world.map, region != "Antarctica")
	world.map <- world.map[-grep("Sea\|Lake", world.map$region),]
	world.map <- world.map[-grep("Sea\|Lake", world.map$subregion),]
	df$country <- as.character(df$country)
	df[which(df$country == "United States of America"),]$country <- "USA"
	df[which(df$country == "United Kingdom"),]$country <- "UK"
	df[which(df$country == "Russian Federation"),]$country <- "USSR"
	p.map <- merge(world.map, df[c("country", "cluster")],
	by.x = "region",
	by.y ="country",
	all.x = TRUE)
	p.map <- p.map[order(p.map$order), ]
	p.map
	}

	onlyFood <- function(vec){
	#col 52 is the last having to do with food
	vec <- vec[ , 4:52]
	vec
	}

	plotMap <- function(df, nclust){
	df.food <- onlyFood(df)
	df.food$cluster <- pam(df.food, nclust)$cluster
	df.food$country <- df$Countries
	p.map <- cleanMap(df.food)
	clr.map <- brewer.pal(nclust, "Set2")
	ggplot(p.map, aes(long, lat)) +
	geom_polygon(aes(group = group, fill = factor(cluster)),
	color = "#C4C4C4", size = .2) +
	scale_y_continuous(breaks = NA) +
	scale_x_continuous(breaks = NA) + xlab("") + ylab("") +
	scale_fill_manual("Cluster", values = c(clr.map, "#FEFEFE")) +
	opts(panel.background = theme_rect(fill = "#e0f2ff",
	colour = "white")) +
	opts(title = "Cluster analysis of what the world eats")
	}

	nclust <- 6
	plotMap(diet.st, nclust)
	#dev.print(png, "worlddiet.png", width = 900, height = 550)

	cl <- pam(onlyFood(diet.st), nclust)
	clusplot(pam(onlyFood(diet.st), nclust))
	med <- data.frame(t(cl$medoids))


	#write.csv(med, "centers.csv")

	#Cairo(file="worlddiet.png", width = 900, height = 550)
	#
	#dev.off()