Created
February 15, 2014 23:05
-
-
Save steadyfish/9026485 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######## Packages ######## | |
checkAndDownload<-function(packageNames) { | |
for(packageName in packageNames) { | |
if(!isInstalled(packageName)) { | |
install.packages(packageName,repos="http://lib.stat.cmu.edu/R/CRAN") | |
} | |
library(packageName,character.only=TRUE,quietly=TRUE,verbose=FALSE) | |
} | |
} | |
isInstalled <- function(mypkg){ | |
is.element(mypkg, installed.packages()[,1]) | |
} | |
packages <- c("sp","ggplot2","plyr","rgeos","maptools","sqldf","RColorBrewer") | |
checkAndDownload(packages) | |
# 'sp' a package for spatial data | |
# 'plyr' required for fortify which converts 'sp' data to | |
#polygons data to be used with ggplot2 | |
# 'rgeos' required for maptools | |
# 'maptools' required for fortify - region | |
################ Data Input ############## | |
#get the nation-wide map data with state boundaries | |
con <- url("http://biogeo.ucdavis.edu/data/gadm2/R/IND_adm1.RData") | |
load(con) | |
close(con) | |
#get the data from indian government data website about powerSuply position for a particular month | |
data_url = "http://data.gov.in/access-point-download-count?url=http://data.gov.in/sites/default/files/powerSupplyNov04.csv&nid=34321" | |
nov04 <- read.table( | |
file=data_url, | |
header=TRUE, | |
sep=",", | |
as.is=TRUE) | |
power = nov04 | |
#get the state codes file | |
#(data dictionary with indian states mapped to a 2 letter state code) | |
stateCodes <- read.table( | |
file="D:/JustAnotherDataBlog/Data/IndiaStateCodes.csv", | |
header=TRUE, | |
sep=",", | |
as.is=TRUE) | |
########## Adhoc Data cleaning ######### | |
#renaming the important columns | |
colnames(power) | |
colnames(power)[1] = "State" | |
colnames(power)[2] = "Demand" | |
colnames(power)[3] = "Supplies" | |
colnames(power)[4] = "Net Surplus" | |
colnames(power) | |
#adjusting power supply position in these states based on geographic area | |
#area numbers obtained from wikipedia | |
WB_Area <- 88752 | |
Sikkim_Area <- 7096 | |
Jharkhand_Area <-79714 | |
WB_Sikkim_Prop <- WB_Area / (WB_Area + Sikkim_Area) | |
WB_Jharkhand_Prop <- WB_Area / (WB_Area + Jharkhand_Area) | |
#1.West Bengal and Sikkim to be split in some ratio:ratio of areas | |
WB1 <- power[power[, "State" ]== "W B + Sikkim" , -1] * WB_Sikkim_Prop | |
SK <- power[power[, "State" ]== "W B + Sikkim" , -1] - WB1 | |
#2.DVC numbers to be divided between Jharkhand and West Bengal: by ratio of areas | |
WB2 <- power[power[, "State" ]== "DVC" , -1] * WB_Jharkhand_Prop | |
power[power[, "State" ]== "Jharkhand" , -1] <- power[power[, "State" ]== "Jharkhand" , -1] + | |
power[power[, "State" ]== "DVC" , -1] - WB2 | |
WB = WB1 + WB2 | |
#3. Tripura: not present in some of the datasets | |
#4. Andaman and Nicobar, Lakshdweep not part of the analysis | |
power = rbind(power,cbind(State="West Bengal",WB),cbind(State="Sikkim",SK)) | |
######### Preparing data for visualisation ########### | |
#merge power data with state codes file | |
power_stateCodes <- sqldf("select a.*,b.* from stateCodes as a inner join power as b on a.State = b.State") | |
#cross check | |
sum(power_stateCodes$Demand) == power[power[, "State" ]== "All India" , 2] | |
power_stateCodes$Net_check = power_stateCodes$Supplies - power_stateCodes$Demand | |
sum(power_stateCodes$Net_check - power_stateCodes$Net_Surplus) | |
as.data.frame(gadm) | |
gadm@data | |
states <- as.data.frame(gadm@data$NAME_1) | |
colnames(states) ="State_gadm" | |
states_stateCodes <- sqldf("select a.*,b.* from states as a | |
left join stateCodes as b on a.State_gadm = b.State") | |
power_states <- sqldf("select a.State_gadm, a.Code, b.Demand, | |
b.Supplies, b.Net_Surplus from states_stateCodes as a | |
left join power_stateCodes as b on a.Code = b.Code") | |
power_states$log_Deficit <- log(-power_states$Net_Surplus+1) | |
breaks = quantile(power_states$log_Deficit,na.rm=TRUE) | |
breaks = 1 - exp(breaks) | |
power_states$Severity = cut(power_states$Net_Surplus,breaks,include.lowest =TRUE) | |
#with(power_states,power_states[Code == 'MH',]) | |
#crosscheck as we can't merge using name | |
sum(gadm@data$NAME_1==power_states$State_gadm)==nrow(gadm@data) | |
gadm <- spCbind(gadm,power_states) | |
plotclr <- rev(brewer.pal(length(levels(gadm@data$Severity)),"Blues")) | |
#using spplot | |
png(file="D:/JustAnotherDataBlog/Plots/FirstPost_PS_Pos_Nov_04_Ind_spplot.png",width=500,height=400) | |
spplot(gadm, "Severity", | |
col.regions=plotclr | |
,main="India: Net Power Supply Position for Nov 2004 (MU)")#, col="transparent") | |
dev.off() | |
#using ggplot | |
india <- fortify(gadm, region = "NAME_1") | |
#india.df <- join(india, gadm@data, by="NAME_1") doesn't work because id cols are different | |
names(india) | |
temp <- gadm@data | |
india.df <- sqldf("select a.* ,b.* from india as a | |
left join temp as b on a.id = b.Name_1")#it's not case sensitive | |
S_Code <- aggregate(cbind(long, lat) ~ Code, data=india.df, FUN=function(x)mean(range(x))) | |
S_Code <-ddply(india.df, .(Code), function(df) kmeans(df[,1:2], centers=1)$centers) | |
png(file="D:/JustAnotherDataBlog/Plots/FirstPost_PS_Pos_Nov_04_Ind_ggplot.png",width=500,height=400) | |
p <- ggplot(india.df,aes(long,lat)) + | |
geom_polygon(aes(group=group,fill=Severity),color='white') + | |
geom_text(data=S_Code,aes(long,lat,label=Code), size=2.5) + | |
#geom_path(color="white") + | |
coord_equal() + | |
scale_fill_brewer() + | |
ggtitle("India: Net Power Supply Position for Nov 2004 (MU)") | |
p | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment