######## Packages ######## | |
checkAndDownload<-function(packageNames) { | |
for(packageName in packageNames) { | |
if(!isInstalled(packageName)) { | |
install.packages(packageName,repos="http://lib.stat.cmu.edu/R/CRAN") | |
} | |
library(packageName,character.only=TRUE,quietly=TRUE,verbose=FALSE) | |
} | |
} | |
isInstalled <- function(mypkg){ | |
is.element(mypkg, installed.packages()[,1]) | |
} | |
packages <- c("sp","ggplot2","plyr","rgeos","maptools","sqldf","RColorBrewer") | |
checkAndDownload(packages) | |
# 'sp' a package for spatial data | |
# 'plyr' required for fortify which converts 'sp' data to | |
#polygons data to be used with ggplot2 | |
# 'rgeos' required for maptools | |
# 'maptools' required for fortify - region | |
################ Data Input ############## | |
#get the nation-wide map data with state boundaries | |
con <- url("http://biogeo.ucdavis.edu/data/gadm2/R/IND_adm1.RData") | |
load(con) | |
close(con) | |
#get the data from indian government data website about powerSuply position for a particular month | |
data_url = "http://data.gov.in/access-point-download-count?url=http://data.gov.in/sites/default/files/powerSupplyNov04.csv&nid=34321" | |
nov04 <- read.table( | |
file=data_url, | |
header=TRUE, | |
sep=",", | |
as.is=TRUE) | |
power = nov04 | |
#get the state codes file | |
#(data dictionary with indian states mapped to a 2 letter state code) | |
stateCodes <- read.table( | |
file="D:/JustAnotherDataBlog/Data/IndiaStateCodes.csv", | |
header=TRUE, | |
sep=",", | |
as.is=TRUE) | |
########## Adhoc Data cleaning ######### | |
#renaming the important columns | |
colnames(power) | |
colnames(power)[1] = "State" | |
colnames(power)[2] = "Demand" | |
colnames(power)[3] = "Supplies" | |
colnames(power)[4] = "Net Surplus" | |
colnames(power) | |
#adjusting power supply position in these states based on geographic area | |
#area numbers obtained from wikipedia | |
WB_Area <- 88752 | |
Sikkim_Area <- 7096 | |
Jharkhand_Area <-79714 | |
WB_Sikkim_Prop <- WB_Area / (WB_Area + Sikkim_Area) | |
WB_Jharkhand_Prop <- WB_Area / (WB_Area + Jharkhand_Area) | |
#1.West Bengal and Sikkim to be split in some ratio:ratio of areas | |
WB1 <- power[power[, "State" ]== "W B + Sikkim" , -1] * WB_Sikkim_Prop | |
SK <- power[power[, "State" ]== "W B + Sikkim" , -1] - WB1 | |
#2.DVC numbers to be divided between Jharkhand and West Bengal: by ratio of areas | |
WB2 <- power[power[, "State" ]== "DVC" , -1] * WB_Jharkhand_Prop | |
power[power[, "State" ]== "Jharkhand" , -1] <- power[power[, "State" ]== "Jharkhand" , -1] + | |
power[power[, "State" ]== "DVC" , -1] - WB2 | |
WB = WB1 + WB2 | |
#3. Tripura: not present in some of the datasets | |
#4. Andaman and Nicobar, Lakshdweep not part of the analysis | |
power = rbind(power,cbind(State="West Bengal",WB),cbind(State="Sikkim",SK)) | |
######### Preparing data for visualisation ########### | |
#merge power data with state codes file | |
power_stateCodes <- sqldf("select a.*,b.* from stateCodes as a inner join power as b on a.State = b.State") | |
#cross check | |
sum(power_stateCodes$Demand) == power[power[, "State" ]== "All India" , 2] | |
power_stateCodes$Net_check = power_stateCodes$Supplies - power_stateCodes$Demand | |
sum(power_stateCodes$Net_check - power_stateCodes$Net_Surplus) | |
as.data.frame(gadm) | |
gadm@data | |
states <- as.data.frame(gadm@data$NAME_1) | |
colnames(states) ="State_gadm" | |
states_stateCodes <- sqldf("select a.*,b.* from states as a | |
left join stateCodes as b on a.State_gadm = b.State") | |
power_states <- sqldf("select a.State_gadm, a.Code, b.Demand, | |
b.Supplies, b.Net_Surplus from states_stateCodes as a | |
left join power_stateCodes as b on a.Code = b.Code") | |
power_states$log_Deficit <- log(-power_states$Net_Surplus+1) | |
breaks = quantile(power_states$log_Deficit,na.rm=TRUE) | |
breaks = 1 - exp(breaks) | |
power_states$Severity = cut(power_states$Net_Surplus,breaks,include.lowest =TRUE) | |
#with(power_states,power_states[Code == 'MH',]) | |
#crosscheck as we can't merge using name | |
sum(gadm@data$NAME_1==power_states$State_gadm)==nrow(gadm@data) | |
gadm <- spCbind(gadm,power_states) | |
plotclr <- rev(brewer.pal(length(levels(gadm@data$Severity)),"Blues")) | |
#using spplot | |
png(file="D:/JustAnotherDataBlog/Plots/FirstPost_PS_Pos_Nov_04_Ind_spplot.png",width=500,height=400) | |
spplot(gadm, "Severity", | |
col.regions=plotclr | |
,main="India: Net Power Supply Position for Nov 2004 (MU)")#, col="transparent") | |
dev.off() | |
#using ggplot | |
india <- fortify(gadm, region = "NAME_1") | |
#india.df <- join(india, gadm@data, by="NAME_1") doesn't work because id cols are different | |
names(india) | |
temp <- gadm@data | |
india.df <- sqldf("select a.* ,b.* from india as a | |
left join temp as b on a.id = b.Name_1")#it's not case sensitive | |
S_Code <- aggregate(cbind(long, lat) ~ Code, data=india.df, FUN=function(x)mean(range(x))) | |
S_Code <-ddply(india.df, .(Code), function(df) kmeans(df[,1:2], centers=1)$centers) | |
png(file="D:/JustAnotherDataBlog/Plots/FirstPost_PS_Pos_Nov_04_Ind_ggplot.png",width=500,height=400) | |
p <- ggplot(india.df,aes(long,lat)) + | |
geom_polygon(aes(group=group,fill=Severity),color='white') + | |
geom_text(data=S_Code,aes(long,lat,label=Code), size=2.5) + | |
#geom_path(color="white") + | |
coord_equal() + | |
scale_fill_brewer() + | |
ggtitle("India: Net Power Supply Position for Nov 2004 (MU)") | |
p | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment