Skip to content

Instantly share code, notes, and snippets.

@fhr
Forked from jkeirstead/need-analysis.r
Last active December 16, 2015 07:08
Show Gist options
  • Save fhr/5396233 to your computer and use it in GitHub Desktop.
Save fhr/5396233 to your computer and use it in GitHub Desktop.
## Analysis of DECC NEED data
## James Keirstead
## 22 November 2012
##
## Underlying data available from
## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
## For more info, see
## http://www.jameskeirstead.ca/dphil/we-need-more-data/
## Updated by
## Francine Bennett
## April 2013
## www.mastodonc.com
## on behalf of Homely (www.homely.com)
## Define useful global variables
file_name <- "6951-local-authority-consumption-tables-2010.xls"
### Define functions
## ------------------------------
## Remove regional totals
##
## The spreadsheets contain subtotals by region and country.
## However since I want to analyse the information by local authority
## I first need to strip out these extra rows.
## df - the input data frame. It should contain the local authority
## names in the second column ("Col1")
remove_regional_totals <- function(df) {
## The regional names are kept in Col0 and therefore Col1 is blank
tmp <- subset(df, !is.na(Col1))
## Remove whitespace from local authority names
require(stringr)
tmp <- mutate(tmp, Col1=str_trim(Col1))
## Return the result, dropping the now empty first column
return(tmp[,-1])
}
## Extract columns
##
## The data are arranged in columns with discrete categories.
## This function will extract a generic set of columns and apply a new set
## of labels
##
## df - the input data frame with local authority name in first column
## startCol - the number of the first column to be selected
## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
## category - the generic category of the labels, e.g. "Fruit"
## melt - return the columns as a melted data frame
extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
## Extract the columns and rename
tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
names(tmp) <- c("LAU", labels)
if (do_melt) {
require(reshape2)
tmp.m <- melt(tmp, id="LAU", variable.name="variable_value")
# Remove DECC's placeholder for confidential info
require(stringr)
tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
tmp.m$category=category
return(tmp.m)
} else {
tmp$category=category
return(tmp)
}
}
## Extract the number of dwellings in each local authority, cut by our category
get_number_dwellings <- function(df,labels,category) {
tmp <- extract_columns(df, 3, labels, category)
names(tmp)[3] <- "number"
return(tmp)
}
## Extract the mean energy consumption in each local authority by our category
## Both the gas and electricity sheets position this information in the same column relative to label length
get_mean_consumption <- function(df,labels,category) {
tmp <- extract_columns(df, 6+length(labels), labels, category)
names(tmp)[3] <- "mean_energy_kwh"
return(tmp)
}
## A convenience function to assemble the overall data set
clean_data <- function(df,labels,category) {
tmp <- remove_regional_totals(df)
mean_energy <- get_mean_consumption(tmp,labels,category)
dwellings <- get_number_dwellings(tmp,labels,category)
result <- merge(mean_energy, dwellings)
return(result)
}
## @knitr run-analysis
## Run the analysis
## Read in the spreadsheet
## Start and end coordinates are by manual inspection
require(XLConnect)
floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")
bedrooms_labels<-c("1","2","3","4","5 or more")
dwelling_type_labels<-c("Detached","Semi detached","End terrace","Mid terrace","Bungalow","Purpose built flat","Converted flat")
dwelling_age_labels<-c("Pre 1919","1919 to 1944","1945 to 1964","1965 to 1982","1983 to 1992","1993 to 1999","Post 1999")
tenure_labels<-c("Owner-occupied","Privately rented","Council /Housing Assocation")
household_income_labels<-c("Less than £10,000","£10,000 to £14,999","£15,000 to £19,999","£20,000 to £24,999","£25,000 to £29,999","£30,000 to £39,999",
"£40,000 to £49,999","£50,000 to £59,999","£60,000 to £74,999","£75,000 and over")
worksheet.list<-c("LAG1","LAG2","LAG3","LAG4","LAG5","LAG6","LAE1","LAE2","LAE3","LAE4","LAE5","LAE6")
end_cols<-rep(c("R","P","T","T","L","Z"),2)
category_list<-rep(c("floor_area","bedrooms","dwelling_type","dwelling_age","tenure","household_income"),2)
type_list<-c(rep("Gas",6),rep("Electrical",6))
output<-data.frame(matrix(nrow=0,ncol=6))
for (i in 1:length(worksheet.list)){
temp<-readWorksheetFromFile(file_name, worksheet.list[i],
startRow=10, startCol=which(LETTERS=="A"),
endRow=432, endCol=which(LETTERS==end_cols[i])+1)
temp2<- clean_data(temp,get(paste(category_list[i],"_labels",sep="")),category_list[i])
temp2$type<-type_list[i]
output<-rbind(output,temp2)
print(i)
}
write.csv(output,"flat_need_data.csv",row.names=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment