-
-
Save fhr/5396233 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Analysis of DECC NEED data | |
## James Keirstead | |
## 22 November 2012 | |
## | |
## Underlying data available from | |
## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx | |
## For more info, see | |
## http://www.jameskeirstead.ca/dphil/we-need-more-data/ | |
## Updated by | |
## Francine Bennett | |
## April 2013 | |
## www.mastodonc.com | |
## on behalf of Homely (www.homely.com) | |
## Define useful global variables | |
file_name <- "6951-local-authority-consumption-tables-2010.xls" | |
### Define functions | |
## ------------------------------ | |
## Remove regional totals | |
## | |
## The spreadsheets contain subtotals by region and country. | |
## However since I want to analyse the information by local authority | |
## I first need to strip out these extra rows. | |
## df - the input data frame. It should contain the local authority | |
## names in the second column ("Col1") | |
remove_regional_totals <- function(df) { | |
## The regional names are kept in Col0 and therefore Col1 is blank | |
tmp <- subset(df, !is.na(Col1)) | |
## Remove whitespace from local authority names | |
require(stringr) | |
tmp <- mutate(tmp, Col1=str_trim(Col1)) | |
## Return the result, dropping the now empty first column | |
return(tmp[,-1]) | |
} | |
## Extract columns | |
## | |
## The data are arranged in columns with discrete categories. | |
## This function will extract a generic set of columns and apply a new set | |
## of labels | |
## | |
## df - the input data frame with local authority name in first column | |
## startCol - the number of the first column to be selected | |
## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears") | |
## category - the generic category of the labels, e.g. "Fruit" | |
## melt - return the columns as a melted data frame | |
extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) { | |
## Extract the columns and rename | |
tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))] | |
names(tmp) <- c("LAU", labels) | |
if (do_melt) { | |
require(reshape2) | |
tmp.m <- melt(tmp, id="LAU", variable.name="variable_value") | |
# Remove DECC's placeholder for confidential info | |
require(stringr) | |
tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA))) | |
tmp.m$category=category | |
return(tmp.m) | |
} else { | |
tmp$category=category | |
return(tmp) | |
} | |
} | |
## Extract the number of dwellings in each local authority, cut by our category | |
get_number_dwellings <- function(df,labels,category) { | |
tmp <- extract_columns(df, 3, labels, category) | |
names(tmp)[3] <- "number" | |
return(tmp) | |
} | |
## Extract the mean energy consumption in each local authority by our category | |
## Both the gas and electricity sheets position this information in the same column relative to label length | |
get_mean_consumption <- function(df,labels,category) { | |
tmp <- extract_columns(df, 6+length(labels), labels, category) | |
names(tmp)[3] <- "mean_energy_kwh" | |
return(tmp) | |
} | |
## A convenience function to assemble the overall data set | |
clean_data <- function(df,labels,category) { | |
tmp <- remove_regional_totals(df) | |
mean_energy <- get_mean_consumption(tmp,labels,category) | |
dwellings <- get_number_dwellings(tmp,labels,category) | |
result <- merge(mean_energy, dwellings) | |
return(result) | |
} | |
## @knitr run-analysis | |
## Run the analysis | |
## Read in the spreadsheet | |
## Start and end coordinates are by manual inspection | |
require(XLConnect) | |
floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250") | |
bedrooms_labels<-c("1","2","3","4","5 or more") | |
dwelling_type_labels<-c("Detached","Semi detached","End terrace","Mid terrace","Bungalow","Purpose built flat","Converted flat") | |
dwelling_age_labels<-c("Pre 1919","1919 to 1944","1945 to 1964","1965 to 1982","1983 to 1992","1993 to 1999","Post 1999") | |
tenure_labels<-c("Owner-occupied","Privately rented","Council /Housing Assocation") | |
household_income_labels<-c("Less than £10,000","£10,000 to £14,999","£15,000 to £19,999","£20,000 to £24,999","£25,000 to £29,999","£30,000 to £39,999", | |
"£40,000 to £49,999","£50,000 to £59,999","£60,000 to £74,999","£75,000 and over") | |
worksheet.list<-c("LAG1","LAG2","LAG3","LAG4","LAG5","LAG6","LAE1","LAE2","LAE3","LAE4","LAE5","LAE6") | |
end_cols<-rep(c("R","P","T","T","L","Z"),2) | |
category_list<-rep(c("floor_area","bedrooms","dwelling_type","dwelling_age","tenure","household_income"),2) | |
type_list<-c(rep("Gas",6),rep("Electrical",6)) | |
output<-data.frame(matrix(nrow=0,ncol=6)) | |
for (i in 1:length(worksheet.list)){ | |
temp<-readWorksheetFromFile(file_name, worksheet.list[i], | |
startRow=10, startCol=which(LETTERS=="A"), | |
endRow=432, endCol=which(LETTERS==end_cols[i])+1) | |
temp2<- clean_data(temp,get(paste(category_list[i],"_labels",sep="")),category_list[i]) | |
temp2$type<-type_list[i] | |
output<-rbind(output,temp2) | |
print(i) | |
} | |
write.csv(output,"flat_need_data.csv",row.names=FALSE) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment