Last active
October 13, 2015 03:48
-
-
Save jkeirstead/4134752 to your computer and use it in GitHub Desktop.
Analysis of DECC NEED data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Analysis of DECC NEED data | |
## James Keirstead | |
## 22 November 2012 | |
## | |
## Underlying data available from | |
## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx | |
## For more info, see | |
## http://www.jameskeirstead.ca/blog/we-need-more-data/ | |
## @knitr define-functions | |
## Define useful global variables | |
## ------------------------------ | |
file_name <- "data/6951-local-authority-consumption-tables-2010-1.xls" | |
floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250") | |
### Define functions | |
## ------------------------------ | |
## Remove regional totals | |
## | |
## The spreadsheets contain subtotals by region and country. | |
## However since I want to analyse the information by local authority | |
## I first need to strip out these extra rows. | |
## df - the input data frame. It should contain the local authority | |
## names in the second column ("Col1") | |
remove_regional_totals <- function(df) { | |
## The regional names are kept in Col0 and therefore Col1 is blank | |
tmp <- subset(df, !is.na(Col1)) | |
## Remove whitespace from local authority names | |
require(stringr) | |
tmp <- mutate(tmp, Col1=str_trim(Col1)) | |
## Return the result, dropping the now empty first column | |
return(tmp[,-1]) | |
} | |
## Extract columns | |
## | |
## The data are arranged in columns with discrete categories. | |
## This function will extract a generic set of columns and apply a new set | |
## of labels | |
## | |
## df - the input data frame with local authority name in first column | |
## startCol - the number of the first column to be selected | |
## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears") | |
## category - the generic category of the labels, e.g. "Fruit" | |
## melt - return the columns as a melted data frame | |
extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) { | |
## Extract the columns and rename | |
tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))] | |
names(tmp) <- c("LAU", labels) | |
if (do_melt) { | |
require(reshape2) | |
tmp.m <- melt(tmp, id="LAU", variable.name=category) | |
# Remove DECC's placeholder for confidential info | |
require(stringr) | |
tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA))) | |
return(tmp.m) | |
} else { | |
return(tmp) | |
} | |
} | |
## Extract the number of dwellings in each local authority by floor area | |
get_number_dwellings <- function(df) { | |
tmp <- extract_columns(df, 3, floor_area_labels, "floor") | |
names(tmp)[3] <- "number" | |
return(tmp) | |
} | |
## Extract the mean energy consumption in each local authority by floor area | |
## Both the gas and electricity sheets position this information in the same column | |
get_mean_consumption <- function(df) { | |
tmp <- extract_columns(df, 12, floor_area_labels, "floor") | |
names(tmp)[3] <- "energy" | |
return(tmp) | |
} | |
## A convenience function to assemble the overall data set | |
clean_data <- function(df) { | |
tmp <- remove_regional_totals(df) | |
mean_energy <- get_mean_consumption(tmp) | |
dwellings <- get_number_dwellings(tmp) | |
result <- merge(mean_energy, dwellings) | |
return(result) | |
} | |
## @knitr run-analysis | |
## Run the analysis | |
##------------------------ | |
## Read in the spreadsheet | |
## Start and end coordinates are by manual inspection | |
require(XLConnect) | |
## Process the gas data first | |
raw_gas_data <- readWorksheetFromFile(file_name, "LAG1", | |
startRow=10, startCol=which(LETTERS=="A"), | |
endRow=432, endCol=which(LETTERS=="S")) | |
gas_data <- clean_data(raw_gas_data) | |
## Then the electricity data | |
raw_elec_data <- readWorksheetFromFile(file_name, "LAE1", | |
startRow=10, startCol=which(LETTERS=="A"), | |
endRow=432, endCol=which(LETTERS=="S")) | |
elec_data <- clean_data(raw_elec_data) | |
## Merge the two data sets, renaming columns first | |
df.gas <- summarize(gas_data, LAU=LAU, floor=floor, gas=energy) | |
df.elec <- summarize(elec_data, LAU=LAU, floor=floor, elec=energy) | |
data <- merge(df.gas, df.elec) | |
data <- melt(data, id=c('LAU', 'floor'), variable.name='fuel') | |
data <- mutate(data, fuel=factor(fuel, lev=c('elec','gas'), labels=c('Electricity', 'Gas'))) | |
## @knitr make-plots | |
require(ggplot2) | |
gg <- ggplot(data, aes(x=floor, y=value)) + | |
geom_boxplot() + | |
facet_wrap(~ fuel, ncol=1, scale='free') + | |
labs(x="Floor area (m2)", y="Mean annual energy consumption (kWh)") + | |
theme_bw() | |
print(gg) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi James - I've updated this to read all the tabs of the spreadsheet. It generalises a couple of the functions and then writes everything to a dataframe called "output". https://gist.github.com/fhr/5396233