Skip to content

Instantly share code, notes, and snippets.

@jkeirstead
Last active October 13, 2015 03:48
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save jkeirstead/4134752 to your computer and use it in GitHub Desktop.
Save jkeirstead/4134752 to your computer and use it in GitHub Desktop.
Analysis of DECC NEED data
## Analysis of DECC NEED data
## James Keirstead
## 22 November 2012
##
## Underlying data available from
## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
## For more info, see
## http://www.jameskeirstead.ca/blog/we-need-more-data/
## @knitr define-functions
## Define useful global variables
## ------------------------------
file_name <- "data/6951-local-authority-consumption-tables-2010-1.xls"
floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")
### Define functions
## ------------------------------
## Remove regional totals
##
## The spreadsheets contain subtotals by region and country.
## However since I want to analyse the information by local authority
## I first need to strip out these extra rows.
## df - the input data frame. It should contain the local authority
## names in the second column ("Col1")
remove_regional_totals <- function(df) {
## The regional names are kept in Col0 and therefore Col1 is blank
tmp <- subset(df, !is.na(Col1))
## Remove whitespace from local authority names
require(stringr)
tmp <- mutate(tmp, Col1=str_trim(Col1))
## Return the result, dropping the now empty first column
return(tmp[,-1])
}
## Extract columns
##
## The data are arranged in columns with discrete categories.
## This function will extract a generic set of columns and apply a new set
## of labels
##
## df - the input data frame with local authority name in first column
## startCol - the number of the first column to be selected
## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
## category - the generic category of the labels, e.g. "Fruit"
## melt - return the columns as a melted data frame
extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
## Extract the columns and rename
tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
names(tmp) <- c("LAU", labels)
if (do_melt) {
require(reshape2)
tmp.m <- melt(tmp, id="LAU", variable.name=category)
# Remove DECC's placeholder for confidential info
require(stringr)
tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
return(tmp.m)
} else {
return(tmp)
}
}
## Extract the number of dwellings in each local authority by floor area
get_number_dwellings <- function(df) {
tmp <- extract_columns(df, 3, floor_area_labels, "floor")
names(tmp)[3] <- "number"
return(tmp)
}
## Extract the mean energy consumption in each local authority by floor area
## Both the gas and electricity sheets position this information in the same column
get_mean_consumption <- function(df) {
tmp <- extract_columns(df, 12, floor_area_labels, "floor")
names(tmp)[3] <- "energy"
return(tmp)
}
## A convenience function to assemble the overall data set
clean_data <- function(df) {
tmp <- remove_regional_totals(df)
mean_energy <- get_mean_consumption(tmp)
dwellings <- get_number_dwellings(tmp)
result <- merge(mean_energy, dwellings)
return(result)
}
## @knitr run-analysis
## Run the analysis
##------------------------
## Read in the spreadsheet
## Start and end coordinates are by manual inspection
require(XLConnect)
## Process the gas data first
raw_gas_data <- readWorksheetFromFile(file_name, "LAG1",
startRow=10, startCol=which(LETTERS=="A"),
endRow=432, endCol=which(LETTERS=="S"))
gas_data <- clean_data(raw_gas_data)
## Then the electricity data
raw_elec_data <- readWorksheetFromFile(file_name, "LAE1",
startRow=10, startCol=which(LETTERS=="A"),
endRow=432, endCol=which(LETTERS=="S"))
elec_data <- clean_data(raw_elec_data)
## Merge the two data sets, renaming columns first
df.gas <- summarize(gas_data, LAU=LAU, floor=floor, gas=energy)
df.elec <- summarize(elec_data, LAU=LAU, floor=floor, elec=energy)
data <- merge(df.gas, df.elec)
data <- melt(data, id=c('LAU', 'floor'), variable.name='fuel')
data <- mutate(data, fuel=factor(fuel, lev=c('elec','gas'), labels=c('Electricity', 'Gas')))
## @knitr make-plots
require(ggplot2)
gg <- ggplot(data, aes(x=floor, y=value)) +
geom_boxplot() +
facet_wrap(~ fuel, ncol=1, scale='free') +
labs(x="Floor area (m2)", y="Mean annual energy consumption (kWh)") +
theme_bw()
print(gg)
@fhr
Copy link

fhr commented Apr 16, 2013

Hi James - I've updated this to read all the tabs of the spreadsheet. It generalises a couple of the functions and then writes everything to a dataframe called "output". https://gist.github.com/fhr/5396233

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment