jkeirstead/need-analysis.r

## need-analysis.r
## Analysis of DECC NEED data
## James Keirstead
## 22 November 2012
##
## Underlying data available from
##     http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
## For more info, see
##     http://www.jameskeirstead.ca/blog/we-need-more-data/


## @knitr define-functions
## Define useful global variables
## ------------------------------

file_name <- "data/6951-local-authority-consumption-tables-2010-1.xls"
floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")

### Define functions
## ------------------------------

## Remove regional totals
##
## The spreadsheets contain subtotals by region and country.
## However since I want to analyse the information by local authority
## I first need to strip out these extra rows.

##    df  - the input data frame.  It should contain the local authority
##          names in the second column ("Col1")
remove_regional_totals <- function(df) {
  ## The regional names are kept in Col0 and therefore Col1 is blank
  tmp <- subset(df, !is.na(Col1))

  ## Remove whitespace from local authority names
  require(stringr)
  tmp <- mutate(tmp, Col1=str_trim(Col1))

  ## Return the result, dropping the now empty first column
  return(tmp[,-1])
}

## Extract columns
##
## The data are arranged in columns with discrete categories.
## This function will extract a generic set of columns and apply a new set
## of labels
##
##    df       - the input data frame with local authority name in first column
##    startCol - the number of the first column to be selected
##    labels   - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
##    category - the generic category of the labels, e.g. "Fruit"
##    melt     - return the columns as a melted data frame
extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
  ## Extract the columns and rename
  tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
  names(tmp) <- c("LAU", labels)

  if (do_melt) {
    require(reshape2)
    tmp.m <- melt(tmp, id="LAU", variable.name=category)
	# Remove DECC's placeholder for confidential info
	require(stringr)
	tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
	return(tmp.m)
  } else {
    return(tmp)
  }
}


## Extract the number of dwellings in each local authority by floor area
get_number_dwellings <- function(df) {
  tmp <- extract_columns(df, 3, floor_area_labels, "floor")
  names(tmp)[3] <- "number"
  return(tmp)
}

## Extract the mean energy consumption in each local authority by floor area
## Both the gas and electricity sheets position this information in the same column
get_mean_consumption <- function(df) {
  tmp <- extract_columns(df, 12, floor_area_labels, "floor")
  names(tmp)[3] <- "energy"
  return(tmp)
}

## A convenience function to assemble the overall data set
clean_data <- function(df) {
   tmp <- remove_regional_totals(df)
   mean_energy <- get_mean_consumption(tmp)
   dwellings <- get_number_dwellings(tmp)
   result <- merge(mean_energy, dwellings)
   return(result)
}


## @knitr run-analysis
## Run the analysis
##------------------------

## Read in the spreadsheet
## Start and end coordinates are by manual inspection
require(XLConnect)

## Process the gas data first
raw_gas_data <- readWorksheetFromFile(file_name, "LAG1",
	startRow=10, startCol=which(LETTERS=="A"),
	endRow=432, endCol=which(LETTERS=="S"))
gas_data <- clean_data(raw_gas_data)

## Then the electricity data
raw_elec_data <- readWorksheetFromFile(file_name, "LAE1",
	startRow=10, startCol=which(LETTERS=="A"),
	endRow=432, endCol=which(LETTERS=="S"))
elec_data <- clean_data(raw_elec_data)

## Merge the two data sets, renaming columns first
df.gas <- summarize(gas_data, LAU=LAU, floor=floor, gas=energy)
df.elec <- summarize(elec_data, LAU=LAU, floor=floor, elec=energy)
data <- merge(df.gas, df.elec)
data <- melt(data, id=c('LAU', 'floor'), variable.name='fuel')
data <- mutate(data, fuel=factor(fuel, lev=c('elec','gas'), labels=c('Electricity', 'Gas')))


## @knitr make-plots
require(ggplot2)
gg <- ggplot(data, aes(x=floor, y=value)) +
  geom_boxplot() +
  facet_wrap(~ fuel, ncol=1, scale='free') +
  labs(x="Floor area (m2)", y="Mean annual energy consumption (kWh)") +
  theme_bw()
print(gg)
	## Analysis of DECC NEED data
	## James Keirstead
	## 22 November 2012
	##
	## Underlying data available from
	## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
	## For more info, see
	## http://www.jameskeirstead.ca/blog/we-need-more-data/


	## @knitr define-functions
	## Define useful global variables
	## ------------------------------

	file_name <- "data/6951-local-authority-consumption-tables-2010-1.xls"
	floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")

	### Define functions
	## ------------------------------

	## Remove regional totals
	##
	## The spreadsheets contain subtotals by region and country.
	## However since I want to analyse the information by local authority
	## I first need to strip out these extra rows.

	## df - the input data frame. It should contain the local authority
	## names in the second column ("Col1")
	remove_regional_totals <- function(df) {
	## The regional names are kept in Col0 and therefore Col1 is blank
	tmp <- subset(df, !is.na(Col1))

	## Remove whitespace from local authority names
	require(stringr)
	tmp <- mutate(tmp, Col1=str_trim(Col1))

	## Return the result, dropping the now empty first column
	return(tmp[,-1])
	}

	## Extract columns
	##
	## The data are arranged in columns with discrete categories.
	## This function will extract a generic set of columns and apply a new set
	## of labels
	##
	## df - the input data frame with local authority name in first column
	## startCol - the number of the first column to be selected
	## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
	## category - the generic category of the labels, e.g. "Fruit"
	## melt - return the columns as a melted data frame
	extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
	## Extract the columns and rename
	tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
	names(tmp) <- c("LAU", labels)

	if (do_melt) {
	require(reshape2)
	tmp.m <- melt(tmp, id="LAU", variable.name=category)
	# Remove DECC's placeholder for confidential info
	require(stringr)
	tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
	return(tmp.m)
	} else {
	return(tmp)
	}
	}


	## Extract the number of dwellings in each local authority by floor area
	get_number_dwellings <- function(df) {
	tmp <- extract_columns(df, 3, floor_area_labels, "floor")
	names(tmp)[3] <- "number"
	return(tmp)
	}

	## Extract the mean energy consumption in each local authority by floor area
	## Both the gas and electricity sheets position this information in the same column
	get_mean_consumption <- function(df) {
	tmp <- extract_columns(df, 12, floor_area_labels, "floor")
	names(tmp)[3] <- "energy"
	return(tmp)
	}

	## A convenience function to assemble the overall data set
	clean_data <- function(df) {
	tmp <- remove_regional_totals(df)
	mean_energy <- get_mean_consumption(tmp)
	dwellings <- get_number_dwellings(tmp)
	result <- merge(mean_energy, dwellings)
	return(result)
	}


	## @knitr run-analysis
	## Run the analysis
	##------------------------

	## Read in the spreadsheet
	## Start and end coordinates are by manual inspection
	require(XLConnect)

	## Process the gas data first
	raw_gas_data <- readWorksheetFromFile(file_name, "LAG1",
	startRow=10, startCol=which(LETTERS=="A"),
	endRow=432, endCol=which(LETTERS=="S"))
	gas_data <- clean_data(raw_gas_data)

	## Then the electricity data
	raw_elec_data <- readWorksheetFromFile(file_name, "LAE1",
	startRow=10, startCol=which(LETTERS=="A"),
	endRow=432, endCol=which(LETTERS=="S"))
	elec_data <- clean_data(raw_elec_data)

	## Merge the two data sets, renaming columns first
	df.gas <- summarize(gas_data, LAU=LAU, floor=floor, gas=energy)
	df.elec <- summarize(elec_data, LAU=LAU, floor=floor, elec=energy)
	data <- merge(df.gas, df.elec)
	data <- melt(data, id=c('LAU', 'floor'), variable.name='fuel')
	data <- mutate(data, fuel=factor(fuel, lev=c('elec','gas'), labels=c('Electricity', 'Gas')))


	## @knitr make-plots
	require(ggplot2)
	gg <- ggplot(data, aes(x=floor, y=value)) +
	geom_boxplot() +
	facet_wrap(~ fuel, ncol=1, scale='free') +
	labs(x="Floor area (m2)", y="Mean annual energy consumption (kWh)") +
	theme_bw()
	print(gg)