fhr/need-analysis.r

## need-analysis.r
## Analysis of DECC NEED data
## James Keirstead
## 22 November 2012
##
## Underlying data available from
##     http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
## For more info, see
##     http://www.jameskeirstead.ca/dphil/we-need-more-data/

## Updated by
## Francine Bennett
## April 2013
## www.mastodonc.com
## on behalf of Homely (www.homely.com)


## Define useful global variables
file_name <- "6951-local-authority-consumption-tables-2010.xls"

### Define functions
## ------------------------------

## Remove regional totals
##
## The spreadsheets contain subtotals by region and country.
## However since I want to analyse the information by local authority
## I first need to strip out these extra rows.

##    df  - the input data frame.  It should contain the local authority
##          names in the second column ("Col1")
remove_regional_totals <- function(df) {
  ## The regional names are kept in Col0 and therefore Col1 is blank
  tmp <- subset(df, !is.na(Col1))

  ## Remove whitespace from local authority names
  require(stringr)
  tmp <- mutate(tmp, Col1=str_trim(Col1))

  ## Return the result, dropping the now empty first column
  return(tmp[,-1])
}

## Extract columns
##
## The data are arranged in columns with discrete categories.
## This function will extract a generic set of columns and apply a new set
## of labels
##
##    df       - the input data frame with local authority name in first column
##    startCol - the number of the first column to be selected
##    labels   - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
##    category - the generic category of the labels, e.g. "Fruit"
##    melt     - return the columns as a melted data frame

extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
  ## Extract the columns and rename
  tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
  names(tmp) <- c("LAU", labels)

  if (do_melt) {
    require(reshape2)
    tmp.m <- melt(tmp, id="LAU", variable.name="variable_value")
	# Remove DECC's placeholder for confidential info
	require(stringr)
	tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
	tmp.m$category=category
    return(tmp.m)
  } else {
    tmp$category=category
    return(tmp)
  }
}


## Extract the number of dwellings in each local authority, cut by our category
get_number_dwellings <- function(df,labels,category) {
  tmp <- extract_columns(df, 3, labels, category)
  names(tmp)[3] <- "number"
  return(tmp)
}

## Extract the mean energy consumption in each local authority by our category
## Both the gas and electricity sheets position this information in the same column relative to label length
get_mean_consumption <- function(df,labels,category) {
  tmp <- extract_columns(df, 6+length(labels), labels, category)
  names(tmp)[3] <- "mean_energy_kwh"
  return(tmp)
}

## A convenience function to assemble the overall data set
clean_data <- function(df,labels,category) {
   tmp <- remove_regional_totals(df)
   mean_energy <- get_mean_consumption(tmp,labels,category)
   dwellings <- get_number_dwellings(tmp,labels,category)
   result <- merge(mean_energy, dwellings)
   return(result)
}

## @knitr run-analysis
## Run the analysis
## Read in the spreadsheet
## Start and end coordinates are by manual inspection
require(XLConnect)

floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")
bedrooms_labels<-c("1","2","3","4","5 or more")
dwelling_type_labels<-c("Detached","Semi detached","End terrace","Mid terrace","Bungalow","Purpose built flat","Converted flat")
dwelling_age_labels<-c("Pre 1919","1919 to 1944","1945 to 1964","1965 to 1982","1983 to 1992","1993 to 1999","Post 1999")
tenure_labels<-c("Owner-occupied","Privately rented","Council /Housing Assocation")
household_income_labels<-c("Less than £10,000","£10,000 to £14,999","£15,000 to £19,999","£20,000 to £24,999","£25,000 to £29,999","£30,000 to £39,999",
"£40,000 to £49,999","£50,000 to £59,999","£60,000 to £74,999","£75,000 and over")

worksheet.list<-c("LAG1","LAG2","LAG3","LAG4","LAG5","LAG6","LAE1","LAE2","LAE3","LAE4","LAE5","LAE6")
end_cols<-rep(c("R","P","T","T","L","Z"),2)
category_list<-rep(c("floor_area","bedrooms","dwelling_type","dwelling_age","tenure","household_income"),2)
type_list<-c(rep("Gas",6),rep("Electrical",6))

output<-data.frame(matrix(nrow=0,ncol=6))

for (i in 1:length(worksheet.list)){
  temp<-readWorksheetFromFile(file_name, worksheet.list[i],
                              startRow=10, startCol=which(LETTERS=="A"),
                              endRow=432, endCol=which(LETTERS==end_cols[i])+1)
  temp2<- clean_data(temp,get(paste(category_list[i],"_labels",sep="")),category_list[i])
  temp2$type<-type_list[i]
  output<-rbind(output,temp2)
  print(i)
}

write.csv(output,"flat_need_data.csv",row.names=FALSE)
	## Analysis of DECC NEED data
	## James Keirstead
	## 22 November 2012
	##
	## Underlying data available from
	## http://www.decc.gov.uk/en/content/cms/statistics/energy_stats/en_effic_stats/need/need.aspx
	## For more info, see
	## http://www.jameskeirstead.ca/dphil/we-need-more-data/

	## Updated by
	## Francine Bennett
	## April 2013
	## www.mastodonc.com
	## on behalf of Homely (www.homely.com)


	## Define useful global variables
	file_name <- "6951-local-authority-consumption-tables-2010.xls"

	### Define functions
	## ------------------------------

	## Remove regional totals
	##
	## The spreadsheets contain subtotals by region and country.
	## However since I want to analyse the information by local authority
	## I first need to strip out these extra rows.

	## df - the input data frame. It should contain the local authority
	## names in the second column ("Col1")
	remove_regional_totals <- function(df) {
	## The regional names are kept in Col0 and therefore Col1 is blank
	tmp <- subset(df, !is.na(Col1))

	## Remove whitespace from local authority names
	require(stringr)
	tmp <- mutate(tmp, Col1=str_trim(Col1))

	## Return the result, dropping the now empty first column
	return(tmp[,-1])
	}

	## Extract columns
	##
	## The data are arranged in columns with discrete categories.
	## This function will extract a generic set of columns and apply a new set
	## of labels
	##
	## df - the input data frame with local authority name in first column
	## startCol - the number of the first column to be selected
	## labels - the labels of the extracted columns, e.g. c("Apples", "Oranges", "Pears")
	## category - the generic category of the labels, e.g. "Fruit"
	## melt - return the columns as a melted data frame

	extract_columns <- function(df, startCol, labels, category, do_melt=TRUE) {
	## Extract the columns and rename
	tmp <- df[, c(1, startCol:(startCol + length(labels) - 1))]
	names(tmp) <- c("LAU", labels)

	if (do_melt) {
	require(reshape2)
	tmp.m <- melt(tmp, id="LAU", variable.name="variable_value")
	# Remove DECC's placeholder for confidential info
	require(stringr)
	tmp.m <- mutate(tmp.m, value=as.numeric(str_replace(value, "x", NA)))
	tmp.m$category=category
	return(tmp.m)
	} else {
	tmp$category=category
	return(tmp)
	}
	}


	## Extract the number of dwellings in each local authority, cut by our category
	get_number_dwellings <- function(df,labels,category) {
	tmp <- extract_columns(df, 3, labels, category)
	names(tmp)[3] <- "number"
	return(tmp)
	}

	## Extract the mean energy consumption in each local authority by our category
	## Both the gas and electricity sheets position this information in the same column relative to label length
	get_mean_consumption <- function(df,labels,category) {
	tmp <- extract_columns(df, 6+length(labels), labels, category)
	names(tmp)[3] <- "mean_energy_kwh"
	return(tmp)
	}

	## A convenience function to assemble the overall data set
	clean_data <- function(df,labels,category) {
	tmp <- remove_regional_totals(df)
	mean_energy <- get_mean_consumption(tmp,labels,category)
	dwellings <- get_number_dwellings(tmp,labels,category)
	result <- merge(mean_energy, dwellings)
	return(result)
	}

	## @knitr run-analysis
	## Run the analysis
	## Read in the spreadsheet
	## Start and end coordinates are by manual inspection
	require(XLConnect)

	floor_area_labels <- c("50 or less", "51 to 100", "101 to 150", "151 to 200", "201 to 250", "Over 250")
	bedrooms_labels<-c("1","2","3","4","5 or more")
	dwelling_type_labels<-c("Detached","Semi detached","End terrace","Mid terrace","Bungalow","Purpose built flat","Converted flat")
	dwelling_age_labels<-c("Pre 1919","1919 to 1944","1945 to 1964","1965 to 1982","1983 to 1992","1993 to 1999","Post 1999")
	tenure_labels<-c("Owner-occupied","Privately rented","Council /Housing Assocation")
	household_income_labels<-c("Less than £10,000","£10,000 to £14,999","£15,000 to £19,999","£20,000 to £24,999","£25,000 to £29,999","£30,000 to £39,999",
	"£40,000 to £49,999","£50,000 to £59,999","£60,000 to £74,999","£75,000 and over")

	worksheet.list<-c("LAG1","LAG2","LAG3","LAG4","LAG5","LAG6","LAE1","LAE2","LAE3","LAE4","LAE5","LAE6")
	end_cols<-rep(c("R","P","T","T","L","Z"),2)
	category_list<-rep(c("floor_area","bedrooms","dwelling_type","dwelling_age","tenure","household_income"),2)
	type_list<-c(rep("Gas",6),rep("Electrical",6))

	output<-data.frame(matrix(nrow=0,ncol=6))

	for (i in 1:length(worksheet.list)){
	temp<-readWorksheetFromFile(file_name, worksheet.list[i],
	startRow=10, startCol=which(LETTERS=="A"),
	endRow=432, endCol=which(LETTERS==end_cols[i])+1)
	temp2<- clean_data(temp,get(paste(category_list[i],"_labels",sep="")),category_list[i])
	temp2$type<-type_list[i]
	output<-rbind(output,temp2)
	print(i)
	}

	write.csv(output,"flat_need_data.csv",row.names=FALSE)