Plotting a referendum
@jwlawson, last active July 25, 2016
#!/bin/sh
# Ensure that all raw data is stored in folder ./raw
# Assumes that ./data/wards1.{dbf, shp, prj, shx} is the OS BoundaryLine
# shapefile encoded with the EPSG:4326 projection. If needed, convert the
# shapefile using
# ogr2ogr -t_srs "EPSG:4326" -f "ESRI Shapefile" ./data/wards1.shp [downloaded shp file]
#
# The downloaded shapefile will be named something like:
# OSNI_Open_Data_Largescale_Boundaries__District_Electoral_Areas_2012.shp
#
# python packages required:
# matplotlib
# mpl_toolkits basemap
# numpy
# pandas
# seaborn
#
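#
# A minimal setup sketch (an assumption, not part of the original pipeline):
# the Python dependencies above can usually be installed with pip, although
# basemap is often easier to install through conda.
# pip install matplotlib numpy pandas seaborn
# conda install basemap
#
# set -e  # uncomment to stop the pipeline at the first failing stage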
python pickle_postcodes.py
python pickle_results.py
python pickle_immigration.py
python pickle_unemployment.py
python pickle_cap.py
python find_cap_pc.py
python pickle_income.py
python combine_pickles.py
python draw.py
python scatter.py
"""
Combine the various data into a super table
"""
import pandas as pd
res = pd.read_pickle('./data/results.pkl').sort_index()
emp = pd.read_pickle('./data/unemployment.pkl').sort_index()
cpc = pd.read_pickle('./data/pc_cap2015.pkl').sort_index()
inc = pd.read_pickle('./data/income.pkl').sort_index()
imm = pd.read_pickle('./data/immigration.pkl').sort_index()
a = pd.merge(res, emp.drop('Area', axis=1), right_index=True, left_index=True,
             how='outer')
a = pd.merge(a, cpc, right_index=True, left_index=True, how='outer')
a = pd.merge(a, inc.drop('Area', axis=1), right_index=True, left_index=True,
             how='outer')
migration = imm[[('10 Year Growth', 'Non-British', 'Val'),
                 ('10 Year Growth', 'Non-UK Born', 'Val'),
                 ('2014', 'Non-British', 'Percent'),
                 ('2014', 'Non-UK Born', 'Percent')]]
migration.columns = ['Non-Brit_Growth', 'Non-UK_Growth', 'Pct_Non-Brit',
                     'Pct_Non-UK']
a = pd.merge(a, migration, right_index=True, left_index=True, how='outer')
a.to_pickle('./data/all.pkl')
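# Optional sanity check (not part of the original pipeline): each pickle
# above is indexed by area code, so after the outer merges the combined
# frame should still be indexed by those codes.
# print a.shape
# print a.columns.tolist()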
"""
Draw stuff with basemap
"""
import pandas as pd
import map_draw
a = pd.read_pickle('./data/all.pkl')
results = (a['Result'] + 1) / 2
map_draw.draw_data(results, 'votes', 'seismic', '100% Leave', '100% Remain',
                   'Results of 23rd June EU Referendum',
                   source='Electoral Commission')
cap = a['Total'] / 1000
map_draw.scale_draw(cap, 'cap', 'Blues',
                    'Total CAP spending in thousands of GBP\nby region in 2015',
                    source='DEFRA, cap-payments.defra.gov.uk')
rural = a['Rural_Development'] / 1000
map_draw.scale_draw(rural, 'ruraldev', 'Blues',
                    'Rural development spending in thousands of GBP\nby region in 2015',
                    source='DEFRA, cap-payments.defra.gov.uk')
inc = a['Income'] / 1000
map_draw.scale_draw(inc, 'income', 'Blues',
                    'Median income in thousands of GBP\nby region in 2015',
                    source='NOMIS, nomisweb.co.uk')
emp = a['Pct_Unemployed']
map_draw.scale_draw(emp, 'unemployed', 'Blues',
                    'Percentage of population (aged 16-64) unemployed\nin 2015',
                    source='NOMIS, nomisweb.co.uk')
turnout = a['Pct_Turnout']
map_draw.scale_draw(turnout, 'turnout', 'Blues',
                    'Percentage turnout at the EU referendum')
nuk = a['Pct_Non-UK']
map_draw.scale_draw(nuk, 'pctnonuk', 'Blues',
                    'Percentage of population not born in the UK\nin 2014',
                    source='Office for National Statistics')
nbr = a['Pct_Non-Brit']
map_draw.scale_draw(nbr, 'pctnonbrit', 'Blues',
                    'Percentage of population not British\nin 2014',
                    source='Office for National Statistics')
nug = a['Non-UK_Growth']
map_draw.scale_draw(nug, 'nonukgrowth', 'Blues',
                    'Growth in percent of population not born in the UK\nfrom 2004 to 2014',
                    source='Office for National Statistics')
nbg = a['Non-Brit_Growth']
map_draw.scale_draw(nbg, 'nonbritgrowth', 'Blues',
                    'Growth in percent of non-British population\nfrom 2004 to 2014',
                    source='Office for National Statistics')
"""
Convert the CAP data indexed by postcode to indexed by voting district code.
"""
import pandas as pd
cap = pd.read_pickle('./data/CAP2015.pkl')
pc = pd.read_pickle('./data/pc_prefixes.pkl')
# Reindex and convert CAP spending into area wards
cap = cap.reset_index()
cap_codes = cap.Postcode.str.strip().str.upper().map(pc)
cap_codes.name = 'Ward'
cap = pd.concat([cap, cap_codes], axis=1).groupby('Ward').sum()
cap.to_pickle('./data/pc_cap2015.pkl')
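# A toy illustration (hypothetical values) of the .map() lookup above:
# prefixes present in pc_prefixes.pkl map to their ward code, anything else
# becomes NaN and is then dropped by the groupby('Ward') sum.
# pd.Series(['YO1 ', 'zz99']).str.strip().str.upper().map(
#     pd.Series({'YO1': 'E06000014'}))   # -> ['E06000014', NaN]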
"""
Uses Ordnance Survey BoundaryLine data to plot district/unitary wards. Available
as open data, district_borough_unitary
"""
import cPickle as pickle
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import matplotlib.cm as cm
import matplotlib.colors
from mpl_toolkits.basemap import Basemap
import numpy as np
import os.path
basemap_pkl = './data/ward_map.pkl'
text_col = '#555555'
def get_basemap():
    # Reuse a pickled Basemap instance if one has already been created.
    if os.path.isfile(basemap_pkl):
        with open(basemap_pkl, 'r') as f:
            return pickle.load(f)
    else:
        pmap = Basemap(
            projection='tmerc',
            ellps='WGS84',
            llcrnrlat=49.16,
            llcrnrlon=-7.91,
            urcrnrlat=61.35,
            urcrnrlon=3.77,
            resolution=None,
            lat_0=54,
            lon_0=-1.7)
        with open(basemap_pkl, 'w') as f:
            pickle.dump(pmap, f, pickle.HIGHEST_PROTOCOL)
        return pmap
def draw_data(data, f, colors, mx, mn, title, source='', show=False):
    fig = plt.figure(figsize=(7, 7), dpi=96)
    ax = fig.add_subplot(111, axisbg='w', frame_on=False)
    cmap = plt.get_cmap(colors)
    m = get_basemap()
    m.readshapefile('./data/wards1', 'wards')
    patches = []
    colors = []
    names = {'GI': 'Gibraltar', 'N92000002': 'Northern Ireland',
             'N09000005': 'Derry and Strabane'}
    for info, shape in zip(m.wards_info, m.wards):
        code = info['CODE']
        names[code] = info['NAME']
        if code in data.index.values and ~np.isnan(data[code]):
            colors.append(cmap(data[code]))
        else:
            colors.append((0.3, 0.3, 0.3, 1.0))
        patches.append(Polygon(np.array(shape), True))
    pc = PatchCollection(patches, edgecolor=text_col, linewidths=.1, zorder=2)
    pc.set_facecolor(colors)
    ax.add_collection(pc)
    mappable = cm.ScalarMappable(cmap=cmap)
    mappable.set_array([])
    mappable.set_clim(0, 1)
    cbar = fig.colorbar(mappable, ticks=[0, 1], shrink=0.5)
    cbar_labels = cbar.ax.set_yticklabels([mn, mx], color=text_col)
    sort = data.dropna().sort_values(ascending=False)
    highest = '\n'.join([names[i] for i in sort.index[:4]])
    highest = 'Maximum Wards:\n\n' + highest
    lowest = '\n'.join([names[i] for i in sort.index[-4:][::-1]])
    lowest = 'Minimum Wards:\n\n' + lowest
    extremes = highest + '\n\n\n' + lowest
    details = cbar.ax.text(
        -1., 1 - 0.007,
        extremes,
        ha='right', va='top',
        size=6,
        color=text_col)
    copyright = 'Contains OS data '
    copyright += '$\copyright$ Crown copyright and database right 2016'
    if len(source) > 0:
        copyright += '\nData provided by ' + source
    copyright += '\nData available under the Open Government Licence'
    smallprint = ax.text(
        1., 0.05,
        copyright,
        ha='right', va='bottom',
        size=4,
        color=text_col,
        transform=ax.transAxes)
    title = ax.set_title(title, color=text_col)
    plt.tight_layout()
    # Save the map at three sizes, scaling the annotation text to match.
    fig.savefig('img/' + f + '_7.png', dpi=96, bbox_inches='tight')
    fig.set_size_inches(12, 12)
    details.set_size(details.get_size() * 12 / 7)
    smallprint.set_size(smallprint.get_size() * 12 / 7)
    title.set_size(title.get_size() * 12 / 7)
    for l in cbar_labels:
        l.set_size(l.get_size() * 12 / 7)
    fig.savefig('img/' + f + '_12.png', dpi=96, bbox_inches='tight')
    fig.set_size_inches(36, 36)
    details.set_size(details.get_size() * 36 / 12)
    smallprint.set_size(smallprint.get_size() * 36 / 12)
    title.set_size(title.get_size() * 36 / 12)
    for l in cbar_labels:
        l.set_size(l.get_size() * 36 / 12)
    fig.savefig('img/' + f + '_36.png', dpi=96, bbox_inches='tight')
    if show:
        plt.show()
def scale_draw(data, f, colors, title, source='', show=False):
    """
    Draws the provided data onto a choropleth map of the UK. The data will be
    rescaled from 0 to 1 in the process.
    The map is saved in 3 sizes to the provided filename as pngs.
    Arguments:
    data -- pandas Series indexed by district wards
    f -- filename to save plots to
    colors -- string identifying a matplotlib colormap
    title -- Title of the plot
    source -- attribution text for the data source (default '')
    show -- show the plot as an interactive window (default False)
    """
    mx = data.max()
    mn = data.min()
    if data._is_view:
        data = data.copy()
    # Rescale to [0, 1] (equivalent to matplotlib.colors.Normalize), keeping
    # the pandas index so that draw_data can look values up by ward code.
    data = (data - mn) / (mx - mn)
    mx = '{:.3g}'.format(mx)
    mn = '{:.3g}'.format(mn)
    draw_data(data, f, colors, mx, mn, title, source, show)
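# Example usage (mirrors draw.py in this gist): pass a Series indexed by ward
# code and scale_draw rescales it to [0, 1] before colouring the map.
# import pandas as pd
# a = pd.read_pickle('./data/all.pkl')
# scale_draw(a['Income'] / 1000, 'income', 'Blues',
#            'Median income in thousands of GBP\nby region in 2015',
#            source='NOMIS, nomisweb.co.uk')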
"""
Pickles the CAP data, extracting only non-identifiable data
Data available from:
http://cap-payments.defra.gov.uk/Download.aspx
Available under the `Open Government Licence for public sector information`
http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
"""
import pandas as pd
sheet_columns = ['PostcodePrefix_F202B', 'TownCity_F202C',
'OtherEAGFTotal', 'DirectEAGFTotal',
'RuralDevelopmentTotal', 'Total']
final_columns = ['Postcode', 'Town', 'Other_EAGF', 'Direct_EAGF',
'Rural_Development', 'Total']
def load_cap_data(filename):
    with pd.ExcelFile(filename) as xls:
        return pd.concat((pd.read_excel(xls, sheet) for sheet in
                          xls.sheet_names), ignore_index=True)
def organise_data(df):
    data = df.dropna(how='any', subset=['PostcodePrefix_F202B'])[sheet_columns]
    data.columns = final_columns
    town_names = (
        data[['Postcode', 'Town']].groupby('Postcode')
        .agg(lambda x: x.value_counts().index[0])
    )
    sums = data.groupby('Postcode').sum()
    return pd.concat([town_names, sums], axis=1)
def get_cap(year):
    df = load_cap_data('./raw/' + year + '_All_CAP_Search_Results_Data_P14.xls')
    return organise_data(df)
d15 = get_cap('2015')
d15.to_pickle('./data/CAP2015.pkl')
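# The value_counts() aggregation in organise_data keeps, for each postcode
# prefix, the town name that occurs most often among its payments. A toy
# example (hypothetical values):
# pd.Series(['York', 'York', 'Leeds']).value_counts().index[0]  # -> 'York'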
"""
Pickles the immigration data
Data available at:
https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/
migrationwithintheuk/datasets/localareamigrationindicatorsunitedkingdom
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
sheets = ['Non-UK Born Population', 'Non-British Population']
years = ['2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
'2013', '2014']
year_cols = ['Percent', 'CI']
other_cols = ['10 Year Growth', 'Area Name']
def load_sheet(xls, sheet):
    df = pd.read_excel(xls, sheet, header=[0, 2]).dropna(how='all')
    df = df.loc[~df.index.str.startswith('95')]
    df.columns.set_levels(['Area Name'] + years, level=0, inplace=True)
    df.columns.set_levels(['CI +/-', 'Estimate', 'Population', ''],
                          level=1, inplace=True)
    for year in years:
        df[year] = df[year].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df.sortlevel(axis=1, inplace=True)
    return df
def load_data(filename):
    with pd.ExcelFile(filename) as xls:
        result = {}
        for s in sheets:
            result[s] = load_sheet(xls, s)
        return result
def calc_perc(sub, total):
    return sub / total * 100
def calc_growth(start, end):
    return calc_perc(end - start, start)
def add_percents(df):
    for y in years:
        df[y, 'Percent'] = calc_perc(df[y, 'Estimate'], df[y, 'Population'])
        df[y, 'CI'] = calc_perc(df[y, 'CI +/-'], df[y, 'Population'])
    df['10 Year Growth', 'Val'] = calc_growth(df['2004', 'Percent'],
                                              df['2014', 'Percent'])
    df['10 Year Growth', 'CI+'] = (calc_growth(df['2004', 'Percent'] -
                                               df['2004', 'CI'],
                                               df['2014', 'Percent'] +
                                               df['2014', 'CI']) -
                                   df['10 Year Growth', 'Val'])
    df['10 Year Growth', 'CI-'] = (df['10 Year Growth', 'Val'] -
                                   calc_growth(df['2004', 'Percent'] +
                                               df['2004', 'CI'],
                                               df['2014', 'Percent'] -
                                               df['2014', 'CI']))
    df.sortlevel(axis=1, inplace=True)
def add_descriptive_index(df, ind):
    mi = pd.MultiIndex.from_tuples([(ind, x, y) for x, y in df.columns.values])
    df.columns = mi
    return df.swaplevel(0, 1, axis=1)
idx = pd.IndexSlice
filename = './raw/v12localareamigrationindicatorsaug15_tcm77-414816.xls'
data = load_data(filename)
nuk = data[sheets[0]]
nbr = data[sheets[1]]
pop = nuk.loc[idx[:, idx[:, 'Population']]]
index = pd.MultiIndex.from_tuples([(x, y, '') for x, y in pop.columns.values])
pop.columns = index
add_percents(nuk)
nuk = add_descriptive_index(nuk, 'Non-UK Born')
nuk.sortlevel(axis=1, inplace=True)
add_percents(nbr)
nbr = add_descriptive_index(nbr, 'Non-British')
nbr.sortlevel(axis=1, inplace=True)
nuk = pd.concat([nuk.loc[idx[:, other_cols]],
                 nuk.loc[idx[:, idx[:, :, year_cols]]]], axis=1)
nbr = pd.concat([nbr.loc[idx[:, other_cols]],
                 nbr.loc[idx[:, idx[:, :, year_cols]]]], axis=1)
cols = nbr.columns.difference(nuk.columns)
both = pd.merge(nuk, nbr[cols], left_index=True, right_index=True, how='outer')
both[('Area Name', '', '')] = both[('Area Name', 'Non-UK Born', '')]
both.sortlevel(axis=1, inplace=True)
both.drop([('Area Name', 'Non-UK Born'), ('Area Name', 'Non-British')], axis=1,
          inplace=True)
with_pop = pd.merge(both, pop, left_index=True, right_index=True, how='outer')
with_pop.sortlevel(axis=1, inplace=True)
with_pop.to_pickle('./data/immigration.pkl')
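# The pickled frame has three-level columns of (period, group, statistic);
# downstream code in combine_pickles.py selects from it with tuples such as
# with_pop[('2014', 'Non-UK Born', 'Percent')] and
# with_pop[('10 Year Growth', 'Non-British', 'Val')].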
"""
Data available from:
NOMIS website.
Dataset: Annual Survey of Hours and Earnings
-> annual survey of hours and earnings - resident analysis
Geography: local authorities: district / unitary (as of April 2015)
Date: 2015
Pay and hours: Annual pay - gross
Sex & full/part-time: full-time workers
Variable: median
Include Area codes
Assumes resulting filename is: 'nomis-median-income.csv'
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
inc = pd.read_csv('./raw/nomis-median-income.csv',header=7, na_values=['#','-'])
inc.columns = ['Area','Code','Income','Income_Conf']
inc.set_index('Code',inplace=True)
inc.drop('Column Total', inplace=True)
inc.sort_index(inplace=True)
inc = inc.dropna(subset=['Area'])
inc.to_pickle('./data/income.pkl')
"""
Loads OS Code-Point Open data and combines all postcodes into one DataFrame
mapping each postcode to its ward code, then builds a second table mapping
each postcode prefix to its most common ward code.
"""
import glob
import os
import pandas as pd
codepoint_dir = './raw/codepoint-open_1418967'
header_file = codepoint_dir + '/docs/Code-Point_Open_Column_Headers.csv'
one_let_dir = codepoint_dir + '/one_letter_pc_code'
two_let_dir = codepoint_dir + '/two_letter_pc_code'
def get_header():
    return pd.read_csv(header_file, header=1)
def get_all_csv(d, headers):
    files = glob.glob(os.path.join(d, "*.csv"))
    return pd.concat((pd.read_csv(f, names=headers) for f in files),
                     ignore_index=True)
def combine_all():
    h = get_header()
    all_one = get_all_csv(one_let_dir, h)
    all_two = get_all_csv(two_let_dir, h)
    return pd.concat([all_one, all_two], ignore_index=True)
a = combine_all()
a = a[['Postcode', 'Admin_district_code']]
a.columns = ['Postcode', 'Ward']
a.to_pickle('./data/postcodes.pkl')
prefix = a['Postcode'].str[0:4].str.strip()
prefix.name = 'PC Prefix'
a = pd.concat([prefix, a], axis=1)
b = a.dropna(how='any').groupby('PC Prefix')['Ward'].agg(
    lambda x: x.value_counts().index[0])
b.to_pickle('./data/pc_prefixes.pkl')
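# pc_prefixes.pkl is the lookup table used by find_cap_pc.py: each prefix is
# assigned the most common ward code among its postcodes. A quick check on a
# single (hypothetical) prefix:
# print b.get('YO1', 'prefix not found')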
"""
Pickle the referendum results
Data available at:
http://www.electoralcommission.org.uk/find-information-by-subject/
elections-and-referendums/upcoming-elections-and-referendums/eu-referendum/
electorate-and-count-information
Warning: No licence is specified for the data!
"""
import pandas as pd
wanted_cols = ['Area', 'Electorate', 'VerifiedBallotPapers', 'Pct_Turnout',
               'Valid_Votes', 'Remain', 'Leave', 'Rejected_Ballots',
               'Pct_Remain', 'Pct_Leave', 'Pct_Rejected']
rename_col = ['Area', 'Electorate', 'Ballots', 'Pct_Turnout',
              'Votes', 'Remain', 'Leave', 'Rejected', 'Pct_Remain',
              'Pct_Leave', 'Pct_Rejected']
d = pd.read_csv('./raw/EU-referendum-result-data.csv')
d = d.set_index('Area_Code').ix[:, 'Area':].sort_index()
d = d[wanted_cols]
d.columns = rename_col
d['Result'] = (d['Leave'] - d['Remain']) / d['Votes']
d.to_pickle('./data/results.pkl')
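# Result is the Leave margin as a share of valid votes, ranging from -1
# (everyone voted Remain) to +1 (everyone voted Leave). For example, 60,000
# Leave and 40,000 Remain votes out of 100,000 valid votes give
# Result = (60000 - 40000) / 100000 = 0.2, which draw.py rescales to
# (0.2 + 1) / 2 = 0.6 on its 0-1 colour scale.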
"""
Data available from:
NOMIS website.
Dataset: Claimant count -> Claimant count by sex and age
Geography: local authorities: district / unitary (as of April 2015)
Date: May 2016
Age: All (16+)
Rates: Claimant count AND Claimants as a proportion of residents aged 16-64
Sex: Total
Include Area codes
Assumes resulting filename is: 'nomis-unemployment.csv'
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
a = pd.read_csv('raw/nomis-unemployment.csv', header=6)
a.columns = ['Area','Code','Unemployed','Pct_Unemployed']
a.set_index('Code', inplace=True)
a = a.dropna()
a.drop('Column Total', inplace=True)
a.sort_index(inplace=True)
a.to_pickle('./data/unemployment.pkl')
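# As with the income data, the claimant count frame is indexed by area code
# with an 'Area' name column; the 'Column Total' summary row is dropped
# before pickling.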
"""
Draw various scatter plots of data
"""
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
a = pd.read_pickle('./data/all.pkl')
pairs = [
    ('Non-UK_Growth', 'Result'),
    ('Non-Brit_Growth', 'Result'),
    ('Pct_Non-UK', 'Result'),
    ('Pct_Non-Brit', 'Result'),
    ('Pct_Turnout', 'Result'),
    ('Income', 'Result'),
    ('Pct_Unemployed', 'Result'),
    ('Pct_Unemployed', 'Pct_Turnout'),
    ('Income', 'Pct_Unemployed'),
    ('Total', 'Result'),
    ('Rural_Development', 'Result')
]
names = {
    'Result': 'Proportion of Leave vs Remain votes',
    'Non-UK_Growth': '10 year growth in percent of population not born in UK',
    'Non-Brit_Growth': '10 year growth in percent of population not British',
    'Pct_Non-UK': 'Percent of population not born in the UK (2014)',
    'Pct_Non-Brit': 'Percent of population not British (2014)',
    'Pct_Turnout': 'Percentage turnout at EU referendum',
    'Pct_Unemployed': 'Percent of population aged 16-64 unemployed (May 2015)',
    'Income': 'Median income in GBP',
    'Total': 'Total CAP spend in GBP (2015)',
    'Rural_Development': 'Rural Development fund spend in GBP (2015)'
}
fig = plt.figure(figsize=(8, 6), dpi=96)
ax = fig.add_subplot(111)
for x, y in pairs:
    ax.clear()
    xj = 0.02 if x == 'Pct_Unemployed' else 0
    yj = 0.02 if y == 'Pct_Unemployed' else 0
    sns.regplot(x=x, y=y, data=a, ax=ax, robust=True, x_jitter=xj, y_jitter=yj,
                line_kws={'color': sns.xkcd_rgb["slate blue"]})
    title = textwrap.fill('{} compared to {}'.format(names[x], names[y]), 70)
    ax.set_title(title)
    ax.set_xlabel(textwrap.fill(names[x], 40))
    ax.set_ylabel(textwrap.fill(names[y], 40))
    plt.tight_layout()
    fig.savefig('./img/sc/{}-{}.png'.format(x, y))
x = 'Total'
y = 'Result'
ax.clear()
ax.set(xscale='log')
sns.regplot(x=x, y=y, data=a, ax=ax, robust=True, fit_reg=False,
            line_kws={'color': sns.xkcd_rgb["slate blue"]})
title = textwrap.fill('{} compared to {}'.format(names[x], names[y]), 70)
ax.set_title(title)
ax.set_xlabel(textwrap.fill(names[x], 40))
ax.set_ylabel(textwrap.fill(names[y], 40))
plt.tight_layout()
fig.savefig('./img/sc/{}-{}-log.png'.format(x,y))
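# Note: regplot(robust=True) needs statsmodels installed for the robust fit;
# if it is unavailable, a plain least-squares fit can be drawn instead, e.g.
# sns.regplot(x='Income', y='Result', data=a, ax=ax, robust=False)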