Plotting a referendum
@jwlawson, last active July 25, 2016
#!/bin/sh
# Ensure that all raw data is stored in folder ./raw
# Assumes that ./data/wards1.{dbf, shp, prj, shx} is the OS BoundaryLine
# shapefile encoded with the EPSG:4326 projection. If needed, convert the
# shapefile using
# ogr2ogr -t_srs "EPSG:4326" -f "ESRI Shapefile" ./data/wards1.shp [downloaded shp file]
#
# The downloaded shapefile will be named something like:
# OSNI_Open_Data_Largescale_Boundaries__District_Electoral_Areas_2012.shp
#
# python packages required:
# matplotlib
# mpl_toolkits basemap
# numpy
# pandas
# seaborn
#
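#
# A minimal setup sketch (an assumption, not part of the original pipeline):
# the Python dependencies above can usually be installed with pip, although
# basemap is often easier to install through conda.
# pip install matplotlib numpy pandas seaborn
# conda install basemap
#
# set -e  # uncomment to stop the pipeline at the first failing stage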
python pickle_postcodes.py
python pickle_results.py
python pickle_immigration.py
python pickle_unemployment.py
python pickle_cap.py
python find_cap_pc.py
python pickle_income.py
python combine_pickles.py
python draw.py
python scatter.py
"""
Combine the various data into a super table
"""
import pandas as pd
res = pd.read_pickle('./data/results.pkl').sort_index()
emp = pd.read_pickle('./data/unemployment.pkl').sort_index()
cpc = pd.read_pickle('./data/pc_cap2015.pkl').sort_index()
inc = pd.read_pickle('./data/income.pkl').sort_index()
imm = pd.read_pickle('./data/immigration.pkl').sort_index()
a = pd.merge(res, emp.drop('Area', axis=1), right_index=True, left_index=True,
             how='outer')
a = pd.merge(a, cpc, right_index=True, left_index=True, how='outer')
a = pd.merge(a, inc.drop('Area', axis=1), right_index=True, left_index=True,
             how='outer')
migration = imm[[('10 Year Growth', 'Non-British', 'Val'),
                 ('10 Year Growth', 'Non-UK Born', 'Val'),
                 ('2014', 'Non-British', 'Percent'),
                 ('2014', 'Non-UK Born', 'Percent')]]
migration.columns = ['Non-Brit_Growth', 'Non-UK_Growth', 'Pct_Non-Brit',
                     'Pct_Non-UK']
a = pd.merge(a, migration, right_index=True, left_index=True, how='outer')
a.to_pickle('./data/all.pkl')
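# Optional sanity check (not part of the original pipeline): each pickle
# above is indexed by area code, so after the outer merges the combined
# frame should still be indexed by those codes.
# print a.shape
# print a.columns.tolist()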
"""
Draw stuff with basemap
"""
import pandas as pd
import map_draw
a = pd.read_pickle('./data/all.pkl')
results = (a['Result'] + 1) / 2
map_draw.draw_data(results, 'votes', 'seismic', '100% Leave', '100% Remain',
                   'Results of 23rd June EU Referendum',
                   source='Electoral Commission')
cap = a['Total'] / 1000
map_draw.scale_draw(cap, 'cap', 'Blues',
                    'Total CAP spending in thousands of GBP\nby region in 2015',
                    source='DEFRA, cap-payments.defra.gov.uk')
rural = a['Rural_Development'] / 1000
map_draw.scale_draw(rural, 'ruraldev', 'Blues',
                    'Rural development spending in thousands of GBP\nby region in 2015',
                    source='DEFRA, cap-payments.defra.gov.uk')
inc = a['Income'] / 1000
map_draw.scale_draw(inc, 'income', 'Blues',
                    'Median income in thousands of GBP\nby region in 2015',
                    source='NOMIS, nomisweb.co.uk')
emp = a['Pct_Unemployed']
map_draw.scale_draw(emp, 'unemployed', 'Blues',
                    'Percentage of population (aged 16-64) unemployed\nin 2015',
                    source='NOMIS, nomisweb.co.uk')
turnout = a['Pct_Turnout']
map_draw.scale_draw(turnout, 'turnout', 'Blues',
                    'Percentage turnout at the EU referendum')
nuk = a['Pct_Non-UK']
map_draw.scale_draw(nuk, 'pctnonuk', 'Blues',
                    'Percentage of population not born in the UK\nin 2014',
                    source='Office for National Statistics')
nbr = a['Pct_Non-Brit']
map_draw.scale_draw(nbr, 'pctnonbrit', 'Blues',
                    'Percentage of population not British\nin 2014',
                    source='Office for National Statistics')
nug = a['Non-UK_Growth']
map_draw.scale_draw(nug, 'nonukgrowth', 'Blues',
                    'Growth in percent of population not born in the UK\nfrom 2004 to 2014',
                    source='Office for National Statistics')
nbg = a['Non-Brit_Growth']
map_draw.scale_draw(nbg, 'nonbritgrowth', 'Blues',
                    'Growth in percent of non-British population\nfrom 2004 to 2014',
                    source='Office for National Statistics')
"""
Convert the CAP data indexed by postcode to indexed by voting district code.
"""
import pandas as pd
cap = pd.read_pickle('./data/CAP2015.pkl')
pc = pd.read_pickle('./data/pc_prefixes.pkl')
# Reindex and convert CAP spending into area wards
cap = cap.reset_index()
cap_codes = cap.Postcode.str.strip().str.upper().map(pc)
cap_codes.name = 'Ward'
cap = pd.concat([cap, cap_codes], axis=1).groupby('Ward').sum()
cap.to_pickle('./data/pc_cap2015.pkl')
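# A toy illustration (hypothetical values) of the .map() lookup above:
# prefixes present in pc_prefixes.pkl map to their ward code, anything else
# becomes NaN and is then dropped by the groupby('Ward') sum.
# pd.Series(['YO1 ', 'zz99']).str.strip().str.upper().map(
#     pd.Series({'YO1': 'E06000014'}))   # -> ['E06000014', NaN]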
"""
Uses Ordnance Survey BoundaryLine data to plot district/unitary wards. Available
as open data, district_borough_unitary
"""
import cPickle as pickle
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import matplotlib.cm as cm
import matplotlib.colors
from mpl_toolkits.basemap import Basemap
import numpy as np
import os.path
basemap_pkl = './data/ward_map.pkl'
text_col = '#555555'
def get_basemap():
    # Reuse a pickled Basemap instance if one has already been created.
    if os.path.isfile(basemap_pkl):
        with open(basemap_pkl, 'r') as f:
            return pickle.load(f)
    else:
        pmap = Basemap(
            projection='tmerc',
            ellps='WGS84',
            llcrnrlat=49.16,
            llcrnrlon=-7.91,
            urcrnrlat=61.35,
            urcrnrlon=3.77,
            resolution=None,
            lat_0=54,
            lon_0=-1.7)
        with open(basemap_pkl, 'w') as f:
            pickle.dump(pmap, f, pickle.HIGHEST_PROTOCOL)
        return pmap
def draw_data(data, f, colors, mx, mn, title, source='', show=False):
    fig = plt.figure(figsize=(7, 7), dpi=96)
    ax = fig.add_subplot(111, axisbg='w', frame_on=False)
    cmap = plt.get_cmap(colors)
    m = get_basemap()
    m.readshapefile('./data/wards1', 'wards')
    patches = []
    colors = []
    names = {'GI': 'Gibraltar', 'N92000002': 'Northern Ireland',
             'N09000005': 'Derry and Strabane'}
    for info, shape in zip(m.wards_info, m.wards):
        code = info['CODE']
        names[code] = info['NAME']
        if code in data.index.values and ~np.isnan(data[code]):
            colors.append(cmap(data[code]))
        else:
            colors.append((0.3, 0.3, 0.3, 1.0))
        patches.append(Polygon(np.array(shape), True))
    pc = PatchCollection(patches, edgecolor=text_col, linewidths=.1, zorder=2)
    pc.set_facecolor(colors)
    ax.add_collection(pc)
    mappable = cm.ScalarMappable(cmap=cmap)
    mappable.set_array([])
    mappable.set_clim(0, 1)
    cbar = fig.colorbar(mappable, ticks=[0, 1], shrink=0.5)
    cbar_labels = cbar.ax.set_yticklabels([mn, mx], color=text_col)
    sort = data.dropna().sort_values(ascending=False)
    highest = '\n'.join([names[i] for i in sort.index[:4]])
    highest = 'Maximum Wards:\n\n' + highest
    lowest = '\n'.join([names[i] for i in sort.index[-4:][::-1]])
    lowest = 'Minimum Wards:\n\n' + lowest
    extremes = highest + '\n\n\n' + lowest
    details = cbar.ax.text(
        -1., 1 - 0.007,
        extremes,
        ha='right', va='top',
        size=6,
        color=text_col)
    copyright = 'Contains OS data '
    copyright += '$\copyright$ Crown copyright and database right 2016'
    if len(source) > 0:
        copyright += '\nData provided by ' + source
    copyright += '\nData available under the Open Government Licence'
    smallprint = ax.text(
        1., 0.05,
        copyright,
        ha='right', va='bottom',
        size=4,
        color=text_col,
        transform=ax.transAxes)
    title = ax.set_title(title, color=text_col)
    plt.tight_layout()
    # Save the map at three sizes, scaling the annotation text to match.
    fig.savefig('img/' + f + '_7.png', dpi=96, bbox_inches='tight')
    fig.set_size_inches(12, 12)
    details.set_size(details.get_size() * 12 / 7)
    smallprint.set_size(smallprint.get_size() * 12 / 7)
    title.set_size(title.get_size() * 12 / 7)
    for l in cbar_labels:
        l.set_size(l.get_size() * 12 / 7)
    fig.savefig('img/' + f + '_12.png', dpi=96, bbox_inches='tight')
    fig.set_size_inches(36, 36)
    details.set_size(details.get_size() * 36 / 12)
    smallprint.set_size(smallprint.get_size() * 36 / 12)
    title.set_size(title.get_size() * 36 / 12)
    for l in cbar_labels:
        l.set_size(l.get_size() * 36 / 12)
    fig.savefig('img/' + f + '_36.png', dpi=96, bbox_inches='tight')
    if show:
        plt.show()
def scale_draw(data, f, colors, title, source='', show=False):
    """
    Draws the provided data onto a choropleth map of the UK. The data will be
    rescaled from 0 to 1 in the process.
    The map is saved in 3 sizes to the provided filename as pngs.
    Arguments:
    data -- pandas Series indexed by district wards
    f -- filename to save plots to
    colors -- string identifying a matplotlib colormap
    title -- Title of the plot
    source -- attribution text for the data source (default '')
    show -- show the plot as an interactive window (default False)
    """
    mx = data.max()
    mn = data.min()
    if data._is_view:
        data = data.copy()
    # Rescale to [0, 1] (equivalent to matplotlib.colors.Normalize), keeping
    # the pandas index so that draw_data can look values up by ward code.
    data = (data - mn) / (mx - mn)
    mx = '{:.3g}'.format(mx)
    mn = '{:.3g}'.format(mn)
    draw_data(data, f, colors, mx, mn, title, source, show)
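# Example usage (mirrors draw.py in this gist): pass a Series indexed by ward
# code and scale_draw rescales it to [0, 1] before colouring the map.
# import pandas as pd
# a = pd.read_pickle('./data/all.pkl')
# scale_draw(a['Income'] / 1000, 'income', 'Blues',
#            'Median income in thousands of GBP\nby region in 2015',
#            source='NOMIS, nomisweb.co.uk')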
"""
Pickles the CAP data, extracting only non-identifiable data
Data available from:
http://cap-payments.defra.gov.uk/Download.aspx
Available under the `Open Government Licence for public sector information`
http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
"""
import pandas as pd
sheet_columns = ['PostcodePrefix_F202B', 'TownCity_F202C',
'OtherEAGFTotal', 'DirectEAGFTotal',
'RuralDevelopmentTotal', 'Total']
final_columns = ['Postcode', 'Town', 'Other_EAGF', 'Direct_EAGF',
'Rural_Development', 'Total']
def load_cap_data(filename):
    with pd.ExcelFile(filename) as xls:
        return pd.concat((pd.read_excel(xls, sheet) for sheet in
                          xls.sheet_names), ignore_index=True)
def organise_data(df):
    data = df.dropna(how='any', subset=['PostcodePrefix_F202B'])[sheet_columns]
    data.columns = final_columns
    town_names = (
        data[['Postcode', 'Town']].groupby('Postcode')
        .agg(lambda x: x.value_counts().index[0])
    )
    sums = data.groupby('Postcode').sum()
    return pd.concat([town_names, sums], axis=1)
def get_cap(year):
    df = load_cap_data('./raw/' + year + '_All_CAP_Search_Results_Data_P14.xls')
    return organise_data(df)
d15 = get_cap('2015')
d15.to_pickle('./data/CAP2015.pkl')
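# The value_counts() aggregation in organise_data keeps, for each postcode
# prefix, the town name that occurs most often among its payments. A toy
# example (hypothetical values):
# pd.Series(['York', 'York', 'Leeds']).value_counts().index[0]  # -> 'York'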
"""
Pickles the immigration data
Data available at:
https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/
migrationwithintheuk/datasets/localareamigrationindicatorsunitedkingdom
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
sheets = ['Non-UK Born Population', 'Non-British Population']
years = ['2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
'2013', '2014']
year_cols = ['Percent', 'CI']
other_cols = ['10 Year Growth', 'Area Name']
def load_sheet(xls, sheet):
    df = pd.read_excel(xls, sheet, header=[0, 2]).dropna(how='all')
    df = df.loc[~df.index.str.startswith('95')]
    df.columns.set_levels(['Area Name'] + years, level=0, inplace=True)
    df.columns.set_levels(['CI +/-', 'Estimate', 'Population', ''],
                          level=1, inplace=True)
    for year in years:
        df[year] = df[year].apply(lambda x: pd.to_numeric(x, errors='coerce'))
    df.sortlevel(axis=1, inplace=True)
    return df
def load_data(filename):
    with pd.ExcelFile(filename) as xls:
        result = {}
        for s in sheets:
            result[s] = load_sheet(xls, s)
        return result
def calc_perc(sub, total):
    return sub / total * 100
def calc_growth(start, end):
    return calc_perc(end - start, start)
def add_percents(df):
    for y in years:
        df[y, 'Percent'] = calc_perc(df[y, 'Estimate'], df[y, 'Population'])
        df[y, 'CI'] = calc_perc(df[y, 'CI +/-'], df[y, 'Population'])
    df['10 Year Growth', 'Val'] = calc_growth(df['2004', 'Percent'],
                                              df['2014', 'Percent'])
    df['10 Year Growth', 'CI+'] = (calc_growth(df['2004', 'Percent'] -
                                               df['2004', 'CI'],
                                               df['2014', 'Percent'] +
                                               df['2014', 'CI']) -
                                   df['10 Year Growth', 'Val'])
    df['10 Year Growth', 'CI-'] = (df['10 Year Growth', 'Val'] -
                                   calc_growth(df['2004', 'Percent'] +
                                               df['2004', 'CI'],
                                               df['2014', 'Percent'] -
                                               df['2014', 'CI']))
    df.sortlevel(axis=1, inplace=True)
def add_descriptive_index(df, ind):
    mi = pd.MultiIndex.from_tuples([(ind, x, y) for x, y in df.columns.values])
    df.columns = mi
    return df.swaplevel(0, 1, axis=1)
idx = pd.IndexSlice
filename = './raw/v12localareamigrationindicatorsaug15_tcm77-414816.xls'
data = load_data(filename)
nuk = data[sheets[0]]
nbr = data[sheets[1]]
pop = nuk.loc[idx[:, idx[:, 'Population']]]
index = pd.MultiIndex.from_tuples([(x, y, '') for x, y in pop.columns.values])
pop.columns = index
add_percents(nuk)
nuk = add_descriptive_index(nuk, 'Non-UK Born')
nuk.sortlevel(axis=1, inplace=True)
add_percents(nbr)
nbr = add_descriptive_index(nbr, 'Non-British')
nbr.sortlevel(axis=1, inplace=True)
nuk = pd.concat([nuk.loc[idx[:, other_cols]],
                 nuk.loc[idx[:, idx[:, :, year_cols]]]], axis=1)
nbr = pd.concat([nbr.loc[idx[:, other_cols]],
                 nbr.loc[idx[:, idx[:, :, year_cols]]]], axis=1)
cols = nbr.columns.difference(nuk.columns)
both = pd.merge(nuk, nbr[cols], left_index=True, right_index=True, how='outer')
both[('Area Name', '', '')] = both[('Area Name', 'Non-UK Born', '')]
both.sortlevel(axis=1, inplace=True)
both.drop([('Area Name', 'Non-UK Born'), ('Area Name', 'Non-British')], axis=1,
          inplace=True)
with_pop = pd.merge(both, pop, left_index=True, right_index=True, how='outer')
with_pop.sortlevel(axis=1, inplace=True)
with_pop.to_pickle('./data/immigration.pkl')
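# The pickled frame has three-level columns of (period, group, statistic);
# downstream code in combine_pickles.py selects from it with tuples such as
# with_pop[('2014', 'Non-UK Born', 'Percent')] and
# with_pop[('10 Year Growth', 'Non-British', 'Val')].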
"""
Data available from:
NOMIS website.
Dataset: Annual Survey of Hours and Earnings
-> annual survey of hours and earnings - resident analysis
Geography: local authorities: district / unitary (as of April 2015)
Date: 2015
Pay and hours: Annual pay - gross
Sex & full/part-time: full-time workers
Variable: median
Include Area codes
Assumes resulting filename is: 'nomis-median-income.csv'
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
inc = pd.read_csv('./raw/nomis-median-income.csv',header=7, na_values=['#','-'])
inc.columns = ['Area','Code','Income','Income_Conf']
inc.set_index('Code',inplace=True)
inc.drop('Column Total', inplace=True)
inc.sort_index(inplace=True)
inc = inc.dropna(subset=['Area'])
inc.to_pickle('./data/income.pkl')
"""
Loads OS Code-Point Open data and combines all postcodes into one DataFrame
mapping each postcode to its ward code, then builds a second table mapping
each postcode prefix to its most common ward code.
"""
import glob
import os
import pandas as pd
codepoint_dir = './raw/codepoint-open_1418967'
header_file = codepoint_dir + '/docs/Code-Point_Open_Column_Headers.csv'
one_let_dir = codepoint_dir + '/one_letter_pc_code'
two_let_dir = codepoint_dir + '/two_letter_pc_code'
def get_header():
    return pd.read_csv(header_file, header=1)
def get_all_csv(d, headers):
    files = glob.glob(os.path.join(d, "*.csv"))
    return pd.concat((pd.read_csv(f, names=headers) for f in files),
                     ignore_index=True)
def combine_all():
    h = get_header()
    all_one = get_all_csv(one_let_dir, h)
    all_two = get_all_csv(two_let_dir, h)
    return pd.concat([all_one, all_two], ignore_index=True)
a = combine_all()
a = a[['Postcode', 'Admin_district_code']]
a.columns = ['Postcode', 'Ward']
a.to_pickle('./data/postcodes.pkl')
prefix = a['Postcode'].str[0:4].str.strip()
prefix.name = 'PC Prefix'
a = pd.concat([prefix, a], axis=1)
b = a.dropna(how='any').groupby('PC Prefix')['Ward'].agg(
    lambda x: x.value_counts().index[0])
b.to_pickle('./data/pc_prefixes.pkl')
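# pc_prefixes.pkl is the lookup table used by find_cap_pc.py: each prefix is
# assigned the most common ward code among its postcodes. A quick check on a
# single (hypothetical) prefix:
# print b.get('YO1', 'prefix not found')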
"""
Pickle the referendum results
Data available at:
http://www.electoralcommission.org.uk/find-information-by-subject/
elections-and-referendums/upcoming-elections-and-referendums/eu-referendum/
electorate-and-count-information
Warning: No licence is specified for the data!
"""
import pandas as pd
wanted_cols = ['Area', 'Electorate', 'VerifiedBallotPapers', 'Pct_Turnout',
               'Valid_Votes', 'Remain', 'Leave', 'Rejected_Ballots',
               'Pct_Remain', 'Pct_Leave', 'Pct_Rejected']
rename_col = ['Area', 'Electorate', 'Ballots', 'Pct_Turnout',
              'Votes', 'Remain', 'Leave', 'Rejected', 'Pct_Remain',
              'Pct_Leave', 'Pct_Rejected']
d = pd.read_csv('./raw/EU-referendum-result-data.csv')
d = d.set_index('Area_Code').ix[:, 'Area':].sort_index()
d = d[wanted_cols]
d.columns = rename_col
d['Result'] = (d['Leave'] - d['Remain']) / d['Votes']
d.to_pickle('./data/results.pkl')
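# Result is the Leave margin as a share of valid votes, ranging from -1
# (everyone voted Remain) to +1 (everyone voted Leave). For example, 60,000
# Leave and 40,000 Remain votes out of 100,000 valid votes give
# Result = (60000 - 40000) / 100000 = 0.2, which draw.py rescales to
# (0.2 + 1) / 2 = 0.6 on its 0-1 colour scale.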
"""
Data available from:
NOMIS website.
Dataset: Claimant count -> Claimant count by sex and age
Geography: local authorities: district / unitary (as of April 2015)
Date: May 2016
Age: All (16+)
Rates: Claimant count AND Claimants as a proportion of residents aged 16-64
Sex: Total
Include Area codes
Assumes resulting filename is: 'nomis-unemployment.csv'
Data is released under the Open Government Licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/
"""
import pandas as pd
a = pd.read_csv('raw/nomis-unemployment.csv', header=6)
a.columns = ['Area','Code','Unemployed','Pct_Unemployed']
a.set_index('Code', inplace=True)
a = a.dropna()
a.drop('Column Total', inplace=True)
a.sort_index(inplace=True)
a.to_pickle('./data/unemployment.pkl')
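# As with the income data, the claimant count frame is indexed by area code
# with an 'Area' name column; the 'Column Total' summary row is dropped
# before pickling.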
"""
Draw various scatter plots of data
"""
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
a = pd.read_pickle('./data/all.pkl')
pairs = [
    ('Non-UK_Growth', 'Result'),
    ('Non-Brit_Growth', 'Result'),
    ('Pct_Non-UK', 'Result'),
    ('Pct_Non-Brit', 'Result'),
    ('Pct_Turnout', 'Result'),
    ('Income', 'Result'),
    ('Pct_Unemployed', 'Result'),
    ('Pct_Unemployed', 'Pct_Turnout'),
    ('Income', 'Pct_Unemployed'),
    ('Total', 'Result'),
    ('Rural_Development', 'Result')
]
names = {
    'Result': 'Proportion of Leave vs Remain votes',
    'Non-UK_Growth': '10 year growth in percent of population not born in UK',
    'Non-Brit_Growth': '10 year growth in percent of population not British',
    'Pct_Non-UK': 'Percent of population not born in the UK (2014)',
    'Pct_Non-Brit': 'Percent of population not British (2014)',
    'Pct_Turnout': 'Percentage turnout at EU referendum',
    'Pct_Unemployed': 'Percent of population aged 16-64 unemployed (May 2015)',
    'Income': 'Median income in GBP',
    'Total': 'Total CAP spend in GBP (2015)',
    'Rural_Development': 'Rural Development fund spend in GBP (2015)'
}
fig = plt.figure(figsize=(8, 6), dpi=96)
ax = fig.add_subplot(111)
for x, y in pairs:
    ax.clear()
    xj = 0.02 if x == 'Pct_Unemployed' else 0
    yj = 0.02 if y == 'Pct_Unemployed' else 0
    sns.regplot(x=x, y=y, data=a, ax=ax, robust=True, x_jitter=xj, y_jitter=yj,
                line_kws={'color': sns.xkcd_rgb["slate blue"]})
    title = textwrap.fill('{} compared to {}'.format(names[x], names[y]), 70)
    ax.set_title(title)
    ax.set_xlabel(textwrap.fill(names[x], 40))
    ax.set_ylabel(textwrap.fill(names[y], 40))
    plt.tight_layout()
    fig.savefig('./img/sc/{}-{}.png'.format(x, y))
x = 'Total'
y = 'Result'
ax.clear()
ax.set(xscale='log')
sns.regplot(x=x, y=y, data=a, ax=ax, robust=True, fit_reg=False,
            line_kws={'color': sns.xkcd_rgb["slate blue"]})
title = textwrap.fill('{} compared to {}'.format(names[x], names[y]), 70)
ax.set_title(title)
ax.set_xlabel(textwrap.fill(names[x], 40))
ax.set_ylabel(textwrap.fill(names[y], 40))
plt.tight_layout()
fig.savefig('./img/sc/{}-{}-log.png'.format(x,y))
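# Note: regplot(robust=True) needs statsmodels installed for the robust fit;
# if it is unavailable, a plain least-squares fit can be drawn instead, e.g.
# sns.regplot(x='Income', y='Result', data=a, ax=ax, robust=False)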