Skip to content

Instantly share code, notes, and snippets.

@lynaghk
Created June 17, 2011 17:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lynaghk/1031879 to your computer and use it in GitHub Desktop.
Save lynaghk/1031879 to your computer and use it in GitHub Desktop.
Create UK University Stats Database
#!/usr/bin/env ruby
#
# Copyright Keming Labs 2011
# But you can use it under the MIT license. Yay!
# See our visualization of these data here:
#
# http://keminglabs.com/ukuni/
#
# This script imports data from UCAS
#
# http://www.ucas.com/about_us/stat_services/stats_online/annual_datasets_to_download/
#
# specifically UK "Ethnicity / Social Class", 2003--2010.
# The data is provided as executable self-unzipping excel sheets, and
# this script normalizes and imports into SQLite, then spits out JSON.
# Requires JRuby and the following gems:
require 'rubygems'
require 'spreadsheet' #spreadsheet access library
require 'facets' #ruby core language extensions
require 'sequel' #SQL database library
require 'pp' #pretty printing
require 'json'
DB = Sequel.connect("jdbc:sqlite:///#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/ukuni.db")
class Array
def to_i
self.collect{|x| x.to_i}
end
end
if not DB.table_exists?(:ukuni)
DB.create_table :ukuni do
primary_key :id
String :age
String :ethnicity
String :subject
String :gender
String :year
Integer :applications
Integer :accepts
end
fields = [:age, :ethnicity, :subject, :male_apps, :female_apps, :male_accs, :female_accs]
%w(eth03.xls eth04.xls eth05.xls eth06.xls eth_v2_07.xls eth_v2_08.xls eth_v2_09.xls eth_v2_10.xls).each_with_index{|filename, i|
year = 2003 + i
excel_path = "#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/#{filename}"
sheet = Spreadsheet.open(excel_path).worksheets.first
DB.transaction do
#skip the first six (or seven, in the case of 2010) rows
sheet.each(filename == "eth_v2_10.xls" ? 6 : 5){|x|
next if x[0].nil?
h = Hash.zip fields, if filename == "eth_v2_10.xls" or filename == "eth_v2_09.xls"
x.values_at(0, 1, 3, 4, 5, 7, 8)
else
x.values_at(0, 1, 4, 5, 6, 8, 9)
end
shared = Hash[h.select{|k,v| [:age, :ethnicity, :subject].include? k}]
shared[:year] = year
#normalize "unknown" and "unknown, preferred not to say"
shared[:ethnicity] = "Unknown" if shared[:ethnicity].match /unknown/i
#The Chinese split off from the asians in 2004, wtf.
shared[:ethnicity] = "Asian" if shared[:ethnicity].match /Chinese/i
DB[:ukuni].insert shared.merge({ :gender => "male",
:applications => h[:male_apps],
:accepts => h[:male_accs]})
DB[:ukuni].insert shared.merge({ :gender => "female",
:applications => h[:female_apps],
:accepts => h[:female_accs] })
}
end
}
end
fields = [:age, :ethnicity, :gender, :subject, :year]
combined = Hash.autonew
DB["SELECT #{fields*', '}, SUM(applications) AS applied, SUM(accepts) AS accepted
FROM ukuni
GROUP BY #{fields*', '}"].each{|x|
#to save on bandwidth + clientside aggregation, we are going to
#give nested hashes by year
shared = Hash[x.select{|k,v| [:age, :ethnicity, :gender, :subject].include? k}]
combined[shared][x[:year]][:accepted] = x[:accepted]
combined[shared][x[:year]][:applied] = x[:applied]
}
open('data.json', 'w'){|f|
f.write JSON.generate combined.collect{|k, v|
k.merge({:data => v})
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment