Created
June 17, 2011 17:39
-
-
Save lynaghk/1031879 to your computer and use it in GitHub Desktop.
Create UK University Stats Database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# Copyright Keming Labs 2011 | |
# But you can use it under the MIT license. Yay! | |
# See our visualization of these data here: | |
# | |
# http://keminglabs.com/ukuni/ | |
# | |
# This script imports data from UCAS | |
# | |
# http://www.ucas.com/about_us/stat_services/stats_online/annual_datasets_to_download/ | |
# | |
# specifically UK "Ethnicity / Social Class", 2003--2010. | |
# The data is provided as executable self-unzipping excel sheets, and | |
# this script normalizes and imports into SQLite, then spits out JSON. | |
# Requires JRuby and the following gems: | |
require 'rubygems' | |
require 'spreadsheet' #spreadsheet access library | |
require 'facets' #ruby core language extensions | |
require 'sequel' #SQL database library | |
require 'pp' #pretty printing | |
require 'json' | |
DB = Sequel.connect("jdbc:sqlite:///#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/ukuni.db") | |
class Array | |
def to_i | |
self.collect{|x| x.to_i} | |
end | |
end | |
if not DB.table_exists?(:ukuni) | |
DB.create_table :ukuni do | |
primary_key :id | |
String :age | |
String :ethnicity | |
String :subject | |
String :gender | |
String :year | |
Integer :applications | |
Integer :accepts | |
end | |
fields = [:age, :ethnicity, :subject, :male_apps, :female_apps, :male_accs, :female_accs] | |
%w(eth03.xls eth04.xls eth05.xls eth06.xls eth_v2_07.xls eth_v2_08.xls eth_v2_09.xls eth_v2_10.xls).each_with_index{|filename, i| | |
year = 2003 + i | |
excel_path = "#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/#{filename}" | |
sheet = Spreadsheet.open(excel_path).worksheets.first | |
DB.transaction do | |
#skip the first six (or seven, in the case of 2010) rows | |
sheet.each(filename == "eth_v2_10.xls" ? 6 : 5){|x| | |
next if x[0].nil? | |
h = Hash.zip fields, if filename == "eth_v2_10.xls" or filename == "eth_v2_09.xls" | |
x.values_at(0, 1, 3, 4, 5, 7, 8) | |
else | |
x.values_at(0, 1, 4, 5, 6, 8, 9) | |
end | |
shared = Hash[h.select{|k,v| [:age, :ethnicity, :subject].include? k}] | |
shared[:year] = year | |
#normalize "unknown" and "unknown, preferred not to say" | |
shared[:ethnicity] = "Unknown" if shared[:ethnicity].match /unknown/i | |
#The Chinese split off from the asians in 2004, wtf. | |
shared[:ethnicity] = "Asian" if shared[:ethnicity].match /Chinese/i | |
DB[:ukuni].insert shared.merge({ :gender => "male", | |
:applications => h[:male_apps], | |
:accepts => h[:male_accs]}) | |
DB[:ukuni].insert shared.merge({ :gender => "female", | |
:applications => h[:female_apps], | |
:accepts => h[:female_accs] }) | |
} | |
end | |
} | |
end | |
fields = [:age, :ethnicity, :gender, :subject, :year] | |
combined = Hash.autonew | |
DB["SELECT #{fields*', '}, SUM(applications) AS applied, SUM(accepts) AS accepted | |
FROM ukuni | |
GROUP BY #{fields*', '}"].each{|x| | |
#to save on bandwidth + clientside aggregation, we are going to | |
#give nested hashes by year | |
shared = Hash[x.select{|k,v| [:age, :ethnicity, :gender, :subject].include? k}] | |
combined[shared][x[:year]][:accepted] = x[:accepted] | |
combined[shared][x[:year]][:applied] = x[:applied] | |
} | |
open('data.json', 'w'){|f| | |
f.write JSON.generate combined.collect{|k, v| | |
k.merge({:data => v}) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment