public
Created

Create UK University Stats Database

  • Download Gist
create_db.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
#!/usr/bin/env ruby
#
# Copyright Keming Labs 2011
# But you can use it under the MIT license. Yay!
# See our visualization of these data here:
#
# http://keminglabs.com/ukuni/
#
# This script imports data from UCAS
#
# http://www.ucas.com/about_us/stat_services/stats_online/annual_datasets_to_download/
#
# specifically UK "Ethnicity / Social Class", 2003--2010.
# The data is provided as executable self-unzipping excel sheets, and
# this script normalizes and imports into SQLite, then spits out JSON.
# Requires JRuby and the following gems:
 
require 'rubygems'
require 'spreadsheet' #spreadsheet access library
require 'facets' #ruby core language extensions
require 'sequel' #SQL database library
require 'pp' #pretty printing
require 'json'
 
DB = Sequel.connect("jdbc:sqlite:///#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/ukuni.db")
 
class Array
def to_i
self.collect{|x| x.to_i}
end
end
 
if not DB.table_exists?(:ukuni)
DB.create_table :ukuni do
primary_key :id
 
String :age
String :ethnicity
String :subject
String :gender
String :year
Integer :applications
Integer :accepts
end
 
 
fields = [:age, :ethnicity, :subject, :male_apps, :female_apps, :male_accs, :female_accs]
 
%w(eth03.xls eth04.xls eth05.xls eth06.xls eth_v2_07.xls eth_v2_08.xls eth_v2_09.xls eth_v2_10.xls).each_with_index{|filename, i|
year = 2003 + i
excel_path = "#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/#{filename}"
sheet = Spreadsheet.open(excel_path).worksheets.first
 
DB.transaction do
#skip the first six (or seven, in the case of 2010) rows
sheet.each(filename == "eth_v2_10.xls" ? 6 : 5){|x|
next if x[0].nil?
h = Hash.zip fields, if filename == "eth_v2_10.xls" or filename == "eth_v2_09.xls"
x.values_at(0, 1, 3, 4, 5, 7, 8)
else
x.values_at(0, 1, 4, 5, 6, 8, 9)
end
 
shared = Hash[h.select{|k,v| [:age, :ethnicity, :subject].include? k}]
shared[:year] = year
#normalize "unknown" and "unknown, preferred not to say"
shared[:ethnicity] = "Unknown" if shared[:ethnicity].match /unknown/i
#The Chinese split off from the asians in 2004, wtf.
shared[:ethnicity] = "Asian" if shared[:ethnicity].match /Chinese/i
 
DB[:ukuni].insert shared.merge({ :gender => "male",
:applications => h[:male_apps],
:accepts => h[:male_accs]})
DB[:ukuni].insert shared.merge({ :gender => "female",
:applications => h[:female_apps],
:accepts => h[:female_accs] })
}
end
 
}
end
 
fields = [:age, :ethnicity, :gender, :subject, :year]
combined = Hash.autonew
DB["SELECT #{fields*', '}, SUM(applications) AS applied, SUM(accepts) AS accepted
FROM ukuni
GROUP BY #{fields*', '}"].each{|x|
#to save on bandwidth + clientside aggregation, we are going to
#give nested hashes by year
shared = Hash[x.select{|k,v| [:age, :ethnicity, :gender, :subject].include? k}]
combined[shared][x[:year]][:accepted] = x[:accepted]
combined[shared][x[:year]][:applied] = x[:applied]
}
 
open('data.json', 'w'){|f|
f.write JSON.generate combined.collect{|k, v|
k.merge({:data => v})
}
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.