lynaghk/create_db.rb

## create_db.rb
#!/usr/bin/env ruby
#
# Copyright Keming Labs 2011
# But you can use it under the MIT license. Yay!
# See our visualization of these data here:
#
#    http://keminglabs.com/ukuni/
#
# This script imports data from UCAS
#
#    http://www.ucas.com/about_us/stat_services/stats_online/annual_datasets_to_download/
#
# specifically UK "Ethnicity / Social Class", 2003--2010.
# The data is provided as executable self-unzipping excel sheets, and
# this script normalizes and imports into SQLite, then spits out JSON.
# Requires JRuby and the following gems:

require 'rubygems'
require 'spreadsheet' #spreadsheet access library
require 'facets' #ruby core language extensions
require 'sequel' #SQL database library
require 'pp' #pretty printing
require 'json'

DB = Sequel.connect("jdbc:sqlite:///#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/ukuni.db")

class Array
  def to_i
    self.collect{|x| x.to_i}
  end
end

if not DB.table_exists?(:ukuni)
  DB.create_table :ukuni do
    primary_key :id

    String      :age
    String      :ethnicity
    String      :subject
    String      :gender
    String      :year
    Integer     :applications
    Integer     :accepts
  end


  fields = [:age, :ethnicity, :subject, :male_apps, :female_apps, :male_accs, :female_accs]

  %w(eth03.xls eth04.xls eth05.xls eth06.xls eth_v2_07.xls eth_v2_08.xls eth_v2_09.xls eth_v2_10.xls).each_with_index{|filename, i|
    year = 2003 + i
    excel_path = "#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/#{filename}"
    sheet = Spreadsheet.open(excel_path).worksheets.first

    DB.transaction do
      #skip the first six (or seven, in the case of 2010) rows
      sheet.each(filename == "eth_v2_10.xls" ? 6 : 5){|x|
        next if x[0].nil?
        h = Hash.zip fields, if filename == "eth_v2_10.xls" or filename == "eth_v2_09.xls"
                               x.values_at(0, 1, 3, 4, 5, 7, 8)
                             else
                               x.values_at(0, 1, 4, 5, 6, 8, 9)
                             end

        shared = Hash[h.select{|k,v| [:age, :ethnicity, :subject].include? k}]
        shared[:year] = year
        #normalize "unknown" and "unknown, preferred not to say"
        shared[:ethnicity] = "Unknown" if shared[:ethnicity].match /unknown/i
        #The Chinese split off from the asians in 2004, wtf.
        shared[:ethnicity] = "Asian" if shared[:ethnicity].match /Chinese/i

        DB[:ukuni].insert shared.merge({ :gender => "male",
                                         :applications => h[:male_apps],
                                         :accepts => h[:male_accs]})
        DB[:ukuni].insert shared.merge({ :gender => "female",
                                         :applications => h[:female_apps],
                                         :accepts => h[:female_accs] })
      }
    end

  }
end

fields = [:age, :ethnicity, :gender, :subject, :year]
combined = Hash.autonew
DB["SELECT #{fields*', '}, SUM(applications) AS applied, SUM(accepts) AS accepted
    FROM ukuni
    GROUP BY #{fields*', '}"].each{|x|
  #to save on bandwidth + clientside aggregation, we are going to
  #give nested hashes by year
  shared = Hash[x.select{|k,v| [:age, :ethnicity, :gender, :subject].include? k}]
  combined[shared][x[:year]][:accepted] = x[:accepted]
  combined[shared][x[:year]][:applied] = x[:applied]
}


open('data.json', 'w'){|f|
  f.write JSON.generate combined.collect{|k, v|
    k.merge({:data => v})
  }
}
	#!/usr/bin/env ruby
	#
	# Copyright Keming Labs 2011
	# But you can use it under the MIT license. Yay!
	# See our visualization of these data here:
	#
	# http://keminglabs.com/ukuni/
	#
	# This script imports data from UCAS
	#
	# http://www.ucas.com/about_us/stat_services/stats_online/annual_datasets_to_download/
	#
	# specifically UK "Ethnicity / Social Class", 2003--2010.
	# The data is provided as executable self-unzipping excel sheets, and
	# this script normalizes and imports into SQLite, then spits out JSON.
	# Requires JRuby and the following gems:

	require 'rubygems'
	require 'spreadsheet' #spreadsheet access library
	require 'facets' #ruby core language extensions
	require 'sequel' #SQL database library
	require 'pp' #pretty printing
	require 'json'

	DB = Sequel.connect("jdbc:sqlite:///#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/ukuni.db")

	class Array
	def to_i
	self.collect{\|x\| x.to_i}
	end
	end

	if not DB.table_exists?(:ukuni)
	DB.create_table :ukuni do
	primary_key :id

	String :age
	String :ethnicity
	String :subject
	String :gender
	String :year
	Integer :applications
	Integer :accepts
	end


	fields = [:age, :ethnicity, :subject, :male_apps, :female_apps, :male_accs, :female_accs]

	%w(eth03.xls eth04.xls eth05.xls eth06.xls eth_v2_07.xls eth_v2_08.xls eth_v2_09.xls eth_v2_10.xls).each_with_index{\|filename, i\|
	year = 2003 + i
	excel_path = "#{File.expand_path(File.dirname(__FILE__))}/../../resources/data/#{filename}"
	sheet = Spreadsheet.open(excel_path).worksheets.first

	DB.transaction do
	#skip the first six (or seven, in the case of 2010) rows
	sheet.each(filename == "eth_v2_10.xls" ? 6 : 5){\|x\|
	next if x[0].nil?
	h = Hash.zip fields, if filename == "eth_v2_10.xls" or filename == "eth_v2_09.xls"
	x.values_at(0, 1, 3, 4, 5, 7, 8)
	else
	x.values_at(0, 1, 4, 5, 6, 8, 9)
	end

	shared = Hash[h.select{\|k,v\| [:age, :ethnicity, :subject].include? k}]
	shared[:year] = year
	#normalize "unknown" and "unknown, preferred not to say"
	shared[:ethnicity] = "Unknown" if shared[:ethnicity].match /unknown/i
	#The Chinese split off from the asians in 2004, wtf.
	shared[:ethnicity] = "Asian" if shared[:ethnicity].match /Chinese/i

	DB[:ukuni].insert shared.merge({ :gender => "male",
	:applications => h[:male_apps],
	:accepts => h[:male_accs]})
	DB[:ukuni].insert shared.merge({ :gender => "female",
	:applications => h[:female_apps],
	:accepts => h[:female_accs] })
	}
	end

	}
	end

	fields = [:age, :ethnicity, :gender, :subject, :year]
	combined = Hash.autonew
	DB["SELECT #{fields*', '}, SUM(applications) AS applied, SUM(accepts) AS accepted
	FROM ukuni
	GROUP BY #{fields*', '}"].each{\|x\|
	#to save on bandwidth + clientside aggregation, we are going to
	#give nested hashes by year
	shared = Hash[x.select{\|k,v\| [:age, :ethnicity, :gender, :subject].include? k}]
	combined[shared][x[:year]][:accepted] = x[:accepted]
	combined[shared][x[:year]][:applied] = x[:applied]
	}


	open('data.json', 'w'){\|f\|
	f.write JSON.generate combined.collect{\|k, v\|
	k.merge({:data => v})
	}
	}