Skip to content

Instantly share code, notes, and snippets.

@CountCulture
Created November 2, 2009 10:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CountCulture/224069 to your computer and use it in GitHub Desktop.
Save CountCulture/224069 to your computer and use it in GitHub Desktop.
require 'httpclient'
require 'nokogiri'
require 'yaml'
require 'fileutils'

# Crawls the dataset download pages under BaseUrl
# (neighbourhood.statistics.gov.uk) and records, for every dataset instance
# that offers a "DownloadData.zip" link, its URL, title and dataset family id.
# The collected list is written as YAML to db/ons_data/all_datasets.yml.
#
# Usage: OnsDatasetFinder.process
class OnsDatasetFinder
  BaseUrl = "http://neighbourhood.statistics.gov.uk/dissemination/"

  # Captures the text preceding a character in U+0080..U+00BF, i.e. the
  # characters whose UTF-8 encoding starts with byte 0xC2 (for example the
  # non-breaking space). This is the character-level equivalent of matching a
  # raw \xC2 byte, which is not a valid regexp literal in a UTF-8 source file.
  TITLE_LEAD_PATTERN = /(.+)[\u0080-\u00BF]/

  # Walks download home page -> dataset groups -> dataset families ->
  # instances, then saves everything found as a YAML file.
  #
  # Returns whatever File#write returns for the YAML payload.
  def self.process
    all_datasets = []
    home_page = "#{BaseUrl}Download1.do"
    dataset_group_paths = get_page(home_page)
                          .search('#mainContent3 li a[@href*="datasetList.do"]')
                          .map { |link| link[:href] }
                          .uniq

    dataset_group_paths.each do |expand_group_path|
      # Get page of list of dataset groupings. Radios without a value attribute
      # are skipped because they cannot be turned into a datasetFamilyId.
      dataset_group_ids = get_page(BaseUrl + expand_group_path)
                          .search("input[@type='radio']")
                          .map { |input| input[:value] }
                          .compact

      # For each group, get ids for datasets
      dataset_group_ids.each do |group_id|
        page = get_page(BaseUrl + "datasetList.do?JSAllowed=true&Function=&%24ph=60&CurrentPageId=60&step=1&CurrentTreeIndex=-1&Next.x=7&Next.y=9&searchString=&datasetFamilyId=" + group_id)
        page.search("div.floatRight input[@type='radio']:last-of-type").each do |instance_input_radio|
          all_datasets << instance_hash_from(instance_input_radio, group_id)
        end
      end
    end

    # instance_hash_from may return nested arrays, hence the flatten.
    # Saved as YAML file, but obviously could be saved as csv or put in db.
    path = output_path
    FileUtils.mkdir_p(File.dirname(path)) # don't lose the whole crawl to a missing directory
    File.open(path, "wb") { |file| file.write(all_datasets.flatten.to_yaml) }
  end

  # NOTE: the helpers below are internal. A bare `private` does not apply to
  # methods defined with `def self.`, so they have always been publicly
  # callable; they are left that way to stay backward-compatible.

  # Fetches +url+ and returns it as a parsed Nokogiri HTML document.
  #
  # If the response is the "Check Browser Settings" interstitial, the first
  # link on that page is followed and the resulting page is returned instead.
  # When the interstitial carries no usable link it is returned as fetched.
  def self.get_page(url)
    @client ||= HTTPClient.new
    puts "Fetching #{url}"
    doc = Nokogiri::HTML(@client.get_content(url))
    return doc unless doc.at("title[text()*='Check Browser Settings']")

    follow_link = doc.at('a')
    return doc unless follow_link && follow_link[:href]

    Nokogiri::HTML(@client.get_content(follow_link[:href]))
  end

  # Builds the result for one radio button on a dataset listing page.
  #
  # instance_radio_item - radio input node for the instance/fileset to follow
  # dataset_id          - dataset family id the radio belongs to
  # parent_radio_item   - radio that led to this page (needed for filesets)
  #
  # Returns a Hash (:url, :title, :dataset_id) when the target page has a
  # download link; otherwise recurses into the radios on the target page and
  # returns an Array of results (possibly nested, possibly empty).
  def self.instance_hash_from(instance_radio_item, dataset_id, parent_radio_item=nil)
    raw_title = instance_radio_item.parent.parent.inner_text
    url = url_from(instance_radio_item, dataset_id, parent_radio_item)
    page = get_page(url) # fetched once and reused for both branches below

    if download_link_in(page)
      instance = { :url => url,
                   :title => title_from(raw_title),
                   :dataset_id => dataset_id }
      puts "************** #{instance.inspect}"
      instance
    else
      # No download here: the page lists further choices (e.g. instances of
      # the dataset by year), so follow each of them.
      page.search("input[@type='radio']").map do |radio|
        instance_hash_from(radio, dataset_id, instance_radio_item)
      end
    end
  end

  # Fetches +url+ and returns the "DownloadData.zip" link node in the right
  # pane, or nil when the page has none.
  def self.download_link_on(url)
    download_link_in(get_page(url))
  end

  # Returns the "DownloadData.zip" link node of an already parsed page, or nil.
  def self.download_link_in(page)
    page.at("#rightPaneBox a[@href*='DownloadData.zip']")
  end

  # Extracts the display title from the raw text around a radio button: the
  # text preceding a TITLE_LEAD_PATTERN character on each line, joined
  # together. Falls back to the whole text when nothing matches. The result
  # is stripped of surrounding whitespace.
  def self.title_from(raw_title)
    leading_text = raw_title.scan(TITLE_LEAD_PATTERN).join
    (leading_text.empty? ? raw_title : leading_text).strip
  end

  # Builds the URL that selecting +radio_item+ would submit to.
  #
  # A radio named "filesetIndex" needs the instance radio that led to it
  # (+instance_radio_item+); any other radio is treated as an instance
  # selection and ignores +instance_radio_item+.
  #
  # Raises ArgumentError for a fileset radio without +instance_radio_item+.
  def self.url_from(radio_item, dataset_id, instance_radio_item=nil)
    if radio_item[:name] == "filesetIndex"
      if instance_radio_item.nil?
        raise ArgumentError, "instance_radio_item is required for a filesetIndex radio"
      end
      "#{BaseUrl}filesetSelection.do?Function=&%24ph=60_61&CurrentPageId=61&step=2&datasetFamilyId=#{dataset_id}&instanceSelection=#{instance_radio_item[:value]}&filesetIndex=#{radio_item[:value]}&Next.x=11&Next.y=7"
    else
      "#{BaseUrl}instanceSelection.do?Function=&%24ph=60_61&CurrentPageId=61&step=2&datasetFamilyId=#{dataset_id}&instanceSelection=#{radio_item[:value]}&Next.x=11&Next.y=7"
    end
  end

  # Location of the YAML output file. Uses the RAILS_ROOT constant when it is
  # defined and falls back to Rails.root otherwise.
  def self.output_path
    root = defined?(RAILS_ROOT) ? RAILS_ROOT : Rails.root.to_s
    File.join(root, "db/ons_data", "all_datasets.yml")
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment