Created
August 7, 2013 14:55
-
-
Save 46bit/6174768 to your computer and use it in GitHub Desktop.
Scraping Charity Choice for a YRS UOS group :)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'httparty' | |
require 'nokogiri' | |
require 'json' | |
# Inclusive range of listing-page numbers the crawl requests; each number is
# interpolated into the listing URL by get_page.
# NOTE(review): 64 is hard-coded -- presumably the number of result pages when
# the script was written; confirm against the live site before re-running.
pages = 1..64
# Fetches one page of the England charity listing, restricted to charities
# that accept online donations.
#
# A progress line is written to stderr so that stdout stays reserved for the
# JSON the script prints at the end.
#
# @param page [Integer, String] page number, interpolated into the URL path
# @return [Object] whatever HTTParty.get returns; callers in this file read
#   its +body+
def get_page(page)
  $stderr.puts "get_page #{page}"
  HTTParty.get("http://www.charitychoice.co.uk/charities/england/#{page}?onlinedonations=1")
end
# Parses one listing page into an array of charity summary records.
#
# Every ".charity-search-results .result" element yields one hash. When an
# expected child element is absent, the fields derived from it are "".
#
# @param html [String] markup of a listing page
# @return [Array<Hash>] hashes with the keys :title, :location, :link and
#   :donate_link (links are prefixed with the site root)
# @raise [NoMethodError] re-raised unchanged, after the text of the offending
#   result element has been written to stderr
def extract_from_html(html)
  $stderr.puts "extract_from_html"
  site_root = "http://www.charitychoice.co.uk"
  doc = Nokogiri::HTML(html)

  doc.css(".charity-search-results .result").map do |charity_div|
    begin
      title_link = charity_div.css("h2 a")[0]
      town_county = charity_div.css(".town-county")[0]
      donate_btn = charity_div.css(".btn-action.btn-donate")[0]

      {
        title: title_link ? title_link.content.strip : "",
        location: town_county ? town_county.content.strip : "",
        link: title_link ? "#{site_root}#{title_link.attr("href")}" : "",
        donate_link: donate_btn ? "#{site_root}#{donate_btn.attr("href")}" : ""
      }
    rescue NoMethodError
      # Show which result element broke the parse, then let the original
      # exception propagate (a bare raise keeps its class and backtrace).
      $stderr.puts charity_div.content
      raise
    end
  end
end
# Extracts a charity's postal address from its detail page as a single
# comma-separated line, e.g.
# "Stroke Association House, 240 City Road, London, EC1V 2PR".
#
# @param html [String] markup of a charity detail page
# @return [String] the address parts joined with ", "; "" when the page has
#   no ".charity-postal-address span[itemprop=address]" element
def extract_from_specific_page(html)
  $stderr.puts "extract_from_specific_page"
  address_node = Nokogiri::HTML(html).css(".charity-postal-address span[itemprop=address]")[0]
  # Some pages may lack an address; report it as blank instead of aborting
  # the whole crawl, matching how extract_from_html treats missing elements.
  return "" if address_node.nil?

  # Re-reading the markup as binary drops every byte that is not plain ASCII.
  markup = address_node.inner_html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  # Line breaks separate address parts; all remaining tags are layout only.
  text = markup.gsub(/<br\s*\/?>/i, ", ").gsub(/<[^<>]+>/, "")

  # Tidy each part separately so that spaces inside a part survive while
  # padding, newlines and empty leading/trailing parts are discarded.
  parts = text.split(",").map { |part| part.gsub(/\s+/, " ").strip }
  parts.reject(&:empty?).join(", ")
end
# Phase 1: walk every listing page and gather the summary record of each
# charity into one flat array.
all_charities = pages.flat_map do |page|
  extract_from_html(get_page(page).body)
end

# Phase 2: visit each charity's own page and attach its postal address to the
# record in place. Progress goes to stderr so stdout carries only the JSON.
all_charities.each.with_index(1) do |charity, position|
  $stderr.puts("PULLING CHARITY #{position} OF #{all_charities.length}")
  $stderr.puts charity[:link]
  html = HTTParty.get(charity[:link]).body
  charity[:address] = extract_from_specific_page html
  $stderr.puts charity[:address]
end

# Emit the complete data set as a single JSON array on stdout.
puts all_charities.to_json
=begin
Worked example of the address clean-up in extract_from_specific_page.

Raw inner HTML of the address element:
<span itemprop="street-address">
Stroke Association House <br>240 City Road </span>
<br><span itemprop="locality">London</span> <br><span itemprop="postal-code">EC1V 2PR</span>

After gsub("<br>", ", "):
<span itemprop="street-address">
Stroke Association House , 240 City Road </span>
, <span itemprop="locality">London</span> , <span itemprop="postal-code">EC1V 2PR</span>

After gsub(/<[^<>]+>/, ""):
\n Stroke Association House , 240 City Road \n , London , EC1V 2PR

After gsub(" ", "") (intended to strip the padding spaces):
\nStroke Association House, 240 City Road\n, London, EC1V 2PR

Then gsub("\n", "") removes the remaining newlines.

The substitutions as they appear in the code:
address_html.gsub!("<br>", ", ")
address_html.gsub!(/<[^<>]+>/, "")
address_html.gsub!(" ", "")
address_html.gsub!("\n", "")
=end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment