Skip to content

Instantly share code, notes, and snippets.

@46bit
Created August 7, 2013 14:55
Show Gist options
  • Save 46bit/6174768 to your computer and use it in GitHub Desktop.
Save 46bit/6174768 to your computer and use it in GitHub Desktop.
Scraping Charity Choice for a YRS UOS group :)
require 'httparty'
require 'nokogiri'
require 'json'
# Inclusive range of listing page numbers to scrape; each value is passed to
# get_page, which interpolates it into the listing URL.
pages = 1..64
# Fetches one page of the England charity listing, requested with the
# `onlinedonations=1` query parameter, and logs the page number to stderr.
#
# @param page [#to_s] page number interpolated into the listing URL
# @return whatever HTTParty.get returns (the caller reads `.body` from it)
def get_page(page)
  $stderr.puts "get_page #{page}"
  listing_url = "http://www.charitychoice.co.uk/charities/england/#{page}?onlinedonations=1"
  HTTParty.get(listing_url)
end
# Parses one search-results page and summarises every charity listed on it.
#
# Each `.charity-search-results .result` element becomes one hash. Any
# sub-element that is absent from a result yields an empty string for the
# corresponding field rather than an error.
#
# @param html [String] raw HTML of a listing page
# @return [Array<Hash>] hashes with the keys :title, :location, :link and
#   :donate_link (links are made absolute by prefixing the site root)
# @raise [NoMethodError] re-raised unchanged after the offending result's
#   text has been written to stderr for debugging
def extract_from_html(html)
  $stderr.puts "extract_from_html"
  site_root = "http://www.charitychoice.co.uk"
  doc = Nokogiri::HTML(html)

  doc.css(".charity-search-results .result").map do |charity_div|
    begin
      title_link = charity_div.css("h2 a").first
      town_county = charity_div.css(".town-county").first
      donate_btn = charity_div.css(".btn-action.btn-donate").first

      {
        title: title_link.nil? ? "" : title_link.content.strip,
        location: town_county.nil? ? "" : town_county.content.strip,
        link: title_link.nil? ? "" : "#{site_root}#{title_link.attr("href")}",
        donate_link: donate_btn.nil? ? "" : "#{site_root}#{donate_btn.attr("href")}"
      }
    rescue NoMethodError
      # Dump the result that could not be parsed, then let the original
      # exception propagate (bare `raise` re-raises the current error).
      $stderr.puts charity_div.content
      raise
    end
  end
end
# Extracts a charity's postal address from its detail page as a single
# comma-separated line, e.g.
# "Stroke Association House, 240 City Road, London, EC1V 2PR".
#
# The address element's inner HTML is used (rather than its text) because the
# address lines are separated by <br> tags, which become the ", " separators.
#
# @param html [String] raw HTML of a charity detail page
# @return [String] the cleaned address, or "" when the page has no
#   `.charity-postal-address span[itemprop=address]` element
def extract_from_specific_page(html)
  $stderr.puts "extract_from_specific_page"
  doc = Nokogiri::HTML(html)
  address_node = doc.css(".charity-postal-address span[itemprop=address]").first
  return "" if address_node.nil?

  # Reinterpreting the bytes as binary and transcoding to UTF-8 drops every
  # non-ASCII byte, so the result is always a valid string.
  fragment = address_node.inner_html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  fragment = fragment.gsub("<br>", ", ")     # line breaks become separators
  fragment = fragment.gsub(/<[^<>]+>/, "")   # strip the remaining tags

  # Tidy each comma-separated part: collapse whitespace runs (including the
  # newlines and indentation from the markup) to single spaces, trim, and
  # drop empty parts so no leading, trailing or doubled ", " survives.
  parts = fragment.split(",").map { |part| part.gsub(/\s+/, " ").strip }
  parts.reject(&:empty?).join(", ")
end
# Stage 1: walk every listing page and gather the charity summaries.
all_charities = pages.flat_map do |page|
  extract_from_html(get_page(page).body)
end

# Stage 2: visit each charity's own page and attach its postal address to the
# summary hash in place. Progress is reported on stderr.
total = all_charities.length
all_charities.each.with_index(1) do |charity, position|
  $stderr.puts("PULLING CHARITY #{position} OF #{total}")
  $stderr.puts charity[:link]
  detail_html = HTTParty.get(charity[:link]).body
  charity[:address] = extract_from_specific_page(detail_html)
  $stderr.puts charity[:address]
end

# Stage 3: emit the complete dataset as JSON on stdout.
puts all_charities.to_json
=begin
Worked example of the address clean-up steps in extract_from_specific_page.

Input (inner HTML of the address element):
<span itemprop="street-address">
Stroke Association House <br>240 City Road </span>
<br><span itemprop="locality">London</span> <br><span itemprop="postal-code">EC1V 2PR</span>

After gsub("<br>", ", "):
<span itemprop="street-address">
Stroke Association House , 240 City Road </span>
, <span itemprop="locality">London</span>

After gsub(/<[^<>]+>/, ""):
\n Stroke Association House , 240 City Road \n , London , EC1V 2PR

After gsub(" ", ""):
\nStroke Association House, 240 City Road\n, London, EC1V 2PR

Finally gsub("\n", "") removes the remaining newlines.

The corresponding calls in the code:
address_html.gsub!("<br>", ", ")
address_html.gsub!(/<[^<>]+>/, "")
address_html.gsub!(" ", "")
address_html.gsub!("\n", "")
=end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment