Skip to content

Instantly share code, notes, and snippets.

@yaanno
Created November 22, 2008 17:50
Show Gist options
  • Save yaanno/27890 to your computer and use it in GitHub Desktop.
Save yaanno/27890 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'mysql'
require 'sequel'
require 'timeout'
# Scrapes profile pages listed in a sitemap file and stores selected fields
# of each page in the +people+ table.
#
# Loading this class opens the database connection (see DB), and
# Scraper.new starts working immediately (see #initialize).
class Scraper
  # Remote sitemap whose <sitemap>/<loc> entries are read by #get_subsitemaps.
  SITEMAP = "http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml"
  # Local file written by #process_subsitemaps and read line by line, one URL
  # per line, by #get_profile_pages.
  SITEMAPFILE = "sitemap_merged.txt"
  # NOTE(review): credentials are hard-coded and the connection is opened as a
  # side effect of loading the class; consider moving both to configuration.
  DB = Sequel.connect("mysql://root:root@localhost/google-profiles")
  DATASET = DB[:people]
  # Upper bound, in seconds, for fetching and parsing a single document.
  FETCH_TIMEOUT = 10

  # Processes the profile pages when a non-empty SITEMAPFILE is present,
  # otherwise downloads the main sitemap.
  def initialize
    puts "Init Scraper"
    # File.size? returns nil for a missing *or* empty file, so an empty
    # leftover file is not mistaken for a usable sitemap.
    if File.size?(SITEMAPFILE)
      puts "Already have sitemapfile"
      get_profile_pages
    else
      puts "Updating main sitemap"
      # NOTE(review): only the main sitemap is refreshed here. Nothing in this
      # class calls #get_subsitemaps or #process_subsitemaps, so this path
      # never produces SITEMAPFILE -- confirm whether they should be chained.
      update_sitemap
    end
  end

  # Downloads SITEMAP into @sitemap and saves a copy to "sitemap.txt".
  # The file is only (re)written when the download succeeded.
  def update_sitemap
    @sitemap = get_document_or_404(SITEMAP)
    return if @sitemap.nil?

    # Block form closes (and therefore flushes) the file.
    File.open("sitemap.txt", "w") { |file| file << @sitemap }
    puts "Main sitemap saved"
  end

  # Collects the <loc> text of every <sitemap> element of @sitemap into
  # @subsitemap. Leaves @subsitemap empty when no sitemap has been loaded.
  def get_subsitemaps
    @subsitemap = []
    if @sitemap.nil?
      puts "No sitemap loaded, no subsitemap links retrieved"
      return
    end

    (@sitemap/:sitemap).each do |entry|
      @subsitemap << (entry/:loc).inner_html.strip
    end
    puts "Subsitemap links retrieved"
  end

  # Downloads every URL in @subsitemap and writes the retrieved documents,
  # one after another, to SITEMAPFILE. Failed downloads are skipped.
  def process_subsitemaps
    @sitemap_merged = []
    Array(@subsitemap).each do |subsitemap|
      puts "Retrieving subsitemap: " + subsitemap
      map = get_document_or_404(subsitemap)
      # Test for nil before converting: nil.to_s is "", so a check made after
      # the conversion could never skip a failed download.
      next if map.nil?

      @sitemap_merged << map.to_s
    end

    File.open(SITEMAPFILE, "w") do |file|
      # Write the contents themselves (not Array#inspect output); puts makes
      # sure each document ends with a newline so entries cannot run together.
      @sitemap_merged.each { |map| file.puts(map) }
    end
    puts "Merged sitemap saved"
  end

  # Fetches each URL listed in SITEMAPFILE (one per line) and hands every
  # successfully retrieved page to #process_profile.
  def get_profile_pages
    File.foreach(SITEMAPFILE) do |line|
      # Lines carry their trailing newline; strip it so the URL is usable and
      # is stored cleanly. Blank lines are ignored.
      profile_page = line.strip
      next if profile_page.empty?

      @page = get_document_or_404(profile_page)
      process_profile(@page, profile_page) unless @page.nil?
    end
  end

  # Builds the profile record for one page and saves it.
  #
  # @param page parsed document of the profile page
  # @param url  address the page was fetched from; stored in the record
  def process_profile(page, url)
    @profile = {
      :name => (page/"div.name_header").inner_html,
      :url => url,
      :position => (page/"span.title").inner_html,
      :school => (page/"span.school").inner_html,
      :organization => (page/"span.org").inner_html,
      :address => (page/"span.adr").inner_html,
      :reader_url => "",
      :feed_url => "",
    }
    # Pass the page explicitly so the shared-page lookup always inspects the
    # same document the record was built from.
    process_shared_page(page)
    save_or_update_profile(@profile)
  end

  # Follows the page's "reader/shared" link, if any, and fills in
  # @profile[:reader_url] and @profile[:feed_url].
  #
  # @param page parsed profile page; defaults to @page so that existing
  #   argument-less calls keep working
  def process_shared_page(page = @page)
    return if page.nil?

    link = page.at("a[@href*='reader/shared']")
    return if link.nil?

    shared_page_link = link[:href]
    shared_page = get_document_or_404(shared_page_link)
    return if shared_page.nil?

    @profile[:reader_url] = shared_page_link
    # NOTE(review): the feed link is only looked up when the
    # "#shared-empty-message" element IS present. If that element marks a
    # shared page without items, this condition may be inverted -- confirm.
    return if shared_page.at("#shared-empty-message").nil?

    # The feed link may be missing; leave :feed_url empty instead of raising.
    feed_link = shared_page.at("a[@href*='reader/public']")
    @profile[:feed_url] = feed_link[:href] unless feed_link.nil?
  end

  # Fetches and parses a remote document.
  #
  # The address is opened through URI#open rather than Kernel#open, so only
  # URI schemes that open-uri supports are fetched; strings that are not such
  # URLs (local paths, "|command") are reported as failures, not opened.
  #
  # @param url address to fetch; converted with to_s and stripped
  # @return the parsed document, or nil on timeout or any other failure
  def get_document_or_404(url)
    target = url.to_s.strip
    Timeout.timeout(FETCH_TIMEOUT) do
      Hpricot(URI.parse(target).open)
    end
  rescue Timeout::Error
    puts "Request timed out, skipping"
    nil
  rescue StandardError => e
    # Not every failure is a 404, so include the actual cause in the output.
    puts "Document not found (404): #{target} [#{e.class}: #{e.message}]"
    nil
  end

  # Inserts the profile into DATASET. Any Sequel::DatabaseError is treated as
  # "already stored" and skipped; no update is performed.
  #
  # @param profile [Hash] column => value pairs to insert
  def save_or_update_profile(profile)
    DATASET << profile
    puts "New profile saved"
  rescue Sequel::DatabaseError => e
    # Show the database message, since the error is not necessarily a duplicate.
    puts "Profile exists, skipping (#{e.message})"
  end
end
# Script entry point: constructing a Scraper runs the work defined in
# Scraper#initialize.
scraper = Scraper.new
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment