Created
January 1, 2009 04:47
-
-
Save robmckinnon/42184 to your computer and use it in GitHub Desktop.
grabs soulmate profiles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'; require 'pottery'; require 'mechanize'; require 'hpricot'; require 'open-uri'; require 'active_support' | |
# Scrapes dating profiles from dating.guardian.co.uk and persists them.
#
# Persistence and dynamic attributes (+restore+, +save+, +morph+) are not
# defined in this file; they are presumably supplied by the Pottery mixin --
# confirm against that library. +Soup+ is likewise an external dependency.
class SoulMate
  include Pottery

  class << self
    # Searches the site for profiles of the given age near the given postcode
    # and returns one SoulMate per result.
    #
    # Profiles already stored are restored by id; the rest are downloaded and
    # saved. A profile that fails to download or parse is reported on stdout
    # and left out of the result.
    #
    # @param age [#to_s] used as both the minimum and maximum age of the search
    # @param postcode [String] postcode the search is centred on
    # @return [Array<SoulMate>]
    def find(age = '29', postcode = 'N1')
      find_ids(age.to_s, postcode).collect do |id|
        restore(id.to_s) || fetch_and_save(id)
      end.compact
    end

    # @return [Array] stored profiles whose gender attribute is 'Female'
    def women
      saved_by_gender 'Female'
    end

    # @return [Array] stored profiles whose gender attribute is 'Male'
    def men
      saved_by_gender 'Male'
    end

    private

    # Downloads, parses and saves the profile with the given id.
    # Returns nil (after printing the error) when anything goes wrong, so the
    # caller can drop the entry instead of receiving a half-populated object.
    # Only StandardError is rescued: Interrupt/SystemExit must still be able
    # to stop a long crawl.
    def fetch_and_save(id)
      soul_mate = SoulMate.new
      soul_mate.load_data id
      soul_mate.save
      soul_mate
    rescue StandardError => e
      puts e.to_s
      nil
    end

    # Restores every stored snip whose gender equals +gender+.
    def saved_by_gender(gender)
      Soup.sieve(:gender, "= '#{gender}'").collect { |snip| restore(snip.name) }
    end

    # Mechanize dropped the WWW:: namespace in later releases; use whichever
    # constant the installed version provides.
    def mechanize_class
      defined?(::Mechanize) ? ::Mechanize : ::WWW::Mechanize
    end

    # Submits the site's search form and walks the result pages.
    #
    # @return [Array<String>] the profile ids found, sorted as strings
    def find_ids(age, postcode)
      ids = {}
      mechanize_class.new.get('http://dating.guardian.co.uk/s/') do |page|
        form = page.form_with(:action => '/s/find/search.php') do |f|
          f.gender    = '2'
          f.mGender   = '1'
          f.mAgeMin   = age
          f.mAgeMax   = age
          f.mPostCode = postcode
          f.mRangeID  = '1'
        end
        handle_links ids, form.click_button.links
      end
      ids.keys.sort
    end

    # Records every profile link in +links+ into +ids+ (profile id => href)
    # and recurses into the first "Next" link of the page, if there is one.
    def handle_links(ids, links)
      followed_next = false
      links.each do |link|
        # href is nil for anchors without an href attribute
        uri = link.href.to_s.strip
        profile_id = uri[%r{/s/view/(\d+)/s/(\d+)}, 1]
        if profile_id
          ids[profile_id] = uri
        elsif !followed_next && link.text.to_s[/Next/i]
          # a page may show several "Next" links; follow only one of them
          handle_links ids, link.click.links
          followed_next = true
        end
      end
    end
  end

  # @return [Integer] centimetres found in the +height+ attribute, 0 if none
  def height_cm
    centimetres height
  end

  # @return [Integer] centimetres found in the +height_match+ attribute,
  #   0 if none
  def height_match_cm
    centimetres height_match
  end

  # Downloads the profile page for +id+ and copies its fields onto this
  # object via +morph+.
  #
  # Parse failures are announced on stdout and re-raised. Download failures
  # propagate without the announcement.
  #
  # @param id [#to_s] profile id as used in the site's /s/view/ URLs
  def load_data(id)
    # URI#open (from open-uri) works on old and current Rubies alike; the
    # block form closes the stream once Hpricot has parsed it.
    url = "http://dating.guardian.co.uk/s/view/#{id}"
    doc = URI.parse(url).open { |io| Hpricot(io) }
    begin
      load_profile doc
      morph(:id_name, id)
    rescue StandardError
      puts "trouble loading: #{id}"
      raise
    end
  end

  private

  # Extracts the number preceding "cm" from +text+; 0 when +text+ is nil or
  # holds no such figure.
  def centimetres(text)
    text ? text[/(\d+)cm/, 1].to_i : 0
  end

  # String form of +text+ with surrounding whitespace removed.
  def s(text)
    text.to_s.strip
  end

  # Normalises a table label: trimmed, commas and apostrophes removed, and a
  # trailing colon dropped.
  def clean_label(node)
    node.to_s.strip.delete(",'").chomp(':')
  end

  # Reads the headline fields and the three detail tables of a profile page.
  # Raises (NoMethodError on nil) when an expected element is missing.
  def load_profile(doc)
    info = doc.at('#profileInfo')
    # NOTE(review): the '?' chomped from last_modify presumably stands in for
    # a non-ASCII character lost in transcoding -- confirm against the
    # original file.
    morph :name        => s(info.at('h2/text()')),
          :gender      => s(info.at('h2/img')['title']),
          :headline    => s(info.at('#headline/text()')),
          :last_modify => info.at('#lastModify/span/text()').to_s.chomp('?').strip,
          :last_login  => s(info.at('#lastLogin/span/text()')),
          :about_self  => s(doc.at('#selfBox/p')),
          :about_match => s(doc.at('#matchBox/p')),
          :image       => doc.at('#primaryPhoto/a/img')['src']
    load_table doc, '#aboutTable'
    load_table doc, '#match', 'match'
    load_table doc, '#depth'
  end

  # Morphs each label/value pair of the table selected by +id+ into an
  # attribute named "<label> <suffix>". For the '#match' table, rows that
  # carry a <span> additionally yield a "<label>_importance" attribute.
  def load_table(doc, id, suffix = '')
    table = doc.at(id)
    # td text nodes alternate label, value; a trailing unpaired label gets
    # an empty value because nil.to_s is "".
    (table / 'td/text()').each_slice(2) do |label_node, value_node|
      # NOTE(review): '?' is presumably a mangled non-ASCII character, as in
      # load_profile -- confirm.
      value = value_node.to_s.gsub('?', ' ').strip
      morph("#{clean_label(label_node)} #{suffix}", value)
    end
    return unless id == '#match'

    (table / 'tr').select { |row| row.at('td/span') }.each do |row|
      label = clean_label(row.at('td/text()'))
      value = row.at('td/span/text()').to_s.delete('()')
      morph("#{label}_importance", value)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment