# GitHub Gist ceritium/125773 by @ceritium, created June 8, 2009 11:56.
# (Gist page navigation text removed: "Skip to content", "Instantly share
# code, notes, and snippets.", "Show Gist options", "Save ceritium/125773 to
# your computer and use it in GitHub Desktop.")
require 'rubygems'
require 'hpricot'
require 'open-uri'
# Scrapes job listings from careerjet.com search-result pages.
#
# A spider is built for a location/keyword pair and a starting page number.
# #items downloads and parses the page the spider currently points at, and
# #next_page moves it to the following page.
#
# Relies on Hpricot for HTML parsing and on open-uri for the HTTP request.
class CareerjetSpider
  BASE = "http://www.careerjet.com/search/jobs"

  # Full English month name => three-letter abbreviation understood by Time.utc.
  MONTH = {
    'January' => 'Jan', 'February' => 'Feb', 'March' => 'Mar',
    'April' => 'Apr', 'May' => 'May', 'June' => 'Jun',
    'July' => 'Jul', 'August' => 'Aug', 'September' => 'Sep',
    'October' => 'Oct', 'November' => 'Nov', 'December' => 'Dec'
  }.freeze

  attr_reader :page

  # @param where [String, nil] value sent as the `l` query parameter
  # @param what [String, nil] value sent as the `s` query parameter
  # @param page [Integer] number of the first result page to fetch
  def initialize(where = nil, what = nil, page = 1)
    @where = where
    @what = what
    @page = page
  end

  # URL of the result page the spider currently points at. It is rebuilt on
  # every call so that it always reflects the current page number.
  #
  # @return [String]
  def url
    get_url(@where, @what)
  end

  # Advances the spider to the next result page.
  #
  # @return [Integer] the new page number
  def next_page
    @page += 1
  end

  # Builds a search URL for the given criteria and the current page number.
  # Values are form-encoded, so characters such as `&`, `=` or spaces inside
  # a search term cannot corrupt the query string.
  #
  # @param where [String, nil] value for the `l` parameter (nil becomes empty)
  # @param what [String, nil] value for the `s` parameter (nil becomes empty)
  # @return [String]
  def get_url(where = nil, what = nil)
    query = URI.encode_www_form([['s', what.to_s], ['l', where.to_s], ['p', @page.to_s]])
    "#{BASE}?#{query}"
  end

  # Downloads the current result page and extracts every job found in it.
  #
  # @return [Array<Hash>] one hash per `div` with class `job`
  #   (see #get_elements for the keys)
  def items
    # URI#open comes from open-uri; unlike Kernel#open with a String it keeps
    # working on Ruby 3.0+.
    doc = URI.parse(url).open { |f| Hpricot(f) }
    doc.search("//div[@class='job']").map { |job| get_elements(job) }
  end

  # Extracts the fields of a single job from its Hpricot element.
  #
  # NOTE(review): every '-' is removed from the location and date texts, which
  # also strips hyphens inside the value itself -- confirm this is intended.
  #
  # @param job [Hpricot::Elem] a `div` with class `job`
  # @return [Hash] with keys :location, :title, :hash, :company, :url_company,
  #   :date and :description; :hash is nil when the job has no title link
  def get_elements(job)
    title_links = job/"//a[@class='title_compact']"
    title_link = title_links.first
    href = title_link && title_link.attributes['href']
    # The site span carries the encoded value between single quotes.
    encoded_site = (job/"//span[@class='site']").inner_html[/'.*'/].to_s.delete("'")

    {
      :location => (job/"//span[@class='locations_compact']").inner_text.delete('-').strip,
      :title => title_links.inner_text.strip,
      :hash => href && href.gsub('/job/', '').gsub('.html', '').strip,
      :company => (job/"//span[@class='company_compact']").inner_text.strip,
      :url_company => get_url_company(encoded_site),
      :date => get_date((job/"//span[@class='date_compact']").inner_text.delete('-').strip),
      :description => (job/"//p[@class='advertise_compact']").inner_text.strip
    }
  end

  # Decodes an obfuscated string by shifting every byte down by two.
  #
  # Works on byte values explicitly: indexing a String yields an Integer only
  # on Ruby 1.8, so `str[i] - 2` raises NoMethodError on later versions.
  #
  # @param str [String] encoded text (presumably the company site URL)
  # @return [String] decoded text; empty when +str+ is empty
  def get_url_company(str)
    str.each_byte.map { |byte| (byte - 2).chr }.join
  end

  # Converts a "<Month name> <day>" string (e.g. "June 8") into a UTC time.
  #
  # NOTE(review): the year is always the current one, so a December posting
  # read in January ends up dated in the future.
  #
  # @param date [String] full English month name followed by the day number
  # @return [Time, nil] midnight UTC of that day in the current year, or nil
  #   when the month name is unknown or the day is not a number in 1..31
  def get_date(date)
    parts = date.to_s.split(' ')
    month = MONTH[parts.first]
    day = parts.last
    return nil unless month && day =~ /\A\d{1,2}\z/ && (1..31).include?(day.to_i)

    Time.utc(Time.now.year, month, day.to_i)
  end
end
# (Gist page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")