Skip to content

Instantly share code, notes, and snippets.

@arjunvenkat
Created December 12, 2012 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arjunvenkat/4270598 to your computer and use it in GitHub Desktop.
Save arjunvenkat/4270598 to your computer and use it in GitHub Desktop.
Scraper that saves apprenticeship info from the Department of Labor site
# Pure helpers for the apprenticeships:scrape task. They build listing URLs
# and turn the text of scraped table cells into CSV rows, so this logic does
# not depend on the network, Mechanize or Rails.
module ApprenticeshipScraper
  # Number of listings requested per page (the MaxRows query parameter).
  PAGE_SIZE = 20

  # Table cells consumed per sponsor on a detail page. Only the first four are
  # exported; the fifth cell of every group is skipped.
  CELLS_PER_SPONSOR = 5

  CSV_HEADER = ["Occupation", "Sponsor Name", "Sponsor Address", "Sponsor City", "Sponsor State", "Original Page"].freeze

  module_function

  # Builds the listing URL for one page of results.
  #
  # @param state [String] two-letter state code placed in the `state` parameter
  # @param start_row [Integer] 1-based index of the first listing on the page
  # @return [String]
  def listing_url(state, start_row)
    "http://oa.doleta.gov/bat.cfm?startrow=#{start_row}&curpage=1&MaxRows=#{PAGE_SIZE}&state=#{state}&county=all&sel=all"
  end

  # Collapses every run of whitespace to a single space and trims the ends, so
  # multi-word values such as "Rock Springs" are kept whole.
  #
  # @param text [String, nil] raw cell text
  # @return [String, nil] the cleaned text, or nil when nothing is left
  def clean_cell(text)
    cleaned = text.to_s.gsub(/[[:space:]]+/, ' ').strip
    cleaned.empty? ? nil : cleaned
  end

  # Groups the cell texts of a detail page into one CSV row per sponsor.
  #
  # Each group of CELLS_PER_SPONSOR cells becomes
  # [occupation, name, address, city, state, url]. Name and address are kept
  # verbatim; city and state go through clean_cell. A trailing partial group
  # yields a shorter row, and a page without cells yields a single
  # [occupation, url] row.
  #
  # @param occupation [String] occupation title shown on the detail page
  # @param cell_texts [Array<String>] text of every table cell, in page order
  # @param url [String] value written to the "Original Page" column
  # @return [Array<Array>] rows ready to append to the CSV
  def sponsor_rows(occupation, cell_texts, url)
    return [[occupation, url]] if cell_texts.empty?

    cell_texts.each_slice(CELLS_PER_SPONSOR).map do |cells|
      name_and_address = cells.first(2)
      # cells[2, 2] is nil when the group has fewer than two cells.
      city_and_state = Array(cells[2, 2]).map { |text| clean_cell(text) }
      [occupation, *name_and_address, *city_and_state, url]
    end
  end
end

namespace :apprenticeships do
  desc "scrapes DOL site for apprenticeship data by state"
  task :scrape => :environment do
    require 'mechanize'
    require 'open-uri'
    require 'csv'
    require 'fileutils'

    # Full list, for a complete run:
    # states = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
    states = ["WY"]

    states.each do |state|
      puts "State: #{state}"
      apprenticeships_in_state = []
      agent = Mechanize.new

      # The first listing page carries the total number of results for the state.
      first_page = agent.get(ApprenticeshipScraper.listing_url(state, 1))
      num_apprenticeships_in_state = first_page.search('span.boldred strong').text.to_i
      puts "Apprenticeships in state: #{num_apprenticeships_in_state}"

      1.step(num_apprenticeships_in_state, ApprenticeshipScraper::PAGE_SIZE) do |start_row|
        url = ApprenticeshipScraper.listing_url(state, start_row)
        puts "Current page: #{url}"
        listing = agent.get(url)

        # The first form is the search form at the top of the page. The last
        # form is skipped as well, as in the original loop bound.
        # NOTE(review): confirm the last form is not an apprenticeship entry.
        detail_forms = Array(listing.forms[1...-1])

        detail_forms.each do |form|
          detail = agent.submit(form)
          occupation = detail.search('div#content p.boldred strong').text.strip
          cell_texts = detail.search('table tr td').map(&:text)
          apprenticeships_in_state.concat(
            ApprenticeshipScraper.sponsor_rows(occupation, cell_texts, url)
          )
          puts "apprenticeship stored"
        end

        # Clamp so the last, partial page does not report more than the total.
        stored = [start_row + ApprenticeshipScraper::PAGE_SIZE - 1, num_apprenticeships_in_state].min
        puts "#{stored} of #{num_apprenticeships_in_state} apprenticeships stored for #{state}"
      end

      puts "Completed apprenticeships for #{state}"

      csv_path = Rails.root + "doc/appren_files/#{state}_apprenticeships.csv"
      # Create the output directory on first run instead of failing in CSV.open.
      FileUtils.mkdir_p(csv_path.dirname)
      CSV.open(csv_path, "wb") do |csv|
        csv << ApprenticeshipScraper::CSV_HEADER
        apprenticeships_in_state.each { |apprenticeship| csv << apprenticeship }
      end
    end

    puts "finished scraping apprenticeships"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment