Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
FCC Comment Scraper
require 'mechanize'
require 'csv'
require 'net/http'
# FCC ECFS comment scraper.
#
# Starting at `last_id`, requests each comment page in turn
# (`<site>/ecfs/comment/view?id=<id>`), reads the label/value pairs held in
# the page's `.wwgrp` elements and, for filings whose proceeding number is
# 14-28, appends one row to fcc.csv and saves the linked filing as
# downloads/<doc_id>.pdf.
#
# The loop has no end condition: it keeps incrementing the id until the
# process is interrupted (e.g. Ctrl-C). Per-record failures are printed and
# skipped so that one bad page does not stop the crawl.
count = 1
site = "http://apps.fcc.gov"
last_id = 6018081653
download_dir = 'downloads'
agent = Mechanize.new

# Labels (first <span> of a group) whose values are copied into the CSV row.
allowed_fields = [
  'Proceeding Number:',
  'Name of Filer:',
  'View Filing:',
  'Type of Filing:',
  'Exparte:',
  'Date Received:',
  'Date Posted:',
  'Address:'
].freeze

# Without this directory every download would raise and no row would ever be
# written for a matching filing.
Dir.mkdir(download_dir) unless Dir.exist?(download_dir)

CSV.open("fcc.csv", "ab") do |csv|
  loop do
    row = []
    begin
      page = agent.get("#{site}/ecfs/comment/view?id=#{last_id}")
      page.search(".wwgrp").each do |grp|
        parts = grp.search('span')
        # A group without any <span> carries no label; skip it instead of
        # failing the whole record.
        next if parts.empty?

        key = parts[0].text.strip
        value = parts[1]
        next unless allowed_fields.include?(key)

        case key
        when 'Proceeding Number:'
          unless value.text.strip == '14-28'
            # Not the proceeding we want: discard anything collected so far so
            # that no partial row is written, then stop reading this page.
            row.clear
            break
          end
        when 'View Filing:'
          href = value.search('a')[0][:href]
          doc_id = href.split('id=').last
          local_filename = File.join(download_dir, "#{doc_id}.pdf")
          # Binary write: the payload is a PDF, so no newline/encoding
          # translation may be applied.
          File.binwrite(local_filename, Net::HTTP.get(URI.parse("#{site}#{href}")))
          row << doc_id
        when 'Address:'
          lines = value.search('div').inner_html.split('<br>').map(&:strip).reject(&:empty?)
          # Two-line addresses get an empty second column so the following
          # address column lines up with three-line addresses.
          lines.insert 1, "" if lines.length == 2
          row.concat lines
        else
          row << value.text.strip
        end
      end

      unless row.empty?
        csv << row.unshift(last_id)
        # Flush after every row so an interrupted run keeps what it collected.
        csv.flush
        puts "#{count}: #{row.join(',')}"
        count += 1
      end
    rescue StandardError => e
      # StandardError only: Interrupt/SystemExit must propagate so the
      # otherwise endless loop can be stopped.
      puts "Error #{last_id}: #{e}"
    ensure
      last_id += 1
      # Small pause between requests to avoid hammering the server.
      sleep 0.3
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.