Skip to content

Instantly share code, notes, and snippets.

@bycoffe
Created August 3, 2012 19:30
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bycoffe/3250702 to your computer and use it in GitHub Desktop.
Save bycoffe/3250702 to your computer and use it in GitHub Desktop.
A script for scraping the FCC's website and finding political file submissions
require 'open-uri'
require 'nokogiri'
# Crawls the political-files browse pages of one station on the FCC public
# file site (stations.fcc.gov) and reports every PDF URL found on them.
#
# Pages are discovered recursively: each fetched page is scanned for further
# "political-files/browse" links belonging to the same call sign. Every page
# and every PDF is reported at most once per crawler instance.
class PublicFileCrawler
  # Seconds to pause after scanning a page when no :delay is supplied.
  DEFAULT_DELAY = 0.5

  # Matches absolute URLs of PDF documents inside a page's markup.
  PDF_PATTERN = %r{https://stations\.fcc\.gov/collect/files.*?\.pdf}

  # @param params [Hash] crawler options
  # @option params [String] :call_sign call sign as used in the station
  #   profile URL (e.g. "wcpo-tv")
  # @option params [Numeric] :delay seconds to sleep after each scanned page
  #   (defaults to DEFAULT_DELAY; 0 disables the pause)
  def initialize(params = {})
    @call_sign = params[:call_sign]
    @delay = params.fetch(:delay, DEFAULT_DELAY).to_f
    @url = "https://stations.fcc.gov/station-profile/#{@call_sign}/political-files/browse-%3e2012"
    # Host dots and the call sign are escaped so they only match literally.
    # The capture ends just before the closing double quote of the attribute.
    @link_pattern = %r{(https://stations\.fcc\.gov/station-profile/#{Regexp.escape(@call_sign.to_s)}/political-files/browse.*?)"}
    @checked = {} # normalized page URL => true once visited
    @found = {}   # PDF URL => true once reported
  end

  # Fetches start_url, then visits every not-yet-seen browse link on it:
  # PDFs on the linked page are yielded first, then the page's own links are
  # followed depth-first.
  #
  # NOTE: each newly discovered page is downloaded twice, once by find_pdfs
  # and once by the recursive find_links call.
  #
  # @param start_url [String] page whose links should be followed
  # @yieldparam match [Hash] see #find_pdfs
  def find_links(start_url, &block)
    fetch(start_url).scan(@link_pattern).flatten.each do |link|
      # Normalize so the same page is recognized regardless of letter case or
      # of whether ">" arrived percent-encoded.
      url = link.downcase.gsub('>', '%3e')
      next if @checked.key?(url)

      @checked[url] = true
      find_pdfs(url, &block)
      sleep(@delay) if @delay.positive?
      find_links(url, &block)
    end
  end

  # Fetches url and yields one hash per PDF that has not been reported before.
  #
  # @param url [String] page to scan
  # @yieldparam match [Hash] :pdf_url (String) and :hierarchy (Array<String>,
  #   the text of the page's ".browser-path a" elements)
  def find_pdfs(url)
    page = fetch(url)
    hierarchy = Nokogiri::HTML(page).css('.browser-path a').map(&:content)
    page.scan(PDF_PATTERN).each do |pdf|
      next if @found.key?(pdf)

      @found[pdf] = true
      yield(:pdf_url => pdf, :hierarchy => hierarchy)
    end
  end

  # Crawls from the station's 2012 political-files page.
  #
  # @yieldparam match [Hash] see #find_pdfs
  def crawl(&block)
    find_links(@url, &block)
  end

  private

  # Downloads url and returns the response body as a String.
  #
  # Uses URI.open because open-uri no longer patches Kernel#open for URLs on
  # Ruby 3.0+. The block form closes the underlying IO once the body is read.
  def fetch(url)
    URI.open(url, &:read)
  end
end
require 'csv'

# Command-line entry point: the last argument is the station call sign.
# Without one, print usage and stop; otherwise stream results as CSV to STDOUT.
call_sign = ARGV.last
if call_sign.nil?
  puts <<~USAGE
    Usage: ruby public_file_crawler.rb call_sign
    e.g.: ruby public_file_crawler.rb wcpo-tv
  USAGE
  exit
end

# One writer is shared by the header row and every data row.
csv_out = CSV.new(STDOUT)
csv_out << %w[pdf_url hierarchy]

# Each row is the PDF URL followed by one column per hierarchy entry.
PublicFileCrawler.new(:call_sign => call_sign).crawl do |hit|
  csv_out << hit.values_at(:pdf_url, :hierarchy).flatten
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment