# Source: GitHub Gist OpenCoderX/7085323
# Author: @OpenCoderX
# Created: October 21, 2013 14:59
require 'mechanize'
# Path of the input file: one site URL per line.
URL_FILE = 'qc_urls'.freeze
# Path of the output file: crawl results are appended to it.
OUTPUT_FILE = 'qc_urls_out'.freeze
# Reads a list of site URLs from an input file, fetches the Quantcast page
# of each one and appends a '^'-delimited summary line per site to an
# output file.
class QuantcastCrawler
  # Fetches the Quantcast page for one site and condenses it into #summary.
  class QuantcastURL
    # Value written to every data column when the page request fails.
    ERROR_MARKER = 'ERROR'.freeze

    # Matches the URL scheme prefix or any whitespace. The original pattern
    # only matched "http:\" (colon + backslash), so "http://" was never
    # stripped; the backslash form is still accepted for compatibility.
    SCHEME_OR_WHITESPACE = %r{https?:(?://|\\)|\s}

    # Six-element array built by #process:
    # [url, child cell 0, child cell 1, rank, parent cell 0, parent cell 1].
    # nil until #crawl or #process has run.
    attr_reader :summary

    # Mechanize client shared by every instance, created on first use
    # (instead of at class-load time in a class variable).
    #
    # @return [Mechanize]
    def self.agent
      @agent ||= Mechanize.new { |me| me.user_agent_alias = 'Mac Safari' }
    end

    # @param url [String] site URL as read from the input file; may carry a
    #   scheme prefix and surrounding whitespace
    def initialize(url)
      @url = url
    end

    # Fetches the Quantcast page for #url and builds #summary.
    # An HTTP error status is recorded and reported as an ERROR row rather
    # than raised.
    #
    # @return [Array] the summary row (same object as #summary)
    def crawl
      @error = false # reset so a repeated crawl does not inherit a stale failure
      begin
        @page = QuantcastURL.agent.get("https://www.quantcast.com/#{url}")
        @trs = @page.search('#wunit-hierarchy-table tr')
      rescue Mechanize::ResponseCodeError
        @error = true
      end
      process
    end

    # @return [String] the site URL without scheme prefix or whitespace
    def url
      @url.gsub(SCHEME_OR_WHITESPACE, '')
    end

    # Reads the text of the first two <td> cells of a table row, removing
    # whitespace and the literal labels "US" and "Global".
    #
    # @param tr [#search, nil] table row node; nil is treated as a row
    #   without cells
    # @return [Array<String>] two strings; '' for each missing cell
    def extract_us_and_global_from_tr(tr)
      cells = tr ? tr.search('td') : []
      [cells[0], cells[1]].map do |td|
        (td && td.text).to_s.gsub(/(\s|US|Global)/, '')
      end
    end

    # Reads the rank text from the page fetched by this instance.
    #
    # @return [String, nil] rank with whitespace removed, or nil when no page
    #   has been fetched or the element is absent
    def rank
      return nil unless @page

      node = @page.search('#siteStats/li/h4/a/span/strong')[0]
      node ? node.text.gsub(/\s/, '') : nil
    end

    # Builds #summary from the state left by #crawl.
    #
    # With exactly three hierarchy rows, row 2 is used as the child and row 1
    # as the parent; otherwise row 1 is the child and the parent columns are
    # nil. A missing row yields empty strings instead of raising.
    #
    # @return [Array] the summary row
    def process
      @summary =
        if @error
          [url] + [ERROR_MARKER] * 5
        else
          rows = @trs || []
          child_row, parent_row =
            rows.count == 3 ? [rows[2], rows[1]] : [rows[1], nil]
          child = extract_us_and_global_from_tr(child_row)
          parent = parent_row ? extract_us_and_global_from_tr(parent_row) : [nil, nil]
          [url, child[0], child[1], rank, parent[0], parent[1]]
        end
    end
  end

  # @param input_file [String] path of the file listing one URL per line
  # @param output_file [String] path of the file results are appended to
  def initialize(input_file, output_file)
    @input = input_file
    @output = output_file
  end

  # Crawls every URL in turn, appending each summary to the output file as a
  # '^'-joined line and echoing progress and results to stdout.
  def run
    start = Time.now
    File.open(@output, 'a') do |file|
      urls.each do |url|
        puts "#{Time.now} - #{Time.now - start} - #{url}"
        url_info = QuantcastURL.new(url)
        url_info.crawl
        file.puts url_info.summary.join('^')
        puts url_info.summary.join(', ')
      end
    end
  end

  # @return [Array<String>] the non-blank lines of the input file, stripped
  #   of surrounding whitespace (blank lines would otherwise trigger a
  #   request for the Quantcast home page)
  def urls
    File.readlines(@input).map(&:strip).reject(&:empty?)
  end
end
# Entry point: crawl the URLs listed in URL_FILE and write results to OUTPUT_FILE.
crawler = QuantcastCrawler.new(URL_FILE, OUTPUT_FILE)
crawler.run
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment