@ryanburnette
Last active December 29, 2015 14:29
This Ruby script uses Mechanize to walk a list of URLs, searching within a particular div on each page and reporting how many times a particular string was present.

This little Ruby script takes a list of URLs, searches a particular element on each page for a particular word, and returns CSV output with each URL and the number of instances of that word. Here's how to use it.

First, create a CSV file containing URLs. That just means you put each URL on its own line.
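For example, a list.csv might look like this (the URLs below are placeholders, not real pages):

```
http://example.com/
http://example.com/about
https://example.com/contact
```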

Next, run the script with Ruby, passing three arguments:

ruby get-list.rb list.csv div.my-div word

Now let's break down those arguments. First, you provide the path to the list of URLs. Next, the element you want to search. Last, the word you are searching for.
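The counting itself comes down to Ruby's String#scan: the script takes the text of the matched element and counts how many substrings match the word. A minimal sketch with made-up text:

```ruby
# Hypothetical element text; in the real script this comes from
# page.search(div_to_search).inner_text
text = "Our widget is the best widget for widget lovers."
word = "widget"

# scan returns an array of every match; its length is the count
count = text.scan(word).length
puts count  # prints 3
```

Note that scan matches raw substrings, so searching for "cat" will also count "catalog".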

To create a report, just redirect the output of the script into a CSV file, like so:

ruby get-list.rb list.csv div.my-div word > report.csv
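Each line of the report pairs a URL with its count, or with an error message when the fetch fails. The values below are hypothetical:

```
http://example.com/,3
http://example.com/about,0
not-a-url,error: invalid url syntax
```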
require 'rubygems'
require 'csv'
require 'mechanize'
require 'addressable/uri'

# Command-line arguments: path to the URL list, element selector, search word
file = ARGV[0]
div  = ARGV[1]
word = ARGV[2]

SCHEMES = %w(http https)

# A URL is valid if Addressable can parse it and its scheme is http or https
def valid_url?(url)
  parsed = Addressable::URI.parse(url) or return false
  SCHEMES.include?(parsed.scheme)
rescue Addressable::URI::InvalidURIError
  false
end

# Read the URLs from the first column of the CSV file
urls = []
CSV.foreach(file) do |row|
  urls.push(row[0])
end

def get_results(urls, div_to_search, scan_for_this)
  agent = Mechanize.new do |a|
    a.ssl_version = 'SSLv3'
    a.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  agent.open_timeout = 1000
  agent.read_timeout = 1000

  urls.each do |url|
    print url
    print ","
    if valid_url?(url)
      begin
        agent.get(url) do |page|
          if page.respond_to?(:parser)
            # Count occurrences of the word within the matched element's text
            print page.search(div_to_search).inner_text.scan(scan_for_this).length
          else
            print "not a page"
          end
        end
      rescue => e
        print "error: " + e.to_s
      end
    else
      print "error: invalid url syntax"
    end
    print "\n"
  end
end

get_results(urls, div, word)