willpearse/article_cite.rb

## article_cite.rb
#!/usr/bin/ruby
require 'optparse'
require 'json'
require 'open-uri'
require 'nokogiri'

############################
# ARGUMENT PARSING #########
############################
# Parse the command-line switches for the Scholar citation scraper.
#
# Recognised switches (and their values) are removed from +argv+ in place, so
# only the positional arguments are left in it afterwards.
#
# @param argv [Array<String>] raw command-line arguments (mutated)
# @return [Hash] settings stored under :max, :wait and :domain
# @raise [OptionParser::ParseError] on an unknown switch, or when MAX or DELAY
#   is not an integer
def parse_options(argv)
  # Defaults: no cap on downloads, 10 seconds between requests, google.ca.
  settings = { max: Float::INFINITY, wait: 10, domain: "ca" }
  OptionParser.new do |opts|
    opts.banner = "Scraping Google scholar for an article's citation count\nUsage: article_cite.rb [options] [article_id] [output_filname]"
    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
    # Integer coercion rejects non-numeric input instead of silently reading it as 0.
    opts.on("-m MAX", "--max MAX", Integer, "Maximum number of articles to download (default: ALL)") { |x| settings[:max] = x }
    opts.on("-w DELAY", "--wait DELAY", Integer, "Wait DELAY seconds between downloads (default: 10)") { |x| settings[:wait] = x }
    # The domain is text (the default is "ca"), so it must be kept as a String;
    # converting it with to_i turned every user-supplied domain into 0.
    opts.on("-d DOMAIN", "--domain DOMAIN", "Domain for Scholar download (default: 'CA' [canadian])") { |x| settings[:domain] = x }
    # NOTE(review): this bare "-" has no switch name or handler; kept as in the
    # original -- confirm whether it can simply be dropped.
    opts.on("-")
  end.parse!(argv)
  settings
end

options = parse_options(ARGV)

# Take the two positional arguments, in order: the article id to look up and
# the path of the file to write the results to.
article_id, output_path = ARGV.shift(2)
if article_id.nil? || output_path.nil?
  # Without both values the script would only fail later, when it tries to
  # open an empty output path; stop here with a usage hint instead.
  abort "Usage: article_cite.rb [options] [article_id] [output_filname]"
end
options[:article] = article_id
options[:output] = output_path

############################
# DO THE WORK ##############
############################
# Build the Scholar URL listing the papers that cite +article+.
#
# @param domain [#to_s] suffix appended to "scholar.google." to form the host
# @param article [String] value sent as the "cites" query parameter
# @param start [Integer] value sent as the "start" query parameter
# @return [String] the URL, with the query values form-encoded
def scholar_cites_url(domain, article, start)
  query = URI.encode_www_form(start: start, hl: "en", cites: article)
  "https://scholar.google.#{domain}/scholar?#{query}"
end

# Fetch result pages one at a time and write the href of every "h3 a" link to
# the output file, one per line, until a page yields no links or the requested
# maximum has been written.
File.open(options[:output], "w") do |handle|
  page_size = 10 # assumes Scholar serves 10 results per page -- TODO confirm
  downloaded_papers = 0
  curr_article = 0

  while downloaded_papers < options[:max]
    begin
      url = scholar_cites_url(options[:domain], options[:article], curr_article)
      # The block form closes the downloaded stream once it has been parsed.
      result = URI.parse(url).open { |page| Nokogiri::HTML(page) }
    rescue StandardError => e
      puts "ERROR: Cannot download citation from specified domain; check both"
      # Report the underlying cause as well, so failures unrelated to the
      # domain or article id are not hidden behind the generic message.
      abort "#{e.class}: #{e.message}"
    end

    hrefs = result.css("h3 a").map { |link| link["href"] }
    break if hrefs.empty? # a page without links means there is nothing more to fetch

    # Never write more than the requested maximum, even in the middle of a page.
    remaining = options[:max] - downloaded_papers
    hrefs = hrefs.first(remaining) if remaining < hrefs.length
    hrefs.each { |href| handle << "#{href}\n" }
    downloaded_papers += hrefs.length

    # Move on to the next page; without this the same page is requested forever.
    curr_article += page_size

    # Only pause when another request is going to follow.
    sleep options[:wait] if downloaded_papers < options[:max]
  end
end
	#!/usr/bin/ruby
	require 'optparse'
	require 'json'
	require 'open-uri'
	require 'nokogiri'

	############################
	# ARGUMENT PARSING #########
	############################
	# Parse the command-line switches for the Scholar citation scraper.
	#
	# Recognised switches (and their values) are removed from +argv+ in place, so
	# only the positional arguments are left in it afterwards.
	#
	# @param argv [Array<String>] raw command-line arguments (mutated)
	# @return [Hash] settings stored under :max, :wait and :domain
	# @raise [OptionParser::ParseError] on an unknown switch, or when MAX or DELAY
	#   is not an integer
	def parse_options(argv)
	  # Defaults: no cap on downloads, 10 seconds between requests, google.ca.
	  settings = { max: Float::INFINITY, wait: 10, domain: "ca" }
	  OptionParser.new do |opts|
	    opts.banner = "Scraping Google scholar for an article's citation count\nUsage: article_cite.rb [options] [article_id] [output_filname]"
	    opts.on_tail("-h", "--help", "Show this message") do
	      puts opts
	      exit
	    end
	    # Integer coercion rejects non-numeric input instead of silently reading it as 0.
	    opts.on("-m MAX", "--max MAX", Integer, "Maximum number of articles to download (default: ALL)") { |x| settings[:max] = x }
	    opts.on("-w DELAY", "--wait DELAY", Integer, "Wait DELAY seconds between downloads (default: 10)") { |x| settings[:wait] = x }
	    # The domain is text (the default is "ca"), so it must be kept as a String;
	    # converting it with to_i turned every user-supplied domain into 0.
	    opts.on("-d DOMAIN", "--domain DOMAIN", "Domain for Scholar download (default: 'CA' [canadian])") { |x| settings[:domain] = x }
	    # NOTE(review): this bare "-" has no switch name or handler; kept as in the
	    # original -- confirm whether it can simply be dropped.
	    opts.on("-")
	  end.parse!(argv)
	  settings
	end

	options = parse_options(ARGV)

	# Take the two positional arguments, in order: the article id to look up and
	# the path of the file to write the results to.
	article_id, output_path = ARGV.shift(2)
	if article_id.nil? || output_path.nil?
	  # Without both values the script would only fail later, when it tries to
	  # open an empty output path; stop here with a usage hint instead.
	  abort "Usage: article_cite.rb [options] [article_id] [output_filname]"
	end
	options[:article] = article_id
	options[:output] = output_path

	############################
	# DO THE WORK ##############
	############################
	# Build the Scholar URL listing the papers that cite +article+.
	#
	# @param domain [#to_s] suffix appended to "scholar.google." to form the host
	# @param article [String] value sent as the "cites" query parameter
	# @param start [Integer] value sent as the "start" query parameter
	# @return [String] the URL, with the query values form-encoded
	def scholar_cites_url(domain, article, start)
	  query = URI.encode_www_form(start: start, hl: "en", cites: article)
	  "https://scholar.google.#{domain}/scholar?#{query}"
	end

	# Fetch result pages one at a time and write the href of every "h3 a" link to
	# the output file, one per line, until a page yields no links or the requested
	# maximum has been written.
	File.open(options[:output], "w") do |handle|
	  page_size = 10 # assumes Scholar serves 10 results per page -- TODO confirm
	  downloaded_papers = 0
	  curr_article = 0

	  while downloaded_papers < options[:max]
	    begin
	      url = scholar_cites_url(options[:domain], options[:article], curr_article)
	      # The block form closes the downloaded stream once it has been parsed.
	      result = URI.parse(url).open { |page| Nokogiri::HTML(page) }
	    rescue StandardError => e
	      puts "ERROR: Cannot download citation from specified domain; check both"
	      # Report the underlying cause as well, so failures unrelated to the
	      # domain or article id are not hidden behind the generic message.
	      abort "#{e.class}: #{e.message}"
	    end

	    hrefs = result.css("h3 a").map { |link| link["href"] }
	    break if hrefs.empty? # a page without links means there is nothing more to fetch

	    # Never write more than the requested maximum, even in the middle of a page.
	    remaining = options[:max] - downloaded_papers
	    hrefs = hrefs.first(remaining) if remaining < hrefs.length
	    hrefs.each { |href| handle << "#{href}\n" }
	    downloaded_papers += hrefs.length

	    # Move on to the next page; without this the same page is requested forever.
	    curr_article += page_size

	    # Only pause when another request is going to follow.
	    sleep options[:wait] if downloaded_papers < options[:max]
	  end
	end