Skip to content

Instantly share code, notes, and snippets.

@willpearse
Created February 15, 2016 18:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save willpearse/ec5b97a980b281d12cd6 to your computer and use it in GitHub Desktop.
Save willpearse/ec5b97a980b281d12cd6 to your computer and use it in GitHub Desktop.
Scraping the links to all articles that cite a given article on Google Scholar
#!/usr/bin/ruby
require 'optparse'
require 'json'
require 'open-uri'
require 'nokogiri'
############################
# ARGUMENT PARSING #########
############################
# Parses the command line into the options hash used by the rest of the script.
#
# @param argv [Array<String>] command-line words; recognised switches and the
#   first two positional words are removed from it
# @return [Hash] with keys :max (Integer or Float::INFINITY), :wait (Integer),
#   :domain (String), :article (String) and :output (String); a missing
#   positional word yields an empty string
# @raise [OptionParser::ParseError] on an unknown switch or a non-integer
#   MAX / DELAY value
def parse_options(argv)
  options = { max: Float::INFINITY, wait: 10, domain: "ca" }
  OptionParser.new do |opts|
    opts.banner = "Scraping Google scholar for an article's citation count\nUsage: article_cite.rb [options] [article_id] [output_filename]"
    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
    # Integer coercion rejects garbage instead of silently turning it into 0.
    opts.on("-m MAX", "--max MAX", Integer, "Maximum number of articles to download (default: ALL)") { |x| options[:max] = x }
    opts.on("-w DELAY", "--wait DELAY", Integer, "Wait DELAY seconds between downloads (default: 10)") { |x| options[:wait] = x }
    # The domain is a host suffix such as "ca" or "co.uk"; it must stay a String.
    opts.on("-d DOMAIN", "--domain DOMAIN", "Domain for Scholar download (default: 'CA' [canadian])") { |x| options[:domain] = x }
  end.parse!(argv)
  # Whatever is left after switch parsing is positional: article id, then output file.
  options[:article] = argv.shift.to_s
  options[:output] = argv.shift.to_s
  options
end

begin
  options = parse_options(ARGV)
rescue OptionParser::ParseError => e
  # Report bad switches as a one-line message rather than a backtrace.
  abort "#{e.message} (run with --help for usage)"
end
############################
# DO THE WORK ##############
############################
# Walks the Google Scholar "cited by" listing for options[:article] page by
# page and writes the href of every result-title link ("h3 a") to
# options[:output], one per line.
#
# Stops when a page yields no links or once options[:max] links have been
# written; waits options[:wait] seconds between requests. Aborts the script
# if a page cannot be fetched or a positional argument is missing.
if options[:article].to_s.empty? || options[:output].to_s.empty?
  abort "ERROR: Both an article id and an output filename are required (see --help)"
end

File.open(options[:output], "w") do |handle|
  downloaded_papers = 0
  curr_article = 0 # offset sent as the `start` query parameter
  while downloaded_papers < options[:max]
    query = URI.encode_www_form(start: curr_article, hl: "en", cites: options[:article])
    url = "https://scholar.google.#{options[:domain]}/scholar?#{query}"
    begin
      # URI.open is the open-uri entry point; Kernel#open no longer accepts
      # URLs and URI.encode no longer exists on Ruby 3.0+.
      result = Nokogiri::HTML(URI.open(url))
    rescue StandardError => e
      abort "ERROR: Cannot download citation from specified domain; check both (#{e.message})"
    end

    links = result.css("h3 a")
    break if links.empty?

    # Advance the offset so the next request fetches the next page instead of
    # the same one again.
    # NOTE(review): assumes every result on a page has exactly one <h3>
    # heading, linked or not - confirm against current Scholar markup.
    curr_article += [result.css("h3").length, 1].max

    # Never write more than the requested maximum (skipped when max is infinite).
    remaining = options[:max] - downloaded_papers
    links = links.first(remaining) if remaining < links.length
    links.each { |link| handle << "#{link['href']}\n" }
    downloaded_papers += links.length

    # Only pause when another request is actually going to be made.
    sleep options[:wait] if downloaded_papers < options[:max]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment