@lava
Created October 3, 2011 22:29
Naive ACM Metadata Grabber
require 'nokogiri'
require 'open-uri'
require 'watir-webdriver'
require './acm_issue_downloader.rb'
require './acm_base.rb'
# Base class for all ACM-specific HTML extraction functions
class ACMBase
  # Extract the "id" parameter from a citation URL.
  def extract_id(url)
    match = url.match(/(id=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Extract the CFID session parameter from a URL.
  def extract_cfid(url)
    match = url.match(/(CFID=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Extract the CFTOKEN session parameter from a URL.
  def extract_cftoken(url)
    match = url.match(/(CFTOKEN=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Scan raw HTML for citation.cfm links and turn them into absolute
  # dl.acm.org URLs.
  def extract_link_candidates(html)
    matches = html.scan(/href.*citation.cfm.*?\s/)
    matches.map do |m|
      "http://dl.acm.org/" + m.gsub("href=", "").gsub("\"", "").gsub("&amp;", "&").strip
    end
  end
end
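# Illustrative usage sketch (not part of the original gist; the sample URL and
# HTML snippet below are made up):
#
#   base = ACMBase.new
#   base.extract_id('citation.cfm?id=1234567&coll=DL')              #=> "1234567"
#   base.extract_cfid('citation.cfm?id=1234567&CFID=42&CFTOKEN=99') #=> "42"
#   base.extract_link_candidates('<a href="citation.cfm?id=1234567&coll=DL" >')
#     #=> ["http://dl.acm.org/citation.cfm?id=1234567&coll=DL"]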
class ACMJournalDownloader < ACMBase
  def initialize(url, target_dir, pdf_download = false)
    @pdf_download = pdf_download
    @target_dir = target_dir
    @url = url
  end

  # Fetch the journal page, collect its issue links and hand each one to an
  # ACMIssueDownloader. (The threaded variant is commented out.)
  def run
    puts "Getting journal from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    #t = Thread.new do
    if @url.match(/citation.cfm/)
      # The flat layout lists all issues of the journal on a single page.
      @target_url = @url + "&preflayout=flat"
      html = open(@target_url).read
    else # deploy overkill
      # TODO: conference proceedings still don't work
      browser = Watir::Browser.new(:ff)
      browser.goto(@url)
      @target_url = browser.url
      html = browser.html
      browser.close
    end
    @id = extract_id(@target_url)
    link_candidates = extract_link_candidates(html)
    issue_links = link_candidates.reject { |l| extract_id(l) == @id }.uniq { |l| extract_id(l) }
    issue_links.each { |link| ACMIssueDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run }
    #end
    #t
  end
end
class ACMIssueDownloader < ACMBase
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @url = url
    @target_dir = target_dir
    @parent_id = parent_id
    @pdf_download = pdf_download
    @id = nil
  end

  # Fetch the issue page and hand every article link to an ACMArticleDownloader.
  def run
    puts "Getting issue from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    @id = extract_id(@url)
    html = open(@url).read
    link_candidates = extract_link_candidates(html)
    article_links = link_candidates.reject { |l| id = extract_id(l); id == @id || id == @parent_id }.uniq { |l| extract_id(l) }
    article_links.each do |link|
      ACMArticleDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run
      sleep(1.0) # throttle requests a little
    end
  end
end
class ACMArticleDownloader < ACMBase
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @target_dir = target_dir
    @url = url
    @pdf_download = pdf_download
    @parent_id = parent_id
  end

  # Download the article's BibTeX record via ACM's export endpoint and save it
  # as <article id>.bib in the target directory.
  def run
    puts "Getting article from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    id = extract_id(@url)
    cfid = extract_cfid(@url)
    cftoken = extract_cftoken(@url)
    bibtex = open("http://dl.acm.org/downformats.cfm?id=#{id}&parent_id=#{@parent_id}&expformat=bibtex&CFID=#{cfid}&CFTOKEN=#{cftoken}").read
    filename = @target_dir + id + ".bib"
    File.open(filename, 'w') { |f| f.write(bibtex) }
  end
end
# main.rb
# Get links to all ACM Journal/Proceedings main pages.
# (web link: http://dl.acm.org/contents_dl.cfm, saved locally as contents_dl.cfm.html)
acm_contents = Nokogiri::HTML(open('./contents_dl.cfm.html'))
journal_links = acm_contents.xpath('//td[@class="smaller-text"]/a').map do |node|
  node.attr(:href)
end

# Only a small slice of the journal list for now; widen the range to grab more.
journal_links[30..31].each do |link|
  ACMJournalDownloader.new(link, './db/').run
end
#threads.each {|t| t.join}
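# Rough usage sketch (assumed workflow, not spelled out in the gist itself):
# install the nokogiri and watir-webdriver gems (open-uri is in the standard
# library), save the ACM DL table-of-contents page
# (http://dl.acm.org/contents_dl.cfm) next to this script as
# contents_dl.cfm.html, and run: ruby main.rb
# BibTeX records are then written to ./db/<journal id>/<issue id>/<article id>.bib.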