Created
October 3, 2011 22:29
-
-
Save lava/1260435 to your computer and use it in GitHub Desktop.
Naive ACM Metadata Grabber
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'open-uri' | |
require 'watir-webdriver' | |
require './acm_issue_downloader.rb' | |
require './acm_base.rb' | |
# Base class for all ACM-specific HTML extraction functions.
#
# Provides helpers that pull query-string parameters (id, CFID, CFTOKEN)
# out of ACM Digital Library URLs and that collect citation links from raw
# HTML text.
class ACMBase
  # Prefix used to turn relative citation links into absolute URLs.
  ACM_ROOT = 'http://dl.acm.org/'

  # Matches one href attribute (quoted or bare) whose value mentions
  # citation.cfm. The value ends at the closing quote, at whitespace or at
  # '>', so several links on one line are reported separately and the link
  # text is never swallowed into the URL.
  CITATION_HREF = /href\s*=\s*["']?([^"'\s>]*citation\.cfm[^"'\s>]*)/i

  # @param url [String, nil] URL (or URL fragment) to inspect
  # @return [String, nil] value of the "id" parameter, nil when absent
  def extract_id(url)
    extract_param(url, 'id')
  end

  # @param url [String, nil] URL (or URL fragment) to inspect
  # @return [String, nil] value of the "CFID" parameter, nil when absent
  def extract_cfid(url)
    extract_param(url, 'CFID')
  end

  # @param url [String, nil] URL (or URL fragment) to inspect
  # @return [String, nil] value of the "CFTOKEN" parameter, nil when absent
  def extract_cftoken(url)
    extract_param(url, 'CFTOKEN')
  end

  # Collects every link in +html+ that points at citation.cfm.
  #
  # @param html [String, nil] raw HTML text
  # @return [Array<String>] absolute URLs with "&amp;" entities decoded,
  #   in document order (duplicates are kept)
  def extract_link_candidates(html)
    html.to_s.scan(CITATION_HREF).map do |captures|
      # href values are HTML-escaped; turn "&amp;" back into a plain "&".
      href = captures.first.gsub('&amp;', '&')
      if href =~ %r{\A[a-z][a-z0-9+.\-]*://}i
        href # already absolute, do not prefix the ACM root a second time
      else
        ACM_ROOT + href.sub(%r{\A/}, '')
      end
    end
  end

  private

  # Looks up a single query parameter, case-insensitively.
  #
  # The lookbehind keeps "id" from matching inside longer names such as
  # "CFID" or "parent_id". The value runs up to the next &, whitespace,
  # double quote, > or #, or to the end of the string.
  #
  # @param url [String, nil] text to search
  # @param name [String] parameter name
  # @return [String, nil] the (possibly empty) value, or nil when not found
  def extract_param(url, name)
    pattern = /(?<![A-Za-z0-9_])#{Regexp.escape(name)}=([^&\s">#]*)/i
    found = url.to_s.match(pattern)
    found && found[1]
  end
end
# Downloads every issue that is linked from one ACM journal page.
class ACMJournalDownloader < ACMBase
  # @param url [String] address of the journal page
  # @param target_dir [String] directory the journal data is stored under;
  #   it is concatenated with the journal id, so it should end with "/"
  # @param pdf_download [Boolean] passed through to the issue downloaders
  def initialize(url, target_dir, pdf_download = false)
    @pdf_download = pdf_download
    @target_dir = target_dir
    @url = url
  end

  # Fetches the journal page, collects the distinct issue links on it and
  # runs an ACMIssueDownloader for each of them, one after another.
  # (The threaded variant of this method is currently disabled.)
  #
  # @return [Array<String>] the issue links that were processed; empty when
  #   no journal id could be extracted
  def run
    puts "Getting journal from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir

    html = journal_html
    @id = extract_id(@target_url)
    if @id.nil?
      # Without an id the per-journal directory name cannot be built.
      warn "No journal id found in #{@target_url.inspect}, skipping"
      return []
    end

    # Drop links that point back at the journal itself, then keep one link
    # per distinct id.
    issue_links = extract_link_candidates(html)
                  .reject { |link| extract_id(link) == @id }
                  .uniq { |link| extract_id(link) }
    issue_links.each do |link|
      ACMIssueDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run
    end
  end

  private

  # Loads the journal page and records the address it was loaded from in
  # @target_url.
  #
  # @return [String] HTML of the journal page
  def journal_html
    if @url.match(/citation.cfm/)
      @target_url = @url + "&preflayout=flat"
      fetch(@target_url)
    else # deploy overkill
      # TODO conference proceedings still dont work
      browser = Watir::Browser.new(:ff)
      begin
        # NOTE(review): relies on goto returning the loaded URL - confirm
        # against the installed watir version.
        @target_url = browser.goto(@url)
        browser.html
      ensure
        # Close the browser even when navigation or reading the page fails.
        browser.close
      end
    end
  end

  # Reads a remote document through open-uri.
  #
  # Goes through URI#read instead of Kernel#open: Kernel#open treats a
  # string starting with "|" as a command to run, and on Ruby 3 it no
  # longer fetches URLs at all.
  #
  # @param url [String] absolute URL
  # @return [String] response body
  # @raise [ArgumentError] when +url+ is not something open-uri can read
  def fetch(url)
    uri = URI.parse(url.strip)
    raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:read)

    uri.read
  end
end
# Downloads every article that is linked from one ACM issue page.
class ACMIssueDownloader < ACMBase
  # @param url [String] address of the issue page
  # @param parent_id [String] id of the journal the issue belongs to
  # @param target_dir [String] directory the issue data is stored under;
  #   it is concatenated with the issue id, so it should end with "/"
  # @param pdf_download [Boolean] passed through to the article downloaders
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @url = url
    @target_dir = target_dir
    @parent_id = parent_id
    @pdf_download = pdf_download
    @id = nil
  end

  # Fetches the issue page, collects the distinct article links on it and
  # runs an ACMArticleDownloader for each, pausing one second between
  # articles.
  #
  # @return [Array<String>] the article links that were processed; empty
  #   when no issue id could be extracted
  def run
    puts "Getting issue from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir

    @id = extract_id(@url)
    if @id.nil?
      # Without an id the per-issue directory name cannot be built.
      warn "No issue id found in #{@url.inspect}, skipping"
      return []
    end

    # Links back to this issue or to its parent journal are not articles;
    # of the rest keep one link per distinct id.
    article_links = extract_link_candidates(fetch(@url))
                    .reject { |link| [@id, @parent_id].include?(extract_id(link)) }
                    .uniq { |link| extract_id(link) }

    # A stray debugging `return` used to sit here and made this loop
    # unreachable, so no article was ever downloaded.
    article_links.each do |link|
      ACMArticleDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run
      sleep(1.0)
    end
  end

  private

  # Reads a remote document through open-uri.
  #
  # Goes through URI#read instead of Kernel#open: Kernel#open treats a
  # string starting with "|" as a command to run, and on Ruby 3 it no
  # longer fetches URLs at all. Surrounding whitespace is stripped before
  # the URL is parsed.
  #
  # @param url [String] absolute URL
  # @return [String] response body
  # @raise [ArgumentError] when +url+ is not something open-uri can read
  def fetch(url)
    uri = URI.parse(url.strip)
    raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:read)

    uri.read
  end
end
# Downloads the BibTeX record of one ACM article and stores it on disk.
class ACMArticleDownloader < ACMBase
  # @param url [String] address of the article page; the article id and
  #   the CFID/CFTOKEN parameters are read from it
  # @param parent_id [String] id sent as parent_id in the export request
  # @param target_dir [String] directory the .bib file is written to
  # @param pdf_download [Boolean] stored, but not used by this class yet
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @target_dir = target_dir
    @url = url
    @pdf_download = pdf_download
    @parent_id = parent_id
  end

  # Requests the BibTeX export for the article and writes it to
  # "<target_dir>/<id>.bib", overwriting an existing file.
  #
  # @return [Integer, nil] number of bytes written, or nil when the URL
  #   carries no article id
  def run
    puts "Getting article from " + @url
    id = extract_id(@url)
    cfid = extract_cfid(@url)
    cftoken = extract_cftoken(@url)
    if id.nil?
      # Neither the export URL nor the file name can be built without an id.
      warn "No article id found in #{@url.inspect}, skipping"
      return nil
    end

    # The request must use the values extracted above; the instance
    # variables of the same names are never assigned in this class.
    bibtex = fetch("http://dl.acm.org/downformats.cfm?id=#{id}&parent_id=#{@parent_id}&expformat=bibtex&CFID=#{cfid}&CFTOKEN=#{cftoken}")

    # Nothing else creates this directory before the file is written.
    Dir.mkdir(@target_dir) unless File.directory?(@target_dir)
    File.open(File.join(@target_dir, "#{id}.bib"), 'w') { |file| file.write(bibtex) }
  end

  private

  # Reads a remote document through open-uri.
  #
  # Goes through URI#read instead of Kernel#open: Kernel#open treats a
  # string starting with "|" as a command to run, and on Ruby 3 it no
  # longer fetches URLs at all.
  #
  # @param url [String] absolute URL
  # @return [String] response body
  # @raise [ArgumentError] when +url+ is not something open-uri can read
  def fetch(url)
    uri = URI.parse(url.strip)
    raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:read)

    uri.read
  end
end
# main.rb
# Get links to all ACM Journal/Proceedings Main Pages
# (web link: http://dl.acm.org/contents_dl.cfm )
# The page is read from a local copy; the block form closes the file handle
# as soon as parsing is done.
acm_contents = File.open('./contents_dl.cfm.html') { |page| Nokogiri::HTML(page) }

# Anchors without an href yield nil and are dropped.
journal_links = acm_contents.xpath('//td[@class="smaller-text"]/a').map do |node|
  node.attr(:href)
end.compact

# Only the entries at positions 30 and 31 are processed. The slice is nil
# when the list has fewer than 31 entries, hence the Array() wrapper.
Array(journal_links[30..31]).each do |link|
  ACMJournalDownloader.new(link, './db/').run
end
#threads.each {|t| t.join}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment