@lava
Created October 3, 2011 22:29
Naive ACM Metadata Grabber
require 'nokogiri'
require 'open-uri'
require 'watir-webdriver'
require './acm_issue_downloader.rb'
require './acm_base.rb'
# Base class for all ACM-specific HTML extraction functions
class ACMBase
  # Extract the "id" parameter from a citation URL.
  def extract_id(url)
    match = url.match(/(id=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Extract the CFID session parameter from a URL.
  def extract_cfid(url)
    match = url.match(/(CFID=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Extract the CFTOKEN session parameter from a URL.
  def extract_cftoken(url)
    match = url.match(/(CFTOKEN=)(.*?)(&|\s|\"|>|#)/i)
    match[2] if match
  end

  # Scan raw HTML for citation.cfm links and turn them into absolute
  # dl.acm.org URLs.
  def extract_link_candidates(html)
    matches = html.scan(/href.*citation.cfm.*?\s/)
    matches.map do |m|
      "http://dl.acm.org/" + m.gsub("href=", "").gsub("\"", "").gsub("&amp;", "&").strip
    end
  end
end
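# Illustrative usage sketch (not part of the original gist; the sample URL and
# HTML snippet below are made up):
#
#   base = ACMBase.new
#   base.extract_id('citation.cfm?id=1234567&coll=DL')              #=> "1234567"
#   base.extract_cfid('citation.cfm?id=1234567&CFID=42&CFTOKEN=99') #=> "42"
#   base.extract_link_candidates('<a href="citation.cfm?id=1234567&coll=DL" >')
#     #=> ["http://dl.acm.org/citation.cfm?id=1234567&coll=DL"]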
class ACMJournalDownloader < ACMBase
  def initialize(url, target_dir, pdf_download = false)
    @pdf_download = pdf_download
    @target_dir = target_dir
    @url = url
  end

  # Fetch the journal page, collect its issue links and hand each one to an
  # ACMIssueDownloader. (The threaded variant is commented out.)
  def run
    puts "Getting journal from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    #t = Thread.new do
    if @url.match(/citation.cfm/)
      # The flat layout lists all issues of the journal on a single page.
      @target_url = @url + "&preflayout=flat"
      html = open(@target_url).read
    else # deploy overkill
      # TODO: conference proceedings still don't work
      browser = Watir::Browser.new(:ff)
      browser.goto(@url)
      @target_url = browser.url
      html = browser.html
      browser.close
    end
    @id = extract_id(@target_url)
    link_candidates = extract_link_candidates(html)
    issue_links = link_candidates.reject { |l| extract_id(l) == @id }.uniq { |l| extract_id(l) }
    issue_links.each { |link| ACMIssueDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run }
    #end
    #t
  end
end
class ACMIssueDownloader < ACMBase
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @url = url
    @target_dir = target_dir
    @parent_id = parent_id
    @pdf_download = pdf_download
    @id = nil
  end

  # Fetch the issue page and hand every article link to an ACMArticleDownloader.
  def run
    puts "Getting issue from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    @id = extract_id(@url)
    html = open(@url).read
    link_candidates = extract_link_candidates(html)
    article_links = link_candidates.reject { |l| id = extract_id(l); id == @id || id == @parent_id }.uniq { |l| extract_id(l) }
    article_links.each do |link|
      ACMArticleDownloader.new(link, @id, @target_dir + @id + "/", @pdf_download).run
      sleep(1.0) # throttle requests a little
    end
  end
end
class ACMArticleDownloader < ACMBase
  def initialize(url, parent_id, target_dir, pdf_download = false)
    @target_dir = target_dir
    @url = url
    @pdf_download = pdf_download
    @parent_id = parent_id
  end

  # Download the article's BibTeX record via ACM's export endpoint and save it
  # as <article id>.bib in the target directory.
  def run
    puts "Getting article from " + @url
    Dir.mkdir(@target_dir) unless File.exist? @target_dir
    id = extract_id(@url)
    cfid = extract_cfid(@url)
    cftoken = extract_cftoken(@url)
    bibtex = open("http://dl.acm.org/downformats.cfm?id=#{id}&parent_id=#{@parent_id}&expformat=bibtex&CFID=#{cfid}&CFTOKEN=#{cftoken}").read
    filename = @target_dir + id + ".bib"
    File.open(filename, 'w') { |f| f.write(bibtex) }
  end
end
# main.rb
# Get links to all ACM Journal/Proceedings main pages.
# (web link: http://dl.acm.org/contents_dl.cfm, saved locally as contents_dl.cfm.html)
acm_contents = Nokogiri::HTML(open('./contents_dl.cfm.html'))
journal_links = acm_contents.xpath('//td[@class="smaller-text"]/a').map do |node|
  node.attr(:href)
end

# Only a small slice of the journal list for now; widen the range to grab more.
journal_links[30..31].each do |link|
  ACMJournalDownloader.new(link, './db/').run
end
#threads.each {|t| t.join}
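# Rough usage sketch (assumed workflow, not spelled out in the gist itself):
# install the nokogiri and watir-webdriver gems (open-uri is in the standard
# library), save the ACM DL table-of-contents page
# (http://dl.acm.org/contents_dl.cfm) next to this script as
# contents_dl.cfm.html, and run: ruby main.rb
# BibTeX records are then written to ./db/<journal id>/<issue id>/<article id>.bib.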