Skip to content

Instantly share code, notes, and snippets.

@toobulkeh
Forked from tenpercent/it-ebooks.rb
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toobulkeh/8b85842ccdcd903dfa27 to your computer and use it in GitHub Desktop.
Save toobulkeh/8b85842ccdcd903dfa27 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
# File holding the number of the last processed book; it is read at startup
# as the resume point and rewritten when the script finishes.
# Frozen so the path cannot be mutated in place at runtime.
MARK_FILE = 'mark.dat'.freeze
# Upper bound on the number of downloads running at the same time.
MAX_THREADS = 5
# Blocks until every thread in +threads+ has finished, announcing the wait
# on standard output before it starts and " ok" once it is over.
#
# @param threads [Array<Thread>] threads to join
# @return [nil]
def wait_for_threads(threads)
  print "Waiting for downloads to finish..."
  threads.each(&:join)
  puts " ok"
end
# Finishes outstanding downloads, stores the resume position and terminates
# the script.
#
# @param num [Integer] book number being handled when quitting; num - 1 is
#   written to MARK_FILE
# @param threads [Array<Thread>] download threads to wait for (may be empty)
# @return [void] does not return; ends the process via +exit+
def quit(num, threads)
  wait_for_threads(threads) unless threads.empty?
  File.open(MARK_FILE, 'w') do |mark_file|
    mark_file.write(num - 1)
  end
  exit
end
# Keyword list, presumably of topics whose books should be skipped.
# NOTE(review): exclude? below does not consult this list at the moment.
# Frozen so the list cannot be modified by accident at runtime.
EXCLUDE = %w[ActionScript flash flex active_directory].freeze

# Reports whether +book+ should be left out of the download list.
#
# Exclusion is currently switched off: the method returns false for every
# book and ignores its argument.
#
# @param book [Hash] scraped book record (unused)
# @return [Boolean] always false
def exclude?(book)
  false
end
# Print usage information and stop when the first argument is "help".
# NOTE(review): the "Options" line describes interactive keys, but no prompt
# is visible in this file -- confirm whether that line is still accurate.
if ARGV.first == 'help'
  puts "Get all new books: ruby it-ebooks.rb"
  puts "Get n books from last download: ruby it-ebooks.rb 100"
  puts "Options: y - download, ENTER - next, d - description, q - quit"
  puts "(CTRL+C also works for quitting but the current position is not remembered)"
  exit
end
puts "Working..."

# Resume point: the book number stored by a previous run, or 0 when no mark
# file exists yet. (File.exists? was removed in Ruby 3.2; use File.exist?.)
mark = File.exist?(MARK_FILE) ? File.read(MARK_FILE).to_i : 0

# Highest book number linked from the front page. open-uri no longer patches
# Kernel#open on Ruby 3.0+, so the URL is opened through URI.open.
front_page = Nokogiri::HTML(URI.open('http://it-ebooks.info/'))
max_num = front_page.css('a')
                    .map { |a| a['href'].to_s[%r{/book/(\d+)}, 1] }
                    .compact
                    .map(&:to_i)
                    .max
# Without any book link there is no upper bound to scan to; fail with a clear
# message instead of a comparison/range error further down.
abort "Could not find any book links on http://it-ebooks.info/" if max_num.nil?

# With a numeric argument only that many books past the mark are scanned,
# capped at the newest book on the site.
latest = ARGV.first ? [mark + ARGV.first.to_i, max_num].min : max_num

books = []
nothing_to_do = true
(mark + 1).upto(latest) do |num|
  nothing_to_do = false

  begin
    page = URI.open("http://it-ebooks.info/book/#{num}/")
  rescue OpenURI::HTTPError => e
    # One failing page must not abort the whole scan.
    puts "#{num} - #{e.message}"
    next
  end

  # Missing books are detected by the final URL after redirects being /404/.
  if page.base_uri.path == "/404/"
    puts "#{num} - Not Found"
    next
  end

  doc = Nokogiri::HTML(page)
  div = doc.css("div[itemtype='http://schema.org/Book']")

  # Only the FIRST filepi.com link on the page is used. Pages without one are
  # skipped instead of raising NoMethodError on nil.
  link = doc.css('a[href*="filepi.com"]').first
  if link.nil?
    puts "#{num} - No download link"
    next
  end

  book = {
    num: num,
    title: div.css("h1").text,
    subtitle: div.css("h3").text,
    publisher: doc.css("a[itemprop='publisher']").text,
    description: doc.css("span[itemprop='description']").text,
    year: doc.css("b[itemprop='datePublished']").text,
    link: link['href']
  }

  if exclude?(book)
    puts "Excluding #{num}: #{book[:title]} (#{book[:year]})"
  else
    puts "#{num} (#{num - mark}/#{latest - mark}) link: #{book[:link]}"
    books << book
  end
end
puts

# Builds the local file name "[year] [publisher] [title].pdf" for +book+.
# "/" is replaced by "_" because it is a path separator and would make
# File.open fail (or write into a subdirectory).
def book_filename(book)
  "[#{book[:year]}] [#{book[:publisher]}] [#{book[:title]}].pdf".tr('/', '_')
end

# Fetches +link+ and returns the response body as a String. When the first
# response carries no Content-Disposition header the request is repeated once
# and the body of the second response is returned, whatever its headers.
# The block form of URI.open closes the response IO when the block ends.
def fetch_book_data(link)
  first_try = URI.open(link) do |response|
    p response.meta # diagnostic dump of the response headers
    response.read unless response.meta['content-disposition'].nil?
  end
  first_try || URI.open(link, &:read)
end

# Downloads one book into the current directory unless the target file is
# already there. Errors are reported on standard output and swallowed so that
# one failed download does not stop the others.
#
# @param book [Hash] record with :num, :title, :publisher, :year and :link
# @param progress [String] "num (position/total)" label for the log line
def download_book(book, progress)
  # The explicit "\n" keeps the line and its terminator in a single string;
  # puts does not append a second newline.
  puts "Downloading book: #{progress}#{book[:title]}\n"
  filename = book_filename(book)
  if File.exist?(filename)
    # Checked before fetching so an existing file costs no download.
    puts "#{filename} already exists!"
  else
    data = fetch_book_data(book[:link])
    File.open(filename, "wb") { |file| file.write(data) }
  end
rescue => e
  puts
  puts "### Error downloading book #{book[:num]}: #{e.message} ###"
end

# List every book and queue it for download.
pending = Queue.new
books.each_with_index do |book, index|
  progress = "#{book[:num]} (#{index + 1}/#{books.count})"
  puts progress
  puts book[:title]
  puts book[:subtitle] unless book[:subtitle].empty?
  puts "#{book[:publisher]}, #{book[:year]}"
  pending << [book, progress]
end

# A fixed pool of at most MAX_THREADS workers drains the queue. This replaces
# a shared global counter whose check-then-increment was not atomic and could
# let more than MAX_THREADS downloads run at once.
threads = Array.new([MAX_THREADS, books.count].min) do
  Thread.new do
    loop do
      job = begin
        pending.pop(true) # non-blocking: raises ThreadError once empty
      rescue ThreadError
        nil
      end
      break if job.nil?
      download_book(*job)
    end
  end
end

if threads.any?
  wait_for_threads(threads)
elsif nothing_to_do
  puts "No new books."
end

# Remember how far this run got; written even when some downloads failed.
File.open(MARK_FILE, 'w') { |f| f.write(latest) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment