public
Created

Ruby Tapas scraper

  • Download Gist
grab-tapas.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
require "mechanize"
 
# Log in to the RubyTapas subscriber area and fetch the content listing.
#
# Produces `content_page`, the page returned after submitting the login form;
# the download loop further down walks its "File Attachment" links.
agent = Mechanize.new
# Present a desktop Safari user agent rather than Mechanize's default one.
agent.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " \
  "AppleWebKit/536.30.1 (KHTML, like Gecko) Version/6.0.5" \
  " Safari/536.30.1"

# Credentials come from the environment so they never have to be written into
# the script. ENV.fetch raises KeyError naming the missing variable, which
# stops the run before any network request is made.
tapas_username = ENV.fetch("TAPAS_USERNAME")
tapas_password = ENV.fetch("TAPAS_PASSWORD")

page = agent.get "https://rubytapas.dpdcart.com/subscriber/content"
login_form = page.form_with id: "login-form"
# form_with returns nil when no form matches; fail with a clear message
# instead of a NoMethodError on nil a line later.
abort "Could not find the login form (id=login-form) on #{page.uri}" unless login_form

login_form.field_with(id: "username").value = tapas_username
login_form.field_with(id: "password").value = tapas_password

content_page = agent.submit login_form
 
# Turn a free-form title into a string usable as a file or directory name.
#
# Three passes are applied in order:
#   1. drop every character that is not a word character, whitespace,
#      underscore or hyphen;
#   2. collapse runs of whitespace left behind by pass 1;
#   3. replace each remaining whitespace run with a single underscore.
#
# @param filename [String] the raw title
# @return [String] the sanitized name
def friendly_filename(filename)
  only_safe_chars = filename.gsub(/[^\w\s_-]+/, '')
  collapsed_spaces = only_safe_chars.gsub(/(^|\b\s)\s+($|\s?\b)/, '\\1\\2')
  collapsed_spaces.gsub(/\s+/, '_')
end
 
# Mirror every episode linked from the content listing into the current
# directory: one sub-directory per episode, containing description.txt and
# each downloadable attachment. Files that already exist are left untouched,
# so the script can be re-run to pick up only what is missing.
content_page.links_with(text: /File Attachment/).each do |link|
  files_page = link.click

  # The directory name is the part of the page <title> before the first '|',
  # sanitized by friendly_filename.
  dir_name_pre = files_page.parser.css('title').text.strip.split('|').first
  dir_name = friendly_filename(dir_name_pre.strip)
  Dir.mkdir(dir_name) unless File.directory?(dir_name)

  description = files_page.parser.css('div.blog-content').text.strip

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  description_path = File.join(dir_name, 'description.txt')
  unless File.exist?(description_path)
    File.open(description_path, "w") { |f| f.puts description }
  end

  puts "Process tapa: #{dir_name}"

  files_page.links_with(href: /download/).each do |file|
    target_path = File.join(dir_name, file.text)
    file.click.save(target_path) unless File.exist?(target_path)
    # Printed for every attachment, whether fetched now or already on disk.
    puts "#{file.text} downloaded"
  end
  puts '******************************************************************'
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.