Skip to content

Instantly share code, notes, and snippets.

@venj
Last active September 30, 2015 12:27
Show Gist options
  • Save venj/1789006 to your computer and use it in GitHub Desktop.
Save venj/1789006 to your computer and use it in GitHub Desktop.
!!!!!DEPRECATED!!!!! Tracking the changes to the website is FUCKING BORING!!!! Torrent spider. Hard to maintain...Damn it, always changes web page...
#!/usr/bin/env ruby
# Download torrents from SOME WEBSITE.
require "rubygems"
require "open-uri"
require "hpricot"
require "uri"
require "colorize"
require 'uri'
require 'net/http'
# Global vars
website = "http://javjunkies.com"
download_base = nil
base_uri = "#{website}/main/page/"
mainpage_index = 1
IMG_PROXY = nil
PG_PROXY = nil
MAX_RETRY = 5
# Exceptions Classes
class DownloadLimitError < StandardError;end
class ServerOverLoadError < StandardError;end
class TorrentServerError < StandardError;end
class TorrentNotFoundException < StandardError;end
class LinkRedirectException < StandardError;end
class SiteMaintenanceException < StandardError;end
# Local URL Expander Class
class Expander
def self.expand(url)
open("http://venj.me/url.php?shorturl=#{url}").read.strip
end
end
def skip_page?(page)
open('.finished').readlines.include?(page + "\n") ? true : false
end
def add_page(page)
open('.finished', 'a+') {|f| f.puts page }
end
# Just loop it.
while true
# Grab main page (daily list.)
print "\nOpening posts page: #{mainpage_index}..."
html = open("#{base_uri}#{mainpage_index}/", :proxy => PG_PROXY).read
html = open("#{website}/main/", :proxy => PG_PROXY).read if mainpage_index == 1
begin
raise SiteMaintenanceException.new if html.index("Site Maintenance")
rescue SiteMaintenanceException => e
puts "Maintenance.\n".yellow
break
end
puts
maindoc = Hpricot(html)
maindoc.search("//div[@class='post']/div[@class='entry']/a").each do |entry|
onclick = entry.attributes["onclick"]
next if onclick =~ /window.open/ # Skip the fake daily list.
page_uri = onclick.match(/location.href='([^']+)'/)[1]
begin
# Grab daily pagination.
print "Opening #{URI::parse(page_uri).path}..."
(puts "skip day!".green; next) if skip_page?(page_uri)
puts
page_content = open(page_uri, :proxy => PG_PROXY).read
dlbaseregex = /window\.open\("(http:\/\/javjunkies\.com\/[^"]+)"/
download_base = page_content.match(dlbaseregex)[1]
page_doc = Hpricot(page_content)
rescue Exception => e
puts "Finished or Unknow error...".red
puts e.message
exit 1
end
# Get pagination (page count)
begin
lastpage_index = page_doc.search("//div[@class='post']/div[@class='entry']/p/font/a").last.inner_html.to_i
rescue Exception => e
lastpage_index = 1
end
filename_base = File.basename(File.dirname(page_uri)) + "_" + File.basename(page_uri)
print "Processing #{filename_base}, "; puts "#{lastpage_index} pages...".green
# Fetch the daily list.
(1..lastpage_index).each do |i|
refererLink = "#{page_uri}#{i}/"
print " parse page #{i}: torrent "
(puts "...skip page!".green; next) if skip_page?(refererLink)
if i == 1
pagination_doc = page_doc
else
pagination_doc = Hpricot(open(refererLink, :proxy => PG_PROXY).read)
end
# Fetch torrents and images
tr_count = 1
pagination_doc.search("//div[@class='post']/div[@class='entry']/div[@class='image']").each do |e|
next if e.search("div").size < 1 # Skip fake torrent post, 2013-03-13.
print "#{tr_count}"; tr_count += 1
# Try to skip un-needed file formats, but its not usable for now.
skip_torrent = false
e.search("span") do |span|
html_text = span.inner_html
begin
skip_torrent = true if html_text =~ /\.[iso|mds]/ # allow wmv torrent.
rescue Exception => e
skip_torrent = true
end
end
(print "f".yellow; print ", ";next) if skip_torrent
# This is where real download begins.
e.search("a") do |a|
# Build the torrent link.
link_regx = /JavJ\('([^']+)'\)/
#puts a.attributes["onclick"].match(link_regx)[1]
(print "e".magenta; print ", ";next) if a.attributes["onclick"] !~ link_regx
tr_link = download_base + a.attributes["onclick"].match(link_regx)[1]
#puts tr_link
# Build the image link
image_link_regx = /url\('([^']+)'\)/
image_link_t = e.search("//div")[0].attributes["style"].match(image_link_regx)[1]
# Expand the image short link.
retry_counts = 0
while retry_counts < MAX_RETRY
begin
image_link = Expander.expand(image_link_t)
rescue StandardError => e
retry_counts += 1
end
break if image_link
end
#puts image_link
# Build image name and torrent name for save to disk.
image_filename = filename_base + "." + image_link.split("/").last
tr_filename = ""
if tr_filename.split(".").last == "torrent"
tr_filename = filename_base + "_" + tr_link.split("/").last
else
tr_filename = filename_base + "_" + image_link.split("/").last.split(".")[0] + ".torrent"
end
# Skip the torrent and image if exists.
(print "s".blue;print ", ";next) if ((File.exists? tr_filename) && (File.exists? image_filename))
ftext = ""
begin
print ", "
# Download torrent
unless File.exists? tr_filename # Skip it if torrent exists.
open(tr_link, "referer" => refererLink, :proxy => PG_PROXY) do |inf|
ftext = inf.read
#puts ftext
# Raise exceptions according to different conditions.
raise DownloadLimitError.new if ftext.index "You reached your download limit."
raise ServerOverLoadError if ftext.index "Server under heavy load, please try again later!"
raise TorrentServerError.new if ftext.index "Internal Server Error"
raise TorrentNotFoundException.new if ftext.index "Error 404"
raise LinkRedirectException.new if ftext[0..20].index("html")
open(tr_filename, "w+") do |ouf|
ouf.write(ftext)
end
end
end
rescue DownloadLimitError => err
print "gocha!\n".red
puts "Daily download limit reached. Please retry tomorrow.\n".red
exit
rescue ServerOverLoadError => err
print "damn!\n".red
puts "Server is under heavy load. Really?!\n".red
exit
rescue TorrentServerError => err
print "ouch!\n".red
puts "Torrent server 500 internal error.\n".red
exit
rescue TorrentNotFoundException => err
print "\b\b"
print "?".yellow
print ", "
rescue LinkRedirectException => err
print "fuck!\n".red
puts "Script broken!!!".red
exit
rescue Exception => err
puts "Error downloading torrent.\n".red
puts err.message
exit
end
# Download image for the torrent
next if File.exists? image_filename # Skip it if image exists.
retry_counts = 0
while retry_counts < MAX_RETRY
begin
open(image_link, :proxy => IMG_PROXY) do |inf|
open(image_filename, "w+") do |ouf|
ouf.write(inf.read)
end
end
rescue StandardError => err
retry_counts += 1
puts "Error downloading image: #{image_filename} (#{err.message}), retry.".red
puts "#{image_link} , retry...".red
else
break # Success if exec to here.
end
end
end
end
add_page(refererLink)
puts "done.".green # One daily list pagination finished
end
add_page(page_uri)
end
mainpage_index += 1 # One day finished.
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment