Skip to content

Instantly share code, notes, and snippets.

@joshrendek
Created September 4, 2010 19:45
Show Gist options
  • Save joshrendek/565437 to your computer and use it in GitHub Desktop.
Save joshrendek/565437 to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
# Script that downloads images from 4chan.
require 'rubygems'
require 'fileutils'
require 'net/http'
require 'active_record'
require 'hpricot'
require 'rest-open-uri'
# Destination directory for downloaded images: one sub-directory per day,
# named month-day-year, under the configured base path.
DIRECTORY = "/SOME/DIRECTORY/HERE/#{Time.now.strftime('%m-%d-%y')}"
# Create the directory (and any missing parents) without spawning a shell, so
# the path is never subject to shell word-splitting and a failure raises
# instead of being silently ignored.
FileUtils.mkdir_p(DIRECTORY)
# ActiveRecord model backed by the "history" table; the script stores one row
# per downloaded post so the same image is not fetched twice.
#
# Per the original author's note, the table has two columns:
#   id      - int, auto-increment
#   post_id - bigint(20), with a primary key
class History < ActiveRecord::Base
  # `set_table_name` was deprecated in Rails 3.2 and removed in 4.0; the
  # assignment form is the supported way to override the table name.
  self.table_name = "history"
end
# Request headers sent with the board page fetch: a desktop Firefox
# User-Agent, asking for UTF-8 HTML.
HDRS = {"User-Agent"=>"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 ", "Accept-Charset"=>"utf-8", "Accept"=>"text/html"}
# Boards to scrape; see the site for the list ("m" is mecha, like Gundam Wing).
# NOTE(review): nothing in the visible script reads CHANS -- the board in the
# URL below is hard-coded.
CHANS = ["m"]
# Fetch the board index page. The block form closes the response handle as
# soon as the body has been read, instead of leaving it open.
my_html = open("http://boards.4chan.org/m/", HDRS) { |page| page.read }
# Parsed document that the rest of the script searches for image links.
@web_doc = Hpricot(my_html)
# Settings for the MySQL database that ActiveRecord models in this script use.
db_settings = {
  :adapter  => "mysql",
  :host     => "localhost",
  :username => "root",
  :password => "test",
  :database => "4chan"
}
ActiveRecord::Base.establish_connection(db_settings)
# Walk every link on the board page that opens in a new window and download
# each linked image whose post id has not been recorded in History yet.
@web_doc.search('//a[@target=_blank]').each do |link|
  # to_s guards against anchors that carry no href attribute.
  url = link.attributes["href"].to_s
  post_img = url.split('/').last.to_s
  # The file name is expected to be "<post id>.<ext>"; to_i yields 0 for
  # anything non-numeric, which filters out links that are not images.
  post_id = post_img.split('.').first.to_i
  next unless post_id > 0

  p "Post ID: " + post_id.to_s
  if History.exists?(:post_id => post_id)
    p "Already in database, skipping."
    next
  end

  # Everything after "scheme://host/" is the path on the image server. A link
  # without that prefix has no such path; skip it rather than crash on nil.
  path_segments = url.split('/')[3..-1]
  next if path_segments.nil? || path_segments.empty?

  p "Downloading file: " + url
  Net::HTTP.start("images.4chan.org") do |http|
    resp = http.get("/" + path_segments.join('/'))
    if resp.is_a?(Net::HTTPSuccess)
      File.open("#{DIRECTORY}/#{post_img}", "wb") { |file| file.write(resp.body) }
      # Record the post only after the image is safely on disk, so a failed
      # download is retried on the next run instead of being skipped forever.
      History.create!(:post_id => post_id)
    else
      p "Download failed (HTTP #{resp.code}), not recorded: " + url
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment