Skip to content

Instantly share code, notes, and snippets.

@lcomplete
Created June 28, 2020 07:53
Show Gist options
  • Save lcomplete/59da9dad448c12a3387afa5232018b82 to your computer and use it in GitHub Desktop.
Save lcomplete/59da9dad448c12a3387afa5232018b82 to your computer and use it in GitHub Desktop.
ruby抓取图片(早期代码,已经无法使用)
#encoding: utf-8
require 'net/http'
require 'open-uri'
require 'nokogiri' # 用于解析html的模块
# sudo apt-get install libxslt-dev libxml2-dev
# sudo gem install nokogiri
require 'pathname'
class JanDanSpider
attr_accessor :base_uri, :cur_page # 定义属性访问器
def initialize(pagesize)
@base_uri = 'http://jandan.net/pic' # @表示实例变量、@@表示类变量、$表示全局变量
@dir = '/media/Develop/MyCode/SevenLang/ruby/pic'
@pagesize = Integer(pagesize) # 整型转换
end
def crawl()
Dir.mkdir @dir unless File.directory? @dir # 表判断的方法结尾都有个?
totalpage = crawlpage(0)
puts "pagesize #{totalpage}" # ""字符会引发字符串替换,''则不会
(1..@pagesize-1).each do |i| # 遍历元组
crawlpage(totalpage - i)
end
puts 'complete!'
end
def crawlpage(page)
url = page==0 ? @base_uri : @base_uri+'/page-'+page.to_s # to_s是必要的
puts "crawl-page: #{url}"
fpage = open(url)
html = fpage.read
doc = Nokogiri::HTML(html)
doc.css('ol.commentlist li').each { |comment|
match = /comment-(\d+)/.match(comment['id'])
if match
id = match[1]
oo = Integer(comment.css('#cos_support'+'-'+id)[0].content);
xx = Integer(comment.css('#cos_unsupport'+'-'+id)[0].content);
xx = 1 if xx==0
if(oo>xx && (oo>200 || oo/xx >10) )
src = comment.css('p img')[0]["src"]
puts "crawl: oo #{oo} xx #{xx} src #{src}"
save_pic(src)
end
end
}
if page==0
cur_page = doc.css(".current-comment-page")[0].content
page = Integer(/\d+/.match(cur_page).to_s)
end
puts "page #{page} done!"
page # ruby中每条语句都有返回值,函数内最后一条语句的返回值会被return
end
def save_pic(url)
urlpath = Pathname.new(url)
filename = urlpath.basename.to_s
dirpath = Pathname.new(@dir)
filepath = dirpath.join(filename).to_s
open(url) { |fin|
open(filepath,"wb") { |fout|
while buf = fin.read(4096) do
fout.write buf
end
}
} unless File.exists? filepath # 仅当文件不存在时进行抓取
puts 'done!'
end
end
if ARGV.length == 1
spider = JanDanSpider.new($*[0]) # 可从 ARGV 或 $* 读取命令行参数
spider.crawl()
else
puts 'please input pagesize' #tip: puts, 转义+换行符 print, 转义 p, 换行
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment