Created
June 28, 2020 07:53
-
-
Save lcomplete/59da9dad448c12a3387afa5232018b82 to your computer and use it in GitHub Desktop.
ruby抓取图片(早期代码,已经无法使用)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
require 'fileutils'
require 'net/http'
require 'open-uri'
require 'pathname'

require 'nokogiri' # HTML parsing library
# sudo apt-get install libxslt-dev libxml2-dev
# sudo gem install nokogiri
# Crawls the comment listing at +base_uri+ and saves the pictures of
# well-rated comments into a local directory.
#
# The crawl starts at the front page, reads the current page number from it,
# and then walks backwards through older pages.
class JanDanSpider
  DEFAULT_BASE_URI = 'http://jandan.net/pic'.freeze
  DEFAULT_DIR = '/media/Develop/MyCode/SevenLang/ruby/pic'.freeze

  # base_uri: listing URL the crawl starts from.
  # cur_page: page number parsed from the front page (nil until it has been crawled).
  attr_accessor :base_uri, :cur_page

  # @param pagesize [Integer, String] number of pages to crawl, front page included
  # @param dir [String] directory the pictures are written to
  # @param base_uri [String] URL of the front page of the listing
  # @raise [ArgumentError] if +pagesize+ cannot be converted to an Integer
  def initialize(pagesize, dir: DEFAULT_DIR, base_uri: DEFAULT_BASE_URI)
    @base_uri = base_uri
    @dir = dir
    @pagesize = Integer(pagesize)
  end

  # Crawls the front page and then the +pagesize - 1+ pages before it.
  def crawl
    # mkdir_p also creates missing parent directories and is a no-op when
    # the directory already exists.
    FileUtils.mkdir_p(@dir)
    totalpage = crawlpage(0)
    puts "pagesize #{totalpage}"
    (1...@pagesize).each do |i|
      page = totalpage - i
      # Page 0 means "front page" to crawlpage and negative pages do not
      # exist, so stop once the walk runs out of pages.
      break if page < 1

      crawlpage(page)
    end
    puts 'complete!'
  end

  # Downloads one listing page and saves the picture of every comment that
  # passes the rating filter.
  #
  # @param page [Integer] page number; 0 selects the front page
  # @return [Integer] the number of the page that was crawled (for page 0,
  #   the number read from the page itself)
  # @raise [RuntimeError] if the front page carries no current-page marker
  def crawlpage(page)
    url = page.zero? ? @base_uri : "#{@base_uri}/page-#{page}"
    puts "crawl-page: #{url}"
    # Kernel#open stopped handling URLs in Ruby 3.0; open-uri's URI.open is
    # the supported entry point.
    doc = Nokogiri::HTML(URI.open(url, &:read))

    doc.css('ol.commentlist li').each do |comment|
      match = /comment-(\d+)/.match(comment['id'])
      next unless match

      id = match[1]
      oo = vote_count(comment, "#cos_support-#{id}")
      xx = vote_count(comment, "#cos_unsupport-#{id}")
      next unless oo && xx

      xx = 1 if xx.zero? # avoid dividing by zero in the ratio test
      next unless popular?(oo, xx)

      img = comment.at_css('p img')
      src = img && img['src']
      next unless src # comment without a picture

      puts "crawl: oo #{oo} xx #{xx} src #{src}"
      save_pic(src)
    end

    if page.zero?
      marker = doc.at_css('.current-comment-page')
      number = marker && marker.content[/\d+/]
      raise "current page number not found at #{url}" unless number

      # to_i rather than Integer(): a leading zero must not be read as octal.
      page = number.to_i
      @cur_page = page
    end
    puts "page #{page} done!"
    page
  end

  # Saves the picture at +url+ into the target directory, named after the
  # last path segment of the URL. An already existing file is left untouched.
  #
  # @param url [String] picture URL taken from the crawled page
  def save_pic(url)
    filepath = File.join(@dir, File.basename(url))
    download(url, filepath) unless File.exist?(filepath)
    puts 'done!'
  end

  private

  # Reads the integer text of the first node matching +selector+ inside
  # +comment+; returns nil when the node is missing or not numeric.
  def vote_count(comment, selector)
    node = comment.at_css(selector)
    node && Integer(node.content, exception: false)
  end

  # Rating filter: more support than unsupport votes, and either more than
  # 200 support votes or an integer support/unsupport ratio above 10.
  def popular?(oo, xx)
    oo > xx && (oo > 200 || oo / xx > 10)
  end

  # Streams +url+ into +filepath+. The URL comes from scraped HTML, so it is
  # parsed first and only http(s) URLs are fetched; handing the raw string to
  # Kernel#open would run a command for input starting with "|".
  def download(url, filepath)
    uri = begin
      URI.parse(url)
    rescue URI::InvalidURIError
      nil
    end
    unless uri.is_a?(URI::HTTP)
      puts "skip non-http url: #{url}"
      return
    end

    uri.open do |fin|
      File.open(filepath, 'wb') { |fout| IO.copy_stream(fin, fout) }
    end
  end
end
# Script entry point: expects exactly one command-line argument, the number
# of pages to crawl. ARGV and $* refer to the same argument list.
if ARGV.size != 1
  puts 'please input pagesize'
else
  JanDanSpider.new(ARGV.first).crawl
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment