Skip to content

Instantly share code, notes, and snippets.

@itkq
Last active January 20, 2019 05:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save itkq/11d9953777cc2a2ba5f433cf8a60f408 to your computer and use it in GitHub Desktop.
Save itkq/11d9953777cc2a2ba5f433cf8a60f408 to your computer and use it in GitHub Desktop.
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'nokogiri'
# Crawls the article listing under /column/color/ on BASE_URL and downloads the
# images of every listed article, at a chosen resolution, into one
# sub-directory per article date.
class KonoiroIinaCollector
  BASE_URL = 'https://webnewtype.com'.freeze
  DEFAULT_RESOLUTION = 'w3600h2700'.freeze
  # Trailing "/w<digits>h<digits>/" path segment that is stripped from an image
  # source before the requested resolution is appended.
  RESOLUTION_SUFFIX = %r{/w\d+h\d+/\z}.freeze

  # Convenience entry point: builds a collector and downloads everything.
  #
  # @param save_dir [String, nil] root output directory (nil => './images')
  # @param resolution [String, nil] resolution path segment (nil => DEFAULT_RESOLUTION)
  def self.save_all_images(save_dir = nil, resolution = nil)
    new(save_dir, resolution).save_all_images
  end

  # @param save_dir [String, nil] root output directory (nil => './images')
  # @param resolution [String, nil] resolution path segment (nil => DEFAULT_RESOLUTION)
  def initialize(save_dir, resolution)
    @save_dir = save_dir || './images'
    @resolution = resolution || DEFAULT_RESOLUTION
    create_dir(@save_dir)
  end

  # Creates +dir+ (and any missing parents); does nothing when it already exists.
  #
  # @param dir [String]
  def create_dir(dir)
    FileUtils.mkdir_p(dir)
  end

  # Walks every listed article, logs it and downloads its images.
  def save_all_images
    enumerate_article do |href, date|
      puts "#{date}: #{href}"
      save_images_in_article(href, date)
    end
  end

  # Downloads all images of one article into "<save_dir>/<date>/".
  #
  # @param href [String] article path as found in the listing
  # @param date [String] normalized article date, used as directory name
  # @param sleep_sec [Numeric] pause after each image, to throttle requests
  def save_images_in_article(href, date, sleep_sec: 0.5)
    create_dir(File.join(@save_dir, date))
    enumerate_image_sources(href) do |src|
      save_image(date, src)
      sleep(sleep_sec)
    end
  end

  # Downloads a single image unless the target file already exists.
  #
  # @param date [String] normalized article date (sub-directory name)
  # @param src [String] image source as found in the article page
  # @raise [RuntimeError] when +src+ does not point at a '.jpg'
  def save_image(date, src)
    target_src = image_source_with_resolution(src)
    filename = File.join(@save_dir, date, image_file_name(src))
    print target_src
    if File.exist?(filename)
      puts " => already exists (#{filename})"
      return
    end

    # Resolve against BASE_URL and go through URI#open (open-uri) rather than
    # bare Kernel#open, so a scraped string can never be treated as a local
    # path or command, and the download also works on Ruby 3+.
    URI.join(BASE_URL, target_src).open do |img|
      File.binwrite(filename, img.read)
    end
    puts " => saved (to #{filename})"
  end

  # Builds the local file name "<name>_<resolution><ext>" for an image source.
  # Only the last extension is split off, so names containing extra dots
  # (e.g. "a.b.jpg") keep their full base name.
  #
  # @param src [String]
  # @return [String]
  def image_file_name(src)
    base = File.basename(src.sub(RESOLUTION_SUFFIX, ''))
    ext = File.extname(base)
    "#{File.basename(base, ext)}_#{@resolution}#{ext}"
  end

  # Replaces any trailing resolution segment of +src+ with the configured one.
  #
  # @param src [String]
  # @return [String] "<image>.jpg/<resolution>/"
  # @raise [RuntimeError] when the stripped source does not end with '.jpg'
  def image_source_with_resolution(src)
    genuine_img_src = src.sub(RESOLUTION_SUFFIX, '')
    assert_genuine_image_source!(genuine_img_src)
    "#{genuine_img_src}/#{@resolution}/"
  end

  # @param genuine_img_src [String] image source without resolution suffix
  # @raise [RuntimeError] when the source does not end with '.jpg'
  def assert_genuine_image_source!(genuine_img_src)
    return if genuine_img_src.end_with?('.jpg')

    raise "#{genuine_img_src} is not genuine image source"
  end

  # Yields the +src+ attribute of every image found in the image area of the
  # article's "<article_href>1/" page.
  #
  # @param article_href [String]
  # @yieldparam src [String, nil]
  def enumerate_image_sources(article_href)
    url = URI.join(BASE_URL, "#{article_href}1/")
    images = fetch_document(url).css('div.related_imgArea').css('div.imgBox > img')
    images.each do |img|
      yield(img.attr('src'))
    end
  end

  # Yields (href, normalized date) for every article of the listing, page by
  # page. Stops at the first page without links or the first entry whose title
  # is empty.
  #
  # @yieldparam href [String]
  # @yieldparam date [String] yyyy-mm-dd
  def enumerate_article
    page = 1
    loop do
      url = URI.join(BASE_URL, "/column/color/p#{page}/")
      links = fetch_document(url).css('div.listBox > ul > li > a')
      # An empty page would otherwise be requested forever.
      return if links.empty?

      links.each do |a|
        title = a.css('p.columnTitle').text.strip
        return if title.empty?

        href = a.attr('href')
        date = a.css('span.columnDate').text.strip
        yield(href, normalize_date(date))
      end
      # Advance once per page, not once per article.
      page += 1
    end
  end

  # Converts a date starting with "yyyy年mm月dd日" into "yyyy-mm-dd".
  #
  # @param date [String]
  # @return [String]
  # @raise [ArgumentError] when +date+ does not start with the expected format
  def normalize_date(date)
    matched = date.match(/\A(\d{4})年(\d{2})月(\d{2})日/)
    raise ArgumentError, "unexpected date format: #{date.inspect}" unless matched

    matched[1..3].join('-')
  end

  private

  # Fetches +url+ and parses the response body as HTML.
  #
  # @param url [URI]
  # @return [Nokogiri::HTML::Document]
  # @raise [Net::HTTPExceptions] via Net::HTTPResponse#value on non-2xx responses
  def fetch_document(url)
    res = Net::HTTP.get_response(url)
    res.value
    Nokogiri::HTML(res.body)
  end
end
# Script entry point: runs as soon as this file is loaded. No arguments are
# passed, so the collector falls back to its defaults ('./images' as the output
# directory and DEFAULT_RESOLUTION as the image resolution).
KonoiroIinaCollector.save_all_images
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment