|
=begin
illust image URL pattern list

single
  page:     http://www.pixiv.net/member_illust.php?mode=medium&illust_id=55266709
  small:    http://i2.pixiv.net/c/150x150/img-master/img/2016/02/14/02/07/22/55266709_p0_master1200.jpg
  middle:   http://i2.pixiv.net/c/600x600/img-master/img/2016/02/14/02/07/22/55266709_p0_master1200.jpg
  original: http://i2.pixiv.net/img-original/img/2016/02/14/02/07/22/55266709_p0.jpg

multi
  page:     http://www.pixiv.net/member_illust.php?mode=medium&illust_id=54375745
  small:    http://i2.pixiv.net/c/150x150/img-master/img/2016/01/01/02/19/41/54375745_p0_master1200.jpg
  middle:   http://i2.pixiv.net/c/600x600/img-master/img/2016/01/01/02/19/41/54375745_p0_master1200.jpg
  original: http://i2.pixiv.net/c/1200x1200/img-master/img/2016/01/01/02/19/41/54375745_p0_master1200.jpg
=end
|
|
|
require 'mechanize' |
|
require 'logger' |
|
|
|
module Scrape
  # Scrapes pixiv illustration listings through a logged-in Mechanize agent.
  #
  # An instance logs in on construction, then navigates list pages and
  # individual illustration pages to collect ids, titles, captions and
  # image URLs. Page fetches that fail with an HTTP error are reported as
  # +false+ by #get_page; the public methods below treat that as
  # "no data" instead of calling node methods on +false+.
  class Pixiv
    BASE_URL = 'http://www.pixiv.net'.freeze
    LOGIN_URL = "#{BASE_URL}/login.php".freeze
    MEMBER_URL = "#{BASE_URL}/member.php".freeze
    MEMBER_ILLUST_URL = "#{BASE_URL}/member_illust.php".freeze

    # NOTE(review): :member_ilust_q_url looks like a misspelling of the
    # #member_illust_q_url method; the reader is kept so existing callers
    # of that name keep working.
    attr_reader :agent, :login_id, :login_pass, :current_url, :member_ilust_q_url

    # Builds the agent and logs in.
    #
    # @param args [Hash] every non-nil entry becomes an instance variable
    #   named after its key (the script passes :login_id and :login_pass).
    # Prints a message and exits the process when #login reports failure.
    def initialize(args)
      args.each do |key, value|
        instance_variable_set("@#{key}", value) unless value.nil?
      end
      @agent = Mechanize.new
      # NOTE(review): this sets the literal User-Agent string; Mechanize's
      # alias API is `user_agent_alias=` - confirm which one is intended.
      @agent.user_agent = 'Mac Safari'
      # @agent.log = Logger.new $stderr
      return if login

      puts 'You failed login. Please try again'
      exit
    end

    # @param user_id [String, Integer] pixiv member id
    # @return [String] URL of that member's illustration list
    # @raise [ArgumentError] when user_id is nil
    def member_illust_q_url(user_id)
      raise ArgumentError, 'user_id is required' if user_id.nil?

      "#{MEMBER_ILLUST_URL}?id=#{user_id}"
    end

    # @return [String] illustration list URL of the logged-in member
    # @raise [ArgumentError] when the member id cannot be determined
    def my_illust_list_url
      member_illust_q_url(get_my_member_id)
    end

    # Navigates to +url+ and records it as the current URL.
    #
    # @return [Boolean] false when the server answers with an error code
    def move_page(url, &block)
      @agent.get(url)
      @current_url = @agent.page.uri.to_s
      true
    rescue Mechanize::ResponseCodeError
      false
    end

    # Follows the "next" pager link found on +url+.
    #
    # The list page is fetched again first because other methods navigate
    # the shared agent away from it.
    #
    # @return [Boolean] true when a next page existed and was opened
    def move_next_page(url = @current_url)
      page = get_page(url)
      return false unless page

      anchor = page.at('span.next > a')
      return false unless anchor

      next_page = Mechanize::Page::Link.new(anchor, @agent, page).click
      @current_url = next_page.uri.to_s
      true
    end

    # Fetches +url+, or returns the agent's current page when +url+ is nil.
    #
    # @return [Mechanize::Page, false, nil] false on an HTTP error response
    def get_page(url, &block)
      url ? @agent.get(url) : @agent.page
    rescue Mechanize::ResponseCodeError
      false
    end

    # Submits the login form with the stored credentials.
    #
    # @return [String, false] the URL reached after submitting, or false
    #   when the login page or its form could not be obtained
    def login
      page = get_page(LOGIN_URL)
      form = page && page.form_with(action: '/login.php')
      return false unless form

      form.field_with(name: 'pixiv_id').value = @login_id
      form.field_with(name: 'pass').value = @login_pass
      form.submit
      @current_url = @agent.page.uri.to_s
      # TODO: does the page lack the .home element in a headless browser?
      # After repeated wrong inputs the user is sent to a different page.
      # Idea: judge login success by whether the "home" button exists.
      # return page.at('.home') ? true : false
    end

    # @return [Boolean] whether the page carries the multi-image marker
    def is_multi_illust?(illust_url = nil)
      page = get_page(illust_url)
      page ? multi_page?(page) : false
    end

    # @return [Boolean] whether the page is neither multi-image nor ugoku
    def is_single_illust?(illust_url = nil)
      page = get_page(illust_url)
      page ? illust_kind_of(page) == :single : false
    end

    # @return [Boolean] whether the page carries the ugoku player container
    def is_ugoku_illust?(illust_url = nil)
      page = get_page(illust_url)
      page ? ugoku_page?(page) : false
    end

    # Classifies an illustration page with a single fetch.
    #
    # @return [Symbol, false] :multi, :single or :ugoku; false when the
    #   page could not be fetched
    def kind_of_illust(illust_url = nil)
      page = get_page(illust_url)
      page ? illust_kind_of(page) : false
    end

    # @return [String, nil] digits after "id=" in the profile URL shown on
    #   the member page, or nil when they cannot be found
    def get_my_member_id
      page = get_page(MEMBER_URL)
      profile = page && page.at('.profile-url')
      return nil unless profile

      profile[:value].to_s[/id=(\d+)/, 1]
    end

    # Replaces the value of the "mode=" query parameter in +url+.
    #
    # @param url [String] defaults to the current URL when only +mode+ is given
    # @param mode [String, Symbol] new mode value, e.g. "manga"
    # @return [String]
    def get_change_mode_url(url = @current_url, mode)
      url.gsub(/(?<=mode=)\w+/, mode.to_s)
    end

    # @return [String, nil] digits following "illust_id=" in +url+
    def get_illust_id(url)
      url[/illust_id=(\d+)/, 1]
    end

    # Collects one info hash per illustration listed on a list page.
    #
    # Each illustration page is fetched once and reused for its kind,
    # caption and image URLs.
    #
    # @param illust_list_url [String] list page to read
    # @param add_count_info [Boolean] also include the rating/score/
    #   comments/views/bookmark counters shown in the list
    # @return [Array<Hash>] empty when the list page cannot be fetched
    def get_illust_info(illust_list_url = @current_url, add_count_info = false)
      list_page = get_page(illust_list_url)
      return [] unless list_page

      list_page.search('li.image-item').each_with_object([]) do |item, collected|
        link = item.at('a._work')
        next unless link && link[:href]

        url = File.join(BASE_URL, link[:href])
        illust_page = get_page(url)
        info = {
          id: get_illust_id(url),
          page_url: url,
          title: item.at('h1.title')&.text,
          kind: illust_page ? illust_kind_of(illust_page) : false,
          caption: illust_page ? caption_of(illust_page) : nil,
          image_urls: illust_page ? original_image_urls_of(illust_page, url) : nil
        }
        info.merge!(count_info_of(item)) if add_count_info
        collected << info
      end
    end

    # Collects info from at most +until_next_page_num+ consecutive list
    # pages starting at +illust_list_url+. The current URL is restored
    # afterwards.
    #
    # @return [Array<Hash>]
    def get_illust_info_until(until_next_page_num, illust_list_url = @current_url, add_count_info = false)
      collect_illust_info(illust_list_url, add_count_info, until_next_page_num)
    end

    # Collects info from +illust_list_url+ and every following list page.
    # The current URL is restored afterwards.
    #
    # @return [Array<Hash>]
    def get_illust_info_until_last_page(illust_list_url = @current_url, add_count_info = false)
      collect_illust_info(illust_list_url, add_count_info, nil)
    end

    # @return [String, nil] caption text with <br> turned into newlines;
    #   nil when the illustration has no caption or the page is unavailable
    def get_illust_caption(illust_url)
      page = get_page(illust_url)
      page ? caption_of(page) : nil
    end

    # @return [Array<String>, String, nil] list of image URLs for a multi
    #   illustration, a single URL string for a single illustration, nil
    #   for ugoku or when the page is unavailable
    def get_original_image_urls(illust_url = nil)
      page = get_page(illust_url)
      return nil unless page

      # Fall back to the page's own URI so the manga URL can still be
      # derived when no explicit URL was given.
      original_image_urls_of(page, illust_url || page.uri.to_s)
    end

    # Derives the original image URL from a resized thumbnail URL by
    # dropping the "/c/<W>x<H>" prefix, switching img-master to
    # img-original and removing the "_master..." suffix before ".jpg".
    #
    # @return [String]
    def get_original_single_image_url(single_illust_thumnail_url)
      single_illust_thumnail_url
        .sub(%r{/c/\d+x\d+(?=/)}, '')
        .sub('img-master', 'img-original')
        .sub(/_master.+?(?=\.jpg)/, '')
    end

    # @return [Array<String>] data-src of every image on the manga page;
    #   empty when the page is unavailable
    def get_original_image_urls_from_manga(manga_url)
      page = get_page(manga_url)
      return [] unless page

      page.search('div.item-container img').map { |img| img['data-src'] }
    end

    # def save_image(url, dist_dir, as_name=nil)
    #   as_name ||= File.basename(url, "?*")
    #   dist_path = File.join(dist_dir, as_name)
    #   @agent.get(url, [], @current_url).save_as(dist_path)
    #   #@agent.get(url, @current_url) do |page|
    #   #  save_as(dist_path)
    #   #end
    # end

    private

    # True when the fetched page has the multi-image thumbnail marker.
    def multi_page?(page)
      !page.at('div._layout-thumbnail div.multiple').nil?
    end

    # True when the fetched page has the ugoku player container.
    def ugoku_page?(page)
      !page.at('div._ugoku-illust-player-container').nil?
    end

    # Classifies an already fetched page; multi takes precedence over ugoku.
    def illust_kind_of(page)
      return :multi if multi_page?(page)
      return :ugoku if ugoku_page?(page)

      :single
    end

    # Extracts the caption from an already fetched illustration page.
    def caption_of(page)
      # Some illustrations have no caption.
      caption_node = page.at('div.ui-expander-target > p.caption')
      return nil if caption_node.nil?

      caption_node.css('br').each { |br| br.replace("\n") }
      caption_node.text
    end

    # Resolves the original image URL(s) for an already fetched page.
    def original_image_urls_of(page, illust_url)
      case illust_kind_of(page)
      when :multi
        get_original_image_urls_from_manga(get_change_mode_url(illust_url, 'manga'))
      when :single
        thumbnail = page.at('div._layout-thumbnail img')
        src = thumbnail && thumbnail[:src]
        src && get_original_single_image_url(src)
      end
      # nil for ugoku as well
    end

    # Reads the counters shown next to one list item.
    def count_info_of(item)
      {
        rating_count: item.at('a.rating-count > span.count')&.text,
        score: item.at('a.score > span.count')&.text,
        comments: item.at('a.comments > span.count')&.text,
        views: item.at('a.views > span.count')&.text,
        # Not rendered when the illustration has zero bookmarks.
        bookmark_count: item.at('a.bookmark-count')&.text
      }
    end

    # Walks list pages from +illust_list_url+, gathering info from each.
    # +max_pages+ of nil means "until there is no next page".
    def collect_illust_info(illust_list_url, add_count_info, max_pages)
      previous_url = @current_url
      collected = []
      remaining = max_pages
      return collected if remaining && remaining <= 0
      return collected unless move_page(illust_list_url)

      loop do
        collected.concat(get_illust_info(@current_url, add_count_info))
        if remaining
          remaining -= 1
          break if remaining <= 0
        end
        break unless move_next_page
      end
      collected
    ensure
      @current_url = previous_url
    end
  end
end
|
|
|
# Command-line entry point.
#
# Usage: PIXIV_LOGIN_ID=... PIXIV_LOGIN_PASS=... ruby <this file> OUTPUT_YAML [true]
#
# Logs in, walks every page of the logged-in member's illustration list and
# writes the collected info to OUTPUT_YAML. Passing the literal string
# "true" as the second argument adds the per-illustration counters.
#
# Guarded so that requiring this file for the Scrape::Pixiv class does not
# run (or exit from) the script. Usage errors go to stderr with a non-zero
# exit status via Kernel#abort.
if __FILE__ == $PROGRAM_NAME
  require 'yaml'

  abort 'PIXIV_LOGIN_ID is undefined' if ENV['PIXIV_LOGIN_ID'].nil?
  abort 'PIXIV_LOGIN_PASS is undefined' if ENV['PIXIV_LOGIN_PASS'].nil?

  output_path = ARGV.shift
  abort 'require arg1: output yaml path' unless output_path

  meta_option = ARGV.shift == 'true'

  pixiv = Scrape::Pixiv.new(login_id: ENV['PIXIV_LOGIN_ID'], login_pass: ENV['PIXIV_LOGIN_PASS'])
  illust_info = pixiv.get_illust_info_until_last_page(pixiv.my_illust_list_url, meta_option)
  File.write(output_path, illust_info.to_yaml)
end