#!/usr/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true
#
# Scrapes the table-of-contents page of a "Shousetsuka ni Narou" (syosetu.com)
# novel from the Web Archive and writes it out in the same format as
# narou.rb's toc.yaml.
#
require 'yaml'
require 'nokogiri'
require 'open-uri'

# Web Archive snapshot timestamp that the script scrapes.
ARCHIVE_TIMESTAMP = '20131127132412'
# Original (non-archived) table-of-contents URL; stored as "toc_url" in the output.
TOC_URL = 'http://ncode.syosetu.com/n7145bl/'
# Archived copy of TOC_URL that is actually downloaded.
ARCHIVE_URL = "http://web.archive.org/web/#{ARCHIVE_TIMESTAMP}/#{TOC_URL}"
# Matches the Web Archive prefix in episode links; capture 1 is the site-relative path.
ARCHIVED_HREF = %r{/web/#{ARCHIVE_TIMESTAMP}/http://ncode\.syosetu\.com(.*$)}

# Downloads +url+ and parses the response body as HTML.
#
# URI.open is used instead of Kernel#open because open-uri no longer patches
# Kernel#open for URLs on Ruby 3.0+.
#
# @param url [String] page to download
# @return [Nokogiri::HTML::Document] parsed page
# @raise [StandardError] whatever open-uri or Nokogiri raise on failure
def fetch_document(url)
  Nokogiri.HTML(URI.open(url, &:read))
end

# Extracts the publication date and the last-revision date of one episode.
#
# When the cell contains a <span>, the revision date is taken from that span's
# title attribute; otherwise the revision date equals the publication date.
#
# @param update_cell [Nokogiri::XML::Node] the td[@class='long_update'] cell
# @return [Array(String, String)] [subdate, subupdate]
def episode_dates(update_cell)
  if update_cell.at(".//span")
    span = update_cell.xpath(".//span[@title]")[0]
    subupdate = span.get_attribute("title").sub(/(.*) 改稿/, '\1')
    subdate = update_cell.inner_text.sub(/^\r\n(.*)\r\n\r\n(改)\r\n/, '\1')
    [subdate, subupdate]
  else
    subdate = update_cell.inner_text.sub(/\r\n(.*)\r\n/, '\1')
    [subdate, subdate]
  end
end

# Builds the "subtitles" array of the TOC from the rows of the episode table.
#
# A chapter heading row only sets the chapter name; the name is attached to the
# first episode that follows it, and every other episode gets an empty chapter.
# Rows that are neither a chapter heading nor contain an episode link are
# skipped instead of aborting the whole run.
#
# @param doc [Nokogiri::HTML::Document] parsed table-of-contents page
# @return [Array<Hash>] one hash per episode, indexed from 1
def build_subtitles(doc)
  subtitles = []
  pending_chapter = nil

  doc.xpath("//div[@class='novel_sublist']/table/tr").each do |row|
    if row.at(".//td[@class='chapter']")
      pending_chapter = row.xpath(".//td[@class='chapter']").inner_text
      next
    end

    link = row.xpath(".//td[@class='period_subtitle']/a[@href]")[0]
    next unless link

    subtitle = link.inner_text
    subdate, subupdate = episode_dates(row.xpath(".//td[@class='long_update']")[0])

    subtitles.push({
      "index" => subtitles.length + 1,
      # Strip the Web Archive prefix so the href is relative to the original site.
      "href" => link.get_attribute("href").sub(ARCHIVED_HREF, '\1'),
      "chapter" => pending_chapter || "",
      "subchapter" => "",
      "subtitle" => subtitle,
      "file_subtitle" => subtitle,
      "subdate" => subdate,
      "subupdate" => subupdate,
      "download_time" => Time.now
    })
    pending_chapter = nil
  end

  subtitles
end

begin
  doc = fetch_document(ARCHIVE_URL)
rescue StandardError => e
  # Nothing can be scraped without the page, so report the cause and stop.
  warn "Nokogiri aborted: #{e.class}: #{e.message}"
  exit 1
end

begin
  toc = {
    "title" => doc.xpath("//div[@class='novel_title']/div/a").inner_text,
    "author" => doc.xpath("//div[@class='novel_writername']/a[@href]").inner_text,
    "toc_url" => TOC_URL,
    "story" => doc.xpath("//div[@class='novel_ex']").inner_text,
    "subtitles" => build_subtitles(doc)
  }

  if toc["subtitles"].empty?
    # No episode rows were found, so no file is produced.
    warn "no episodes found in the archived page; toc.yaml was not written"
  else
    # Written once, after all rows parsed, so a failure never leaves a partial TOC.
    File.open("toc.yaml", 'w') { |file| YAML.dump(toc, file) }
  end
rescue StandardError => e
  p "something wrong"
  warn "#{e.class}: #{e.message}"
end