Created
December 8, 2010 22:16
-
-
Save tdtds/734013 to your computer and use it in GitHub Desktop.
Gets the text of MM9 from webmysteries.jp.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# This command is obsolete; see https://github.com/tdtds/aozoragen instead.
# | |
#!/usr/bin/env ruby | |
# | |
# Gets the text of a novel published on webmysteries.jp.
# | |
require 'open-uri' | |
require 'nokogiri' | |
require 'cgi' | |
# Downloads the text of a novel serialized on webmysteries.jp.
#
# Each index page embeds a Flash player whose FlashVars parameter carries
# one or more "entry=<id>" values; the text of every entry is then read from
# http://www.webmysteries.jp/entry_xml_data/<id>.xml.
class WebMysteries
  # Yields the text of every entry reachable from the given index pages.
  #
  # @param indexes [Array<String>, String] index page URLs (nested arrays
  #   are flattened)
  # @yieldparam text [String] tag-stripped text of one entry
  def get(*indexes, &block)
    indexes.flatten.each { |index| get_pages(index, &block) }
  end

  private

  # Walks an index page and every following page listed in its pager,
  # yielding the text of each entry found on them.
  #
  # @param index [String] URL of the first page
  # @yieldparam text [String] tag-stripped text of one entry
  def get_pages(index)
    html = fetch_html(index)
    # The first pager link is the page that was just fetched, so drop it.
    remaining = (html / 'ul.pageNavi a').map { |a| a.attr('href') }.drop(1)

    loop do
      entry_ids(html).each { |id| yield get_text(id) }
      next_url = remaining.shift
      # Stop at the end of the pager (or at a link without an href) instead
      # of relying on a TypeError raised by opening nil.
      break unless next_url

      html = fetch_html(next_url)
    end
  end

  # Fetches and parses one page.
  #
  # URI.open is used because open-uri no longer hooks Kernel#open on
  # Ruby 3.0 and later.
  #
  # @param url [String] page URL
  # @return [Object] the document produced by Nokogiri()
  def fetch_html(url)
    Nokogiri(URI.open(url, &:read))
  end

  # Extracts the entry ids from the first FlashVars parameter of a page.
  #
  # @param html [Object] parsed page
  # @return [Array<String>] entry ids; empty when the page has no FlashVars
  #   parameter
  def entry_ids(html)
    flash_vars = (html / 'noscript param[name="FlashVars"]').first
    return [] unless flash_vars

    flash_vars.attr('value').to_s.scan(/entry=(\d+)/).flatten
  end

  # Downloads one entry XML and returns its body text.
  #
  # The document is URL-decoded, every <entryBody> element is collected and
  # the markup inside it is stripped.
  #
  # @param xml_id [String] entry id taken from FlashVars
  # @return [String] concatenated text of all entry bodies ('' when none)
  def get_text(xml_id)
    xml = URI.open("http://www.webmysteries.jp/entry_xml_data/#{xml_id}.xml", &:read)
    CGI.unescape(xml)
       .scan(%r{<entryBody>(.*?)</entryBody>}m)
       .map { |(body)| body.gsub(/<.*?>/m, '') }
       .join
  end
end
# When run as a script, print the text of every index URL given on the
# command line.
if $PROGRAM_NAME == __FILE__
  WebMysteries.new.get(ARGV) { |text| puts text }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment