Created
April 30, 2014 11:20
-
-
Save miwarin/58c34b12f8ff3b65d3d0 to your computer and use it in GitHub Desktop.
hatenablog をスクレイピング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# Scrape a hatenablog blog and print the text of its entries.
#
# Usage:
#   hatenablog.rb <hatenablog URI> [category]
#
# Examples:
#   ruby hatenablog.rb http://jkondo.hatenablog.com/ > jkondo.txt
#
#   ruby hatenablog.rb http://dennou-kurage.hatenablog.com/ 仕事観 > kurage.txt
#
require 'mechanize' | |
require 'uri' | |
require 'pp' | |
# Collects the entry text of a hatenablog page and of every page after it.
#
# Fetches +uri+ with Mechanize, appends the text of each child node of the
# first <div class="entry-content"> found on the page, then follows the link
# whose text is '次のページ' ("next page") until no such link is present.
#
# @param uri [String] URI of the first page to fetch
# @return [String] text gathered from all visited pages, in visit order
def get_text(uri)
  agent = Mechanize.new
  agent.get(uri)
  texts = ""
  loop do
    # NOTE(review): `at` yields only the first matching div; if a listing page
    # is expected to contribute several entries, confirm whether `search`
    # should be used here instead.
    content = agent.page.at("//div[@class='entry-content']")
    # A page without an entry body contributes nothing; previously the nil
    # result raised NoMethodError and aborted the whole crawl.
    content.children.each { |node| texts << node.text } if content

    # Look the pagination link up once and reuse it for both test and click.
    next_link = agent.page.link_with(:text => '次のページ')
    break unless next_link

    next_link.click
  end
  texts
end
# Command-line entry point: builds the URI to scrape and prints its text.
#
# When a category is given, "category/<escaped category>" is appended to the
# base URI before the text is fetched with get_text.
#
# @param argv [Array<String>] argv[0] is the blog's base URI; the optional
#   argv[1] is a category name
# @return [nil]
def main(argv)
  uri_base, category = argv
  # Without a base URI the old code crashed on `nil + String`.
  abort 'usage: hatenablog.rb <hatenablog URI> [category]' unless uri_base

  uri = uri_base
  if category
    # URI::encode was removed in Ruby 3.0. encode_www_form_component emits '+'
    # for a space (form encoding); inside a path that must be '%20'. A literal
    # '+' in the input is already escaped to '%2B', so the gsub is safe.
    escaped = URI.encode_www_form_component(category).gsub('+', '%20')
    # Tolerate a base URI given without its trailing slash.
    separator = uri_base.end_with?('/') ? '' : '/'
    uri = "#{uri_base}#{separator}category/#{escaped}"
  end

  puts get_text(uri)
end
# Run the scraper only when this file is executed directly, so that loading
# it with `require` does not trigger a crawl.
if __FILE__ == $PROGRAM_NAME
  main(ARGV)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment