Created
June 12, 2016 04:49
-
-
Save atuyosi/decc10eb454b5061f8739cd67356b81d to your computer and use it in GitHub Desktop.
対象のはてなブログの各エントリにおいて、見出しにh1タグが使われているURLを見つけ出すやっつけ作業スクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
require 'open-uri'
require 'nokogiri'
# --- Script configuration -------------------------------------------------
# Base URL of the Hatena Blog to scan (replace the placeholder before running).
baseurl = "対象のはてなブログURL"
# Years whose monthly archives are scanned; add more as needed.
year = [2015, 2016]
# Every month of the year, 1 through 12.
month = (1..12).to_a
# Root of the blog's archive pages; "/<year>/<month>" is appended per request.
arc_url = "#{baseurl}/archive"
# Accumulates the entry URLs discovered on the archive pages.
page_url_list = []
# Extracts the entry links from an archive page.
#
# Looks up the <a> elements matched by the title XPath below and collects
# their href attribute. The attribute is read by name rather than by position
# so the result does not depend on the order of attributes in the markup;
# anchors without an href are skipped.
#
# @param html [String] HTML source of an archive page
# @return [Array<String>] href values of the matched anchors, in document order
def parse_html(html)
  doc = Nokogiri::HTML.parse(html)
  anchors = doc.xpath('//*[@id="main-inner"]/div/section/div/h1/a')
  anchors.map { |anchor| anchor['href'] }.compact
end
# Fetches a single blog entry and prints its URL when the entry body contains
# an <h1> element.
#
# An HTTP error while fetching is reported on stdout together with the
# offending URL, and the entry is then skipped instead of being parsed.
#
# @param url [String] URL of the entry page to inspect
# @return [nil]
def check_entry(url)
  html =
    begin
      # URI.open is used explicitly: Kernel#open no longer opens URLs on Ruby 3.0+.
      URI.open(url, &:read)
    rescue OpenURI::HTTPError => e
      puts "Error #{e} : #{url}"
      return
    end

  doc = Nokogiri::HTML.parse(html)
  body_headings = doc.xpath('//div[@class="entry-content"]/h1')
  # Only <h1> elements directly under div.entry-content are matched, so a
  # non-empty result means an h1 other than the entry title is present.
  puts url unless body_headings.empty?
end
# Walk the monthly archive page of every configured year/month and collect
# the entry URLs listed there.
year.each do |y|
  month.each do |m|
    src_url = "#{arc_url}/#{y}/#{m}"
    html = nil
    begin
      # URI.open is used explicitly: Kernel#open no longer opens URLs on Ruby 3.0+.
      html = URI.open(src_url, &:read)
    rescue OpenURI::HTTPError => e
      puts "Error #{e} : #{src_url}"
    end

    # A month without entries shows the notice "この期間に記事はありません"
    # ("there are no entries in this period"); such pages are ignored.
    if html && !html.include?("この期間に記事はありません")
      page_url_list.concat(parse_html(html))
    end

    sleep 0.3 # throttle requests so the crawl does not act like a DoS
  end
end

# Inspect every collected entry and print those that use <h1> in their body.
puts "check h1 tags"
page_url_list.each { |url| check_entry(url) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment