Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# -*- coding: utf-8 -*-
require 'open-uri'
require 'nokogiri'
# Baseline parse: downloads +url+ with open-uri and passes the body straight
# to Nokogiri, giving it neither the source URL nor an encoding hint.
#
# @param url [String] address of the page to fetch
# @return [Object] whatever Nokogiri() returns for the downloaded body
def before(url)
  Nokogiri(URI.parse(url).read)
end
# Downloads +url+ and parses it with Nokogiri, passing an explicit encoding.
#
# When open-uri reports "iso-8859-1" its charset detection has most likely
# failed (that value is open-uri's fallback for text responses that declare
# no charset), so the charset named most often inside the HTML itself is
# handed to Nokogiri instead. If the HTML names no charset at all, the value
# reported by open-uri is kept rather than raising.
#
# @param url [String] address of the page to fetch
# @return [Object] whatever Nokogiri() returns for the body, url and charset
def after(url)
  io = URI.parse(url).read
  charset = io.charset
  if charset == "iso-8859-1"
    # Count every charset=... declaration in the body. Names are compared
    # case-insensitively, and an optional single or double quote is skipped so
    # the capture holds only the charset token itself.
    counts = Hash.new(0)
    io.scan(/charset=["']?([\w.:+-]+)/i).flatten.each do |name|
      counts[name.downcase] += 1
    end
    most_common, = counts.max_by { |_name, hits| hits }
    # max_by yields nil for an empty tally; keep open-uri's answer in that case.
    charset = most_common if most_common
  end
  Nokogiri(io, url, charset)
end
# Demo: for each sample page, print the <title> text produced by the plain
# parse (before) and then by the charset-aware parse (after).
[
  'http://www.nicovideo.jp/watch/1297306177',
  'http://todeskin.g.hatena.ne.jp/eigokun/20110213/1297573835'
].each do |page_url|
  puts 'before'
  p before(page_url).at('title').content
  puts 'after'
  p after(page_url).at('title').content
end
# before
# "ã\u0082«ã\u0083¼ã\u0083\u0089ã\u0083\u0095ã\u0082¡ã\u0082¤ã\u0083\u0088!! ã\u0083´ã\u0082¡ã\u0083³ã\u0082¬ã\u0083¼ã\u0083\u0089ã\u0080\u0080第6話ã\u0080\u008Cè¬\u008Eã\u0081®ã\u0082«ã\u0083¼ã\u0083\u0089ã\u0082·ã\u0083§ã\u0083\u0083ã\u0083\u0097ã\u0080\u008D â\u0080\u0090 ã\u0083\u008Bã\u0082³ã\u0083\u008Bã\u0082³å\u008B\u0095ç\u0094»(å\u008E\u009F宿)"
# after
# "カードファイト!! ヴァンガード 第6話「謎のカードショップ」 ‐ ニコニコ動画(原宿)"
# before
# "復讐 - eigokunの手記 - - -"
# after
# "復讐 - eigokunの手記 - - -"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment