Skip to content

Instantly share code, notes, and snippets.

@kkosuge
Last active March 26, 2016 18:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kkosuge/61b3d029a4f4b74f6244 to your computer and use it in GitHub Desktop.
Save kkosuge/61b3d029a4f4b74f6244 to your computer and use it in GitHub Desktop.
ウェブサイト記事の本文に含まれる画像URLを抽出するやつ
# Gemfile
gem "extract_content_images", git: "https://gist.github.com/61b3d029a4f4b74f6244.git", require: "extract_content_images"
require "extract_content_images"

url = 'http://blog.kksg.net/posts/%E5%B0%8F%E7%AC%A0%E5%8E%9F%E8%AB%B8%E5%B3%B6'
html = open(url).read
images = ExtractContentImages.analyse(html, url: url)

p images #=> ["http://farm8.staticflickr.com/7295/10513932323_11d5742d7a.jpg", "http://farm4.staticflickr.com/3713/10513941743_9766db4d99.jpg", "http://farm8.staticflickr.com/7405/10513732496_23fab362b1.jpg", "http://farm6.staticflickr.com/5509/10513709075_b49c961b13.jpg", "http://farm4.staticflickr.com/3703/10513715785_8760383036.jpg", "http://farm4.staticflickr.com/3700/10513754444_3f55c2f376.jpg", "http://farm8.staticflickr.com/7324/10513731094_2bc41d2dff.jpg", "http://farm8.staticflickr.com/7348/10513708725_724c3bc511.jpg", "http://farm4.staticflickr.com/3671/10513759744_a7c80f1fa3.jpg", "http://farm8.staticflickr.com/7421/10513928943_0cdb6ce071.jpg", "http://farm6.staticflickr.com/5490/10513733806_dbb0e7b252.jpg", "http://farm4.staticflickr.com/3701/10513727616_9171f0ee8b.jpg", "http://farm4.staticflickr.com/3693/10513750124_67df4dc7e4.jpg", "http://farm6.staticflickr.com/5547/10513926413_884f412077.jpg", "http://farm4.staticflickr.com/3749/10513725766_43d6507737.jpg"] 
# -*- coding: utf-8 -*-
# ExtractContent for Ruby 1.9, 2.x
# modified by mono and kkosuge
# ExtractContent for Ruby 1.9+ https://github.com/mono0x/extractcontent
# Author:: Nakatani Shuyo
# Copyright:: (c)2007 Cybozu Labs Inc. All rights reserved.
# License:: BSD
# Extract Content Module for html
# ExtractContent : 本文抽出モジュール
#
# 与えられた html テキストから本文と思わしきテキストを抽出します。
# - html をブロックに分離、スコアの低いブロックを除外
# - 評価の高い連続ブロックをクラスタ化し、クラスタ間でさらに比較を行う
# - スコアは配置、テキスト長、アフィリエイトリンク、フッタ等に特有のキーワードが含まれているかによって決定
# - Google AdSense Section Target ブロックには本文が記述されているとし、特に抽出
require 'cgi'
require 'uri'
require 'open-uri'
require 'nokogiri'
module ExtractContent
# Default option parameters.
@default = {
:threshold => 100, # 本文と見なすスコアの閾値
:min_length => 80, # 評価を行うブロック長の最小値
:decay_factor => 0.73, # 減衰係数(小さいほど先頭に近いブロックのスコアが高くなる)
:continuous_factor => 1.62, # 連続ブロック係数(大きいほどブロックを連続と判定しにくくなる)
:punctuation_weight => 10, # 句読点に対するスコア
:punctuations => /([、。,.!?]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/, # 句読点
:waste_expressions => /Copyright|All Rights Reserved/i, # フッターに含まれる特徴的なキーワードを指定
:dom_separator => '', # DOM間に挿入する文字列を指定
:debug => false, # true の場合、ブロック情報を標準出力に
:include_tags => false # 本文にHTMLタグを含める
}
# 実体参照変換
CHARREF = {
' ' => ' ',
'&lt;' => '<',
'&gt;' => '>',
'&amp;' => '&',
'&laquo;'=> "\xc2\xab",
'&raquo;'=> "\xc2\xbb",
}
# Sets option parameters to default.
# Parameter opt is given as Hash instance.
# デフォルトのオプション値を指定する。
# 引数は @default と同じ形式で与える。
def self.set_default(opt)
@default.update(opt) if opt
end
# Analyses the given HTML text, extracts body and title.
def self.analyse(html, opt=nil)
# frameset or redirect
return ["", extract_title(html)] if html =~ /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i
# option parameters
opt = if opt then @default.merge!(opt) else @default end
b = binding # local_variable_set があれば……
threshold=min_length=decay_factor=continuous_factor=punctuation_weight=punctuations=waste_expressions=dom_separator=debug=nil
opt.each do |key, value|
eval("#{key.id2name} = opt[:#{key.id2name}]", b)
end
# header & title
title = if html =~ /<\/head\s*>/im
html = $' #'
extract_title($`)
else
extract_title(html)
end
# Google AdSense Section Target
html.gsub!(/<!--\s*google_ad_section_start\(weight=ignore\)\s*-->.*?<!--\s*google_ad_section_end.*?-->/m, '')
if html =~ /<!--\s*google_ad_section_start[^>]*-->/
html = html.scan(/<!--\s*google_ad_section_start[^>]*-->.*?<!--\s*google_ad_section_end.*?-->/m).join("\n")
end
# eliminate useless text
html = eliminate_useless_tags(html)
# h? block including title
html.gsub!(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/i) do |m|
if $2.length >= 3 && title.include?($2) then "<div>#{$2}</div>" else $1 end
end
# extract text blocks
factor = continuous = 1.0
body = ''
score = 0
bodylist = []
list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/)
list.each do |block|
next unless block
block.strip!
next if has_only_tags(block)
continuous /= continuous_factor if body.length > 0
# リンク除外&リンクリスト判定
notlinked = eliminate_link(block)
next if notlinked.length < min_length
# スコア算出
c = (notlinked.length + notlinked.scan(punctuations).length * punctuation_weight) * factor
factor *= decay_factor
not_body_rate = block.scan(waste_expressions).length + block.scan(/amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0
c *= (0.72 ** not_body_rate) if not_body_rate>0
c1 = c * continuous
puts "----- #{c}*#{continuous}=#{c1} #{notlinked.length} \n#{strip_tags(block)[0,100]}\n" if debug
# ブロック抽出&スコア加算
if c1 > threshold
body += block + "\n"
score += c1
continuous = continuous_factor
elsif c > threshold # continuous block end
bodylist << [body, score]
body = block + "\n"
score = c
continuous = continuous_factor
end
end
bodylist << [body, score]
body = bodylist.inject{|a,b| if a[1]>=b[1] then a else b end }
if @default[:include_tags]
[body[0], title]
else
[strip_tags(body[0], dom_separator), title]
end
end
# Extracts title.
def self.extract_title(st)
if st =~ /<title[^>]*>\s*(.*?)\s*<\/title\s*>/i
strip_tags($1)
else
""
end
end
private
# Eliminates useless tags
def self.eliminate_useless_tags(html)
# eliminate useless symbols
html.gsub!(/[\342\200\230-\342\200\235]|[\342\206\220-\342\206\223]|[\342\226\240-\342\226\275]|[\342\227\206-\342\227\257]|\342\230\205|\342\230\206/,'')
# eliminate useless html tags
html.gsub!(/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/im, '')
html.gsub!(/<!--.*?-->/m, '')
html.gsub!(/<![A-Za-z].*?>/, '')
html.gsub!(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>.*?<\/div\s*>/m, '')
html.gsub!(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/i, '')
html
end
# Checks if the given block has only tags without text.
def self.has_only_tags(st)
st.gsub(/<[^>]*>/im, '').gsub("&nbsp;",'').strip.length == 0
end
# リンク除外&リンクリスト判定
def self.eliminate_link(html)
count = 0
notlinked = html.gsub(/<a\s[^>]*>.*?<\/a\s*>/im){count+=1;''}.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
notlinked = strip_tags(notlinked)
return "" if notlinked.length < 20 * count || islinklist(html)
return notlinked
end
# リンクリスト判定
# リストであれば非本文として除外する
def self.islinklist(st)
if st=~/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
listpart = $1
outside = st.gsub(/<(?:ul|dl)(.+?)<\/(?:ul|dl)>/im, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')
list = listpart.split(/<li[^>]*>/)
list.shift
rate = evaluate_list(list)
outside.length <= st.length / (45 / rate)
end
end
# リンクリストらしさを評価
def self.evaluate_list(list)
return 1 if list.length == 0
hit = 0
list.each do |line|
hit +=1 if line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im
end
return 9 * (1.0 * hit / list.length) ** 2 + 1
end
# Strips tags from html.
def self.strip_tags(html, separator = '')
st = html.gsub(/<.+?>/m, separator)
# Convert from wide character to ascii
st.gsub!(/([\357\274\201-\357\274\272])/){($1.bytes.to_a[2]-96).chr} # symbols, 0-9, A-Z
st.gsub!(/([\357\275\201-\357\275\232])/){($1.bytes.to_a[2]-32).chr} # a-z
st.gsub!(/[\342\224\200-\342\224\277]|[\342\225\200-\342\225\277]/, '') # keisen
st.gsub!(/\343\200\200/, ' ')
self::CHARREF.each{|ref, c| st.gsub!(ref, c) }
st = CGI.unescapeHTML(st)
st.gsub(/[ \t]+/, " ")
st.gsub(/\n\s*/, "\n")
end
end
Gem::Specification.new do |spec|
spec.name = "extract_content_images"
spec.version = "0.0.3"
spec.authors = ["kkosuge"]
spec.email = ["root@kksg.net"]
spec.summary = "ウェブサイト記事の本文に含まれる画像URLを抽出する"
spec.files = ["extract_content.rb", "extract_content_images.rb"]
spec.require_path = "."
spec.license = "BSD"
spec.add_dependency "nokogiri"
spec.add_dependency "addressable"
end
require 'extract_content'
require "addressable/uri"
class ExtractContentImages
def self.analyse(html, opts={})
content, title = ExtractContent.analyse(html, opts.merge(include_tags: true))
doc = Nokogiri::HTML(content)
urls = doc.css('img').map do |img|
uri = Addressable::URI.parse(img[:src])
if uri.host.nil? && opts[:url]
site = Addressable::URI.parse(opts[:url])
site.join(uri)
else
uri
end
end
urls.compact.map(&:normalize).map(&:to_s)
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment