Skip to content

Instantly share code, notes, and snippets.

@takuya
Last active Aug 29, 2015
Embed
What would you like to do?
#!/usr/bin/env ruby
# coding : utf-8
require 'base64'
require 'kconv'
require 'uri'
require 'mechanize'
# Extends Mechanize::Page with helpers that inline a page's external
# resources (images, stylesheets, scripts) as base64 "data:" URIs so the
# serialized document no longer references them by URL.
#
# Every helper downloads through the owning agent (@mech). A successful
# download makes the downloaded resource the agent's current page, so after
# each one the history is reset to the page that was being processed.
class Mechanize::Page
  # Inlines images, stylesheets and scripts of this page (in that order) and
  # returns the serialized document as a String.
  def embed_body
    embed_images
    embed_style
    embed_script
    search("/").to_s
  end

  # Replaces the src of every <script src=...> with a base64 data: URI holding
  # the downloaded script (converted to UTF-8).
  #
  # page     - page whose nodes are rewritten (default: the agent's current page).
  # base_uri - URI that relative src values are resolved against
  #            (default: the current page's URI).
  #
  # Scripts whose download fails with Mechanize::ResponseCodeError are reported
  # on stderr and left untouched; any other error is re-raised.
  def embed_script(page = nil, base_uri = nil)
    page ||= @mech.page
    base_uri ||= @mech.page.uri
    page.search("script[src]").each do |node|
      src = node.attr("src")
      next if src =~ /\Adata:/i # already inline, nothing to download
      uri = nil
      begin
        uri = URI.join(base_uri, src)
        @mech.get uri
        script_text = @mech.page.body.toutf8
        node["src"] = "data:;base64,#{Base64.strict_encode64(script_text)}"
        embed_restore_history(page, base_uri)
      rescue Mechanize::ResponseCodeError => e404
        $stderr.puts e404.backtrace if $DEBUG
        # Message reads roughly "looks like a 404 error occurred". It goes to
        # stderr so it cannot end up inside HTML printed on stdout.
        $stderr.puts "404 エラー出たっぽい: #{uri}"
        next
      rescue => err
        embed_log_failure(uri || src, err)
        raise
      end
    end
  end

  # Removes every <script> element, points script-scheme links (javascript:,
  # vbscript:, ...) at "#noscript" and strips on* event-handler attributes
  # from <a> elements.
  #
  # page - page to clean (default: the agent's current page).
  def remove_script(page = nil)
    page ||= @mech.page
    page.search("script").each(&:remove)
    page.search("a").each do |anchor|
      # Match only a script URI scheme; a plain substring test would also hit
      # ordinary links such as "description.html".
      anchor["href"] = "#noscript" if anchor.attr("href") =~ /\A\s*\w*script:/i
      anchor.attributes.keys.each do |name|
        anchor.remove_attribute(name) if name =~ /\Aon/i
      end
    end
  end

  # Returns css_text with every url(...) reference replaced by a base64 data:
  # URI of the downloaded resource. References are resolved against the URI of
  # the agent's current page, which callers arrange to be the stylesheet (or
  # page) the CSS came from.
  #
  # Empty references, fragment-only references and references that already
  # are data: URIs are kept unchanged. Download errors are re-raised.
  def embed_css_url(css_text)
    page = @mech.page
    base_uri = page.uri
    css_text.gsub(/url\(\s*([^)]*?)\s*\)/i) do |original|
      target = Regexp.last_match(1).gsub(/["']/, "").strip
      next original if target.empty? || target.start_with?("#") || target =~ /\Adata:/i
      href = parse_uri(base_uri, target)
      begin
        @mech.get href
        content_type = @mech.page.header["content-type"]
        contents = @mech.page.body
        contents = contents.toutf8 if content_type =~ /^text/
        replacement = "url('data:#{content_type};base64,#{Base64.strict_encode64(contents)}')"
        embed_restore_history(page, base_uri)
        replacement
      rescue => err
        embed_log_failure(href, err)
        raise
      end
    end
  end

  # Returns css with every line containing an @import rule replaced by the
  # content of the imported stylesheet (whose own url(...) references are
  # inlined via embed_css_url). Nested @import rules are not followed.
  #
  # page     - page to restore as current after each download
  #            (default: the agent's current page).
  # base_uri - URI of the CSS that issued the @import, used to resolve it
  #            (default: page.uri).
  def embed_style_import(css, page = nil, base_uri = nil)
    page ||= @mech.page
    base_uri ||= page.uri
    css.lines.map do |line|
      next line unless line =~ /@import/i
      # Reduce the rule to its bare target: drop the keyword, quotes, the
      # terminating semicolon and an optional url(...) wrapper.
      target = line.gsub(/@import/i, "").gsub(/'|"|;/, "").strip
      target = target.gsub(/url\(([^)]+)\)/) { Regexp.last_match(1) }.strip
      u = parse_uri(base_uri, target)
      begin
        @mech.get u
        # The imported sheet is the current page here, so its relative
        # url(...) references resolve against its own URI.
        imported = embed_css_url(@mech.page.body.toutf8) + "\n"
        embed_restore_history(page, base_uri)
        imported
      rescue => err
        embed_log_failure(u, err)
        raise
      end
    end.join.toutf8
  end

  # Inlines stylesheets: url(...) references inside existing <style> elements
  # are embedded, and every <link rel=stylesheet href=...> is removed and its
  # downloaded CSS (without @charset lines and comments, with @import rules
  # and url(...) references inlined) is appended to <head> as a <style>.
  #
  # page     - page whose nodes are rewritten (default: the agent's current page).
  # base_uri - URI that relative hrefs are resolved against
  #            (default: the current page's URI).
  #
  # Any error while processing a linked stylesheet is re-raised.
  def embed_style(page = nil, base_uri = nil)
    page ||= @mech.page
    base_uri ||= @mech.page.uri
    page.search("style").each do |node|
      node.content = embed_css_url(node.text)
    end
    page.search("link[rel*=stylesheet][href]").each do |link|
      u = nil
      begin
        u = parse_uri(base_uri, link.attr("href"))
        @mech.get u
        link.remove
        head = page.search("head").first
        css = @mech.page.body.lines.reject { |line| line =~ /@charset/i }.join.toutf8
        css = css.gsub(%r{/\*(?:(?!\*/).)*\*/}m, "") # strip /* ... */ comments
        # While these run the stylesheet is the agent's current page, so both
        # helpers resolve relative references against the stylesheet's URI.
        css = embed_style_import(css)
        css = embed_css_url(css)
        head.add_child("\n<style type='text/css'>\n\n#{css}\n\n</style>\n")
        embed_restore_history(page, base_uri)
      rescue => err
        embed_log_failure(u, err)
        raise
      end
    end
  end

  # Replaces the src of every <img src=...> and <input src=...> with a base64
  # data: URI of the downloaded image.
  #
  # page     - page whose nodes are rewritten (default: the agent's current page).
  # base_uri - URI that relative src values are resolved against
  #            (default: the current page's URI).
  #
  # Images whose download fails with Mechanize::ResponseCodeError are reported
  # on stderr and left untouched; any other error is re-raised.
  def embed_images(page = nil, base_uri = nil)
    page ||= @mech.page
    base_uri ||= @mech.page.uri
    page.search("img[src],input[src]").each do |node|
      src = node.attr("src")
      next unless src
      next if src =~ /\Adata:/i # already inline, nothing to download
      u = URI.join(base_uri, src)
      begin
        @mech.get u
        # strict_encode64 emits no line breaks, keeping the attribute value on
        # one line like the other embed_* methods do.
        encoded = Base64.strict_encode64(@mech.page.body)
        node["src"] = "data:#{@mech.page['content-type']};base64,#{encoded}"
        embed_restore_history(page, base_uri)
      rescue Mechanize::ResponseCodeError => e404
        $stderr.puts e404.backtrace if $DEBUG
        # Message reads roughly "looks like a 404 error occurred". It goes to
        # stderr so it cannot end up inside HTML printed on stdout.
        $stderr.puts "404 エラー出たっぽい: #{u}"
        next
      rescue => err
        embed_log_failure(u, err)
        raise
      end
    end
  end

  # Resolves href to a URI object.
  #
  # Hrefs that are not absolute http(s) URLs are joined onto base_uri.
  # Absolute http(s) URLs are parsed on their own after percent-encoding the
  # characters in the query string that are not legal in a URI. Existing %XX
  # escapes are left as they are, and a "#fragment" is kept out of the query.
  #
  # Raises URI::InvalidURIError when the result is not a valid URI.
  def parse_uri(base_uri, href)
    return URI.join(base_uri, href) unless href =~ %r{\Ahttps?://}i

    body, fragment = href.split("#", 2)
    location, query = body.split("?", 2)
    normalized = location
    if query
      # Escape everything outside the RFC 2396 reserved/unreserved sets, but
      # treat "%" as safe so already-encoded sequences are not double-encoded.
      unsafe = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
      normalized += "?" + URI::DEFAULT_PARSER.escape(query, unsafe)
    end
    normalized += "#" + fragment if fragment
    URI.parse(normalized)
  end

  private

  # Makes page the agent's current page again after a resource download.
  def embed_restore_history(page, base_uri)
    @mech.history.clear
    @mech.history.push page, base_uri
  end

  # Writes the failing target and the error's backtrace to stderr, but only
  # when Ruby runs with $DEBUG set.
  def embed_log_failure(target, error)
    return unless $DEBUG
    $stderr.puts "uri err occured. => '#{target}'"
    $stderr.puts error.backtrace
  end
end
#$DEBUG=true
# Command-line entry point: fetch the URL given as the first argument, inline
# its images, stylesheets and scripts, and print the resulting HTML to stdout.
# Without an argument, print usage and exit.
url = ARGV.shift
unless url
  puts "Usage #{__FILE__} url "
  puts " example 1 : #{__FILE__} http://example.com "
  # A local file needs three slashes: with only two, "Users" is the URI host.
  puts " example 2 : #{__FILE__} file:///Users/takuya/Desktop/hoge.html "
  exit
end
m = Mechanize.new
m.get(url)
m.page.embed_body
puts m.page.search("/").first.to_html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment