Skip to content

Instantly share code, notes, and snippets.

@axelchalon
Last active October 31, 2015 15:17
Show Gist options
  • Save axelchalon/58f1b5459fab4a1e715c to your computer and use it in GitHub Desktop.
Save axelchalon/58f1b5459fab4a1e715c to your computer and use it in GitHub Desktop.
Retrieves every page of a WordPress feed and, for each page, saves every article to disk with image links replaced by inline base64 data URIs (useful for offline reading) – style suggestions welcome
# Adds a small readability predicate to integers.
#
# Integer is reopened instead of Fixnum: Fixnum was unified into Integer in
# Ruby 2.4 and the constant was removed in Ruby 3.2, where `class Fixnum`
# would define a new, unrelated class and leave integers without this method.
# On older Rubies Fixnum inherits from Integer, so the method is still found.
class Integer
  # @return [Boolean] true when the receiver equals 1
  def one?
    self == 1
  end
end
require 'base64'
require 'cgi'
require 'net/http'
require 'openssl'

require 'nokogiri'
# Mirrors a paginated feed for offline reading.
#
# Walks the feed at FEED_URL page by page (at most MAX_PAGES pages, stopping
# early on the first 404) and writes every <item> to OUTPUT_DIR/<slug>.html.
# In each article body, every src="..." value whose extension is listed in
# IMAGE_SUBTYPES is replaced by an inline base64 data: URI.

FEED_URL = 'http://throwingmyarmsaroundparis.com/feed/'
OUTPUT_DIR = 'output'
MAX_PAGES = 50

# Inlinable image extensions mapped to the subtype used in the data: URI
# ("image/jpg" is not a registered media type, so jpg is emitted as jpeg).
IMAGE_SUBTYPES = { 'jpg' => 'jpeg', 'jpeg' => 'jpeg', 'png' => 'png', 'gif' => 'gif' }.freeze

# Control characters that XML 1.0 forbids. Tab, LF and CR are legal XML and
# are deliberately NOT stripped, so whitespace inside articles survives.
INVALID_XML_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F]/

# Value of every src="..." attribute (lookarounds keep the quotes out of the match).
SRC_ATTRIBUTE_VALUE = /(?<=src=")[^"]+(?=")/

# Percent-escaper used only for URLs that URI() refuses to parse
# (URI.encode, which the script used before, no longer exists in Ruby 3).
URL_ESCAPER = URI::RFC2396_Parser.new

# Returns the URI of the given 1-based feed page.
def feed_page_uri(page)
  page == 1 ? URI(FEED_URL) : URI("#{FEED_URL}?paged=#{page}")
end

# Returns the data: URI subtype for +url+, or nil when its extension
# (ignoring any query string or fragment, case-insensitively) is not a
# known image type.
def image_subtype(url)
  extension = url.sub(/[?#].*/m, '')[/(?<=\.)[^.]+\z/]
  IMAGE_SUBTYPES[extension.to_s.downcase]
end

# Parses +url+, escaping it only when it is not already a valid URI so that
# existing %XX sequences are not encoded a second time.
def parse_image_uri(url)
  URI(url)
rescue URI::InvalidURIError
  URI(URL_ESCAPER.escape(url))
end

# Downloads the image at +url+ and returns it as a base64 data: URI.
#
# The request follows the URL's own scheme (http or https) and uses Net::HTTP's
# default certificate verification. Any failure (non-2xx status, relative URL,
# network or TLS error) is reported on stderr and +url+ is returned unchanged,
# so a single broken image does not abort the whole run.
def inline_image(url, subtype)
  # The value comes from an HTML attribute, so "&amp;" must become "&" first.
  uri = parse_image_uri(CGI.unescapeHTML(url))
  unless uri.is_a?(URI::HTTP) # also true for URI::HTTPS; rejects relative URLs
    warn "Skipping #{url}: not an absolute http(s) URL"
    return url
  end

  image = Net::HTTP.get_response(uri)
  unless image.is_a?(Net::HTTPSuccess)
    warn "Skipping #{url}: HTTP #{image.code}"
    return url
  end

  # strict_encode64 emits no line breaks, keeping the attribute on one line.
  "data:image/#{subtype};base64,#{Base64.strict_encode64(image.body)}"
rescue StandardError => e
  warn "Skipping #{url}: #{e.class}: #{e.message}"
  url
end

# Wraps an article body in a minimal standalone HTML document.
# The title is plain text from the feed, so it is HTML-escaped here.
def article_html(title, body)
  '<!DOCTYPE HTML><html><head><meta charset="UTF-8">' \
    "<title>#{CGI.escapeHTML(title)}</title></head><body>#{body}</body></html>"
end

Dir.mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)

(1..MAX_PAGES).each do |page|
  p "Page #{page}"
  response = Net::HTTP.get_response(feed_page_uri(page))
  break if response.is_a?(Net::HTTPNotFound) # a 404 is treated as the end of the feed

  unless response.is_a?(Net::HTTPSuccess)
    warn "Skipping page #{page}: HTTP #{response.code}"
    next
  end

  # Drop characters that are illegal in XML, then parse leniently.
  doc = Nokogiri::XML(response.body.gsub(INVALID_XML_CHARS, '')) do |config|
    config.options = Nokogiri::XML::ParseOptions::RECOVER | Nokogiri::XML::ParseOptions::SAX1
  end

  items = doc.xpath('//item')
  p "#{items.length} items"

  items.each_with_index do |item, index|
    p "Page #{page} - Item #{index}"

    # Last path segment of the article link; fall back to a positional name
    # when the link does not end in "/<segment>/".
    slug = item.at_xpath('link').content[%r{/([^/]+)/$}, 1] || "page-#{page}-item-#{index}"
    title = item.at_xpath('title').content
    content = item.at_xpath('content:encoded').content

    body = content.gsub(SRC_ATTRIBUTE_VALUE) do |url|
      subtype = image_subtype(url)
      next url unless subtype # videos etc. keep their original URL

      p "Page #{page} - Item #{index} - #{subtype}"
      inline_image(url, subtype)
    end

    # Block form closes the file even if the write raises.
    File.open(File.join(OUTPUT_DIR, "#{slug}.html"), 'w:UTF-8') do |file|
      file.write(article_html(title, body))
    end
  end
end

p 'Done'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment