Last active
October 31, 2015 15:17
-
-
Save axelchalon/58f1b5459fab4a1e715c to your computer and use it in GitHub Desktop.
Retrieves every page from a WP feed; for each page, saves every article onto disk with image links replaced by inline base64 (useful for offline use) – style suggestions welcome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adds Integer#one?, a small readability helper for "is this the number 1?".
#
# Integer is reopened instead of Fixnum: Fixnum was merged into Integer in
# Ruby 2.4 and the constant was removed in Ruby 3.2, where `class Fixnum`
# would silently define a brand-new, unrelated class and leave integers
# without this method. On older Rubies Fixnum inherits from Integer, so the
# helper is available there as well.
class Integer
  # @return [Boolean] true when the receiver equals 1, false otherwise
  def one?
    self == 1
  end
end
require 'nokogiri' | |
require 'net/http' | |
require 'openssl' | |
require 'base64' | |
# Mirrors a WordPress blog for offline reading.
#
# Walks the paginated RSS feed (at most MAX_PAGES pages, stopping at the first
# 404) and writes every article to output/<slug>.html, with each
# jpg/jpeg/png/gif `src` URL replaced by an inline base64 data URI.

FEED_URL = 'http://throwingmyarmsaroundparis.com/feed/'
OUTPUT_DIR = 'output'
MAX_PAGES = 50

# Characters XML 1.0 forbids: every C0 control except tab, LF and CR. Those
# three are legal whitespace and are kept so article text (e.g. <pre> blocks)
# does not lose its line breaks. Written as a String#delete character spec.
INVALID_XML_CONTROL_CHARS = "\x00-\x08\x0B\x0C\x0E-\x1F".freeze

# Lower-case file extension => MIME subtype ("image/jpg" is not a valid type).
IMAGE_MIME_SUBTYPES = {
  'jpg' => 'jpeg',
  'jpeg' => 'jpeg',
  'png' => 'png',
  'gif' => 'gif'
}.freeze

# URI.encode was removed in Ruby 3.0; this parser offers the same escaping.
URL_ESCAPER = URI::RFC2396_Parser.new

# Requests one page of the feed.
#
# @param page [Integer] 1-based page number
# @return [Net::HTTPResponse] the raw response (redirects are not followed)
def fetch_feed_page(page)
  location = page == 1 ? FEED_URL : "#{FEED_URL}?paged=#{page}"
  Net::HTTP.get_response(URI(location))
end

# Downloads an image and encodes it as a data URI.
#
# NOTE: as in the original script, every image is requested over HTTPS on
# port 443 whatever scheme the URL uses.
# NOTE(security): certificate verification is disabled, so the downloaded
# bytes are not authenticated. Kept as-is because the script is best-effort.
#
# @param url [String] the image URL as found in the article HTML
# @param mime_subtype [String] subtype to advertise, e.g. "jpeg"
# @return [String, nil] the data URI, or nil when the URL is not an absolute
#   http(s) URL or the download did not succeed
def image_data_uri(url, mime_subtype)
  uri = URI(URL_ESCAPER.escape(url))
  # Relative, protocol-relative and non-HTTP URLs have no request_uri.
  return nil unless uri.is_a?(URI::HTTP) && uri.host

  http = Net::HTTP.new(uri.host, 443)
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  image = http.request(Net::HTTP::Get.new(uri.request_uri))
  # Do not embed an error page as if it were image data.
  return nil unless image.is_a?(Net::HTTPSuccess) && image.body

  # strict_encode64 emits no line feeds, keeping the attribute on one line.
  "data:image/#{mime_subtype};base64,#{Base64.strict_encode64(image.body)}"
end

# Writes one feed item to OUTPUT_DIR as a standalone HTML document.
#
# @param item [Nokogiri::XML::Node] an <item> element of the feed
# @param page [Integer] feed page the item came from (progress output only)
# @param index [Integer] position of the item on its page
# @return [void]
def save_article(item, page, index)
  # The slug is the last path segment of a link that ends in "/"; other link
  # shapes get a generated name instead of crashing on a nil slug.
  slug = item.at_xpath('link').content[%r{/([^/]+)/$}, 1] || "page-#{page}-item-#{index}"
  title = item.at_xpath('title').content
  content = item.at_xpath('content:encoded').content
  p "Page #{page} - Item #{index}"

  # Replace each src value with its base64 equivalent where possible.
  body = content.gsub(/(?<=src=")[^"]+(?=")/) do |url|
    # The extension (query string ignored) selects the MIME type.
    extension = url.sub(/\?.+/, '')[/(?<=\.)[^.]+$/]
    mime_subtype = IMAGE_MIME_SUBTYPES[extension.to_s.downcase]
    next url unless mime_subtype # videos etc. are left untouched

    p "Page #{page} - Item #{index} - #{extension}"
    image_data_uri(url, mime_subtype) || url
  end

  html = "<!DOCTYPE HTML><html><head><meta charset=\"UTF-8\"><title>#{title}</title></head><body>#{body}</body></html>"
  # Block form closes the file even if the write raises.
  File.open("#{OUTPUT_DIR}/#{slug}.html", 'w:UTF-8') { |file| file.write(html) }
end

Dir.mkdir(OUTPUT_DIR) unless File.directory?(OUTPUT_DIR)

1.upto(MAX_PAGES) do |page|
  p "Page #{page}"
  feed = fetch_feed_page(page)
  break if feed.is_a?(Net::HTTPNotFound) # past the last page of the feed

  unless feed.is_a?(Net::HTTPSuccess)
    # An error or redirect body is not a feed; skip it rather than parse it.
    p "Page #{page} - skipped (HTTP #{feed.code})"
    next
  end

  # Strip characters that would make the XML invalid, then parse leniently.
  doc = Nokogiri::XML(feed.body.delete(INVALID_XML_CONTROL_CHARS)) do |config|
    config.options = Nokogiri::XML::ParseOptions::RECOVER | Nokogiri::XML::ParseOptions::SAX1
  end

  items = doc.xpath('//item')
  p "#{items.length} items"
  items.each_with_index { |item, index| save_article(item, page, index) }
end

p 'Done'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment