@debrouwere
Created June 8, 2014 22:45
Using content-addressable storage to efficiently and continually archive new versions of an HTML page including all related resources (images etc.)
### An interesting thing about news website homepages is that, while they
change all the time, the media on them doesn't change nearly as fast: not
just the actual images, but also stylesheets, JavaScript, logos and so on.
So when archiving these pages, significant space savings are possible by
rewriting the link to every image or other resource in the HTML so that it
points to a file path derived from a hash of the file's content: a form of
content-addressable storage.

Storage size can be reduced further by bundling, say, a day's worth of HTML
for one page (one fetch every hour) into a single lzip file, since LZMA
compresses repetitions of almost-identical content, such as a homepage
changing over time, very efficiently. A sketch of that bundling step
follows right after this comment. ###
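
# A minimal sketch of the bundling step mentioned above: collect a day's
# worth of hourly HTML snapshots into a single .tar.lz archive so that LZMA
# can squeeze out the redundancy between near-identical pages. Assumes GNU
# tar and lzip are installed; `bundleDay`, `snapshotDir` and `archivePath`
# are illustrative names, not part of the pipeline below.
{execFile} = require 'child_process'

bundleDay = (snapshotDir, archivePath, callback) ->
  # -C makes tar archive the hourly snapshots by their relative paths
  execFile 'tar', ['--lzip', '-cf', archivePath, '-C', snapshotDir, '.'], callback
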
request = require 'request'
cheerio = require 'cheerio'
async = require 'async'
url = require 'url'
fs = require 'fs'
fs.path = require 'path'
fs.mkdirp = require 'mkdirp'
crypto = require 'crypto'
_ = require 'underscore'
_.str = require 'underscore.string'

# split a string into n-character chunks, used to turn a hex digest into
# nested directory names (ab12/cd34/...)
chunk = (l, n) ->
  (l.slice i, i+n) for i in _.range 0, l.length, n

# store `content` at a path derived from the SHA-1 of the content itself
# (content-addressable storage); if a file with that digest already exists,
# the bytes are identical and nothing needs to be written
save = (content, extension, callback) ->
  sum = crypto.createHash 'sha1'
  sum.update content
  digest = sum.digest 'hex'
  chunkedDigest = (chunk digest, 4).join '/'
  filename = chunkedDigest + extension
  path = fs.path.join 'filestore', filename

  console.log path

  fs.exists path, (exists) ->
    if exists
      callback null, filename
    else
      mkdir = async.apply fs.mkdirp, (fs.path.dirname path)
      write = async.apply fs.writeFile, path, content, undefined
      done = (err) -> callback err, filename
      async.waterfall [mkdir, write], done

# TODO: ideally do a HEAD request first that looks at the last-modified
# date and/or ETag, checks those against a cache we keep of
# <url> | <last_modified> | <hash> (with a TTL on each key, or an LRU),
# and if the last_modified we have corresponds to the last_modified
# of the HEAD request, we can simply use the existing hash,
# which cuts down on both bandwidth and processing time
# (a sketch of that follows below the download function)

# fetch a resource as a raw buffer and store it by content hash
download = (link, callback) ->
  extension = fs.path.extname link
  request.get {uri: link, encoding: null}, (err, res, body) ->
    return callback err if err
    save body, extension, callback
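
# A hedged sketch of the HEAD-request shortcut from the TODO above: keep a
# small in-memory cache of <url> -> {lastModified, filename} and skip the
# full GET (and the re-hash) when the server reports an unchanged
# Last-Modified header. `headCache` and `downloadIfModified` are
# hypothetical names; a real version would also look at ETags and expire
# entries with a TTL or an LRU, as the TODO suggests.
headCache = {}

downloadIfModified = (link, callback) ->
  request.head link, (err, res) ->
    # if the HEAD request itself fails, fall back to a plain download
    return download link, callback if err
    lastModified = res.headers['last-modified']
    cached = headCache[link]
    if cached? and lastModified? and cached.lastModified is lastModified
      # unchanged since the last fetch: reuse the stored content address
      callback null, cached.filename
    else
      download link, (err, filename) ->
        headCache[link] = {lastModified, filename} unless err?
        callback err, filename
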
# TODO: it should be possible to specify, per site and independently
# for the homepage and for article pages, whether we can run a
# straight GET request or whether we need to run it in a
# headless browser, and extract the computed HTML instead
# (perhaps in some cases after executing an action, like a
# scroll event)
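
# A sketch of the per-site fetch configuration described in the TODO above:
# a lookup table that says, per host and per page kind, whether a plain GET
# is enough or whether the page needs to be rendered in a headless browser
# first. `fetchStrategies` and `fetchPage` are hypothetical names and the
# headless branch is left as a stub; nothing here is wired into the
# pipeline that follows.
fetchStrategies =
  'theguardian.com': {homepage: 'get', article: 'get'}

fetchPage = (pageUrl, kind, callback) ->
  host = (url.parse pageUrl).hostname?.replace /^www\./, ''
  strategy = fetchStrategies[host]?[kind] ? 'get'
  if strategy is 'get'
    request.get pageUrl, (err, res, body) -> callback err, body
  else
    # stub: a real implementation would hand the URL to a headless browser
    # such as PhantomJS and return the computed HTML, possibly after
    # triggering an action like a scroll event first
    callback new Error "headless fetching not implemented for #{pageUrl}"
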
page = 'http://theguardian.com/uk'

request.get page, (err, res, body) ->
  throw err if err
  $ = cheerio.load(body)
  # grab the href/src of every linked resource (stylesheets, scripts,
  # images), but leave regular anchor links alone
  hrefs = ($ ':not(a)[href]').map -> ($ this).attr 'href'
  srcs = ($ ':not(a)[src]').map -> ($ this).attr 'src'
  relativeLinks = hrefs.concat srcs
  absoluteLinks = relativeLinks.map _.partial url.resolve, page
  # links = _.zip relativeLinks, absoluteLinks

  async.map absoluteLinks, download, (err, digests) ->
    links = _.object _.zip relativeLinks, digests
    # TODO: ideally we'd also rewrite any CSS file and
    # suck out references, because they may contain
    # sprites and such (see the sketch at the end of this file)

    # for both reliability and speed, don't recreate the
    # HTML from the cheerio representation, but simply replace the
    # relative links with their content addresses; note that
    # String::replace with a string pattern only replaces the first
    # occurrence in V8, so use split/join for a global replacement
    for link, hash of links
      body = body.split(link).join hash

    save body, '.html', (err, digest) ->
      # TODO: at this point, you'd want to make a symlink
      # from the digest (a.k.a. the content-addressable filename)
      # to something that actually makes sense to human beings,
      # like /theguardian.com/2014/01/01/23:17.html
      # (sketched at the end of this file)
      console.log digest
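
# A sketch of the CSS rewriting mentioned in the TODO inside the main
# callback: pull url(...) references out of a stylesheet, push each one
# through the same content-addressable store, and substitute the digests
# back in. `rewriteStylesheet` is a hypothetical helper that is not wired
# into the pipeline above.
cssUrlPattern = /url\(\s*['"]?([^'")]+)['"]?\s*\)/g

rewriteStylesheet = (css, baseUrl, callback) ->
  refs = []
  refs.push match[1] while match = cssUrlPattern.exec css
  # ignore inlined data: URIs, they have no file to fetch
  refs = (ref for ref in refs when not /^data:/.test ref)
  absolute = refs.map _.partial url.resolve, baseUrl
  async.map absolute, download, (err, digests) ->
    return callback err if err
    for ref, i in refs
      css = css.split(ref).join digests[i]
    callback null, css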
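
# A sketch of the human-readable symlink mentioned in the final TODO: point
# a path like filestore/theguardian.com/2014-01-01/23:17.html at the
# content-addressed file. `linkSnapshot` is a hypothetical helper and the
# timestamp format is only illustrative.
linkSnapshot = (digestFilename, pageUrl, date, callback) ->
  host = (url.parse pageUrl).hostname
  stamp = date.toISOString().slice(0, 16).replace 'T', '/'
  humanPath = fs.path.join 'filestore', host, stamp + '.html'
  target = fs.path.resolve 'filestore', digestFilename
  fs.mkdirp (fs.path.dirname humanPath), (err) ->
    return callback err if err
    fs.symlink target, humanPath, callback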