Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape Gutenberg handbook content to local markdown & images
/*
-------------
mkdirp usage:
-------------
var mkdirp = require('mkdirp');
mkdirp('/tmp/foo/bar/baz', function (err) {
if (err) console.error(err)
else console.log('pow!')
});
-----------------
Salticidae Usage:
-----------------
spider( 'http://some.url/', {
key: $ => $( '.some-selector' ).find( '.to #return' ).text(),
} ).then( ( results ) => {
console.log( results );
// { key: 'Whatever the text of that node was' }
} );
---------------
Turndown Usage:
---------------
// For Node.js
var TurndownService = require('turndown')
var turndownService = new TurndownService()
var markdown = turndownService.turndown('<h1>Hello world!</h1>')
*/
/* eslint-disable no-unused-vars */
const { join } = require( 'path' );
const { spider } = require( 'salticidae' );
const fs = require( 'fs' );
const https = require( 'https' );
const mkdirp = require( 'mkdirp' );
const rimraf = require( 'rimraf' );
const TurndownService = require( 'turndown' );
// Single Turndown converter instance, shared by every page conversion.
const turndownService = new TurndownService();
/**
* Convert an HTML string to markdown using the shared Turndown instance.
*
* @param {String} html HTML markup to convert
* @return {String} The markdown produced by Turndown
*/
const turndown = ( html ) => turndownService.turndown( html );
/**
 * Switch for diagnostic output: set to true to see the messages passed
 * through log().
 */
const VERBOSE = false;

/**
 * Forward every argument to console.log, but only while VERBOSE is enabled.
 * When VERBOSE is false the call is a silent no-op.
 */
function log( ...messages ) {
	if ( VERBOSE ) {
		console.log( ...messages );
	}
}
/**
* Entrypoint to Gutenberg handbook. This is the first page fetched, and the
* one page that is saved as `index.md` instead of under its own URL slug.
*/
const HANDBOOK_URI = 'https://wordpress.org/gutenberg/handbook/';
/**
* Output directory: a `handbook` folder inside the current working directory.
* It is deleted and re-created at the start of every run.
*/
const handbookRoot = join( process.cwd(), 'handbook' );
/**
 * Work out which directory (relative to the handbook root) a page belongs in.
 *
 * Everything up to and including `gutenberg/handbook/` is dropped, then a
 * trailing `/segment/` is removed because that segment becomes the filename.
 * A top-level page has no slash in front of its only segment, so it is
 * returned unchanged (e.g. `block-api/`); the handbook root itself yields ''.
 *
 * @param {String} uri A web URL for a handbook page
 * @return {String} The page's directory relative to the handbook root
 */
const getSubpath = ( uri ) => {
	// Path relative to the handbook URI root.
	const relativePath = uri.replace( /^.*gutenberg\/handbook\//, '' );
	// Drop the final path segment when it is preceded by a slash.
	return relativePath.replace( /\/[^/]+\/$/, '' );
};
/**
 * Build the markdown filename for a handbook page from the final non-empty
 * segment of its URI.
 *
 * @param {String} uri A web URL for a handbook page
 * @return {String} The last path segment with `.md` appended
 */
const getFilename = ( uri ) => {
	// Empty segments (from `//` and the trailing slash) are discarded.
	const segments = uri.trim().split( '/' ).filter( Boolean );
	const slug = segments[ segments.length - 1 ];
	return `${ slug }.md`;
};
/**
 * Get the filename portion of an image URI: everything after the last `/`.
 *
 * @param {String} imageURI A remote image URI
 * @return {String} The text following the final slash (the whole string if there is none)
 */
const imgFilename = ( imageURI ) => imageURI.slice( imageURI.lastIndexOf( '/' ) + 1 );
/** Useful DOM query selectors, used when scraping a handbook page. */
const selectors = {
// Container whose anchors are collected as the list of handbook page links.
menu: '.menu-table-of-contents-container',
// Element whose text becomes the page title.
title: '.entry-header',
// Element whose inner HTML is converted to markdown and searched for images.
content: '.entry-content',
};
/**
 * Remove the handbook root directory and everything in it, then create it
 * again so the run starts from an empty output folder.
 *
 * @return {Promise} Resolves (with no value) once the directory has been
 *                   re-created; rejects with the first error encountered.
 */
const emptyHandbookRoot = () => new Promise( ( resolve, reject ) => {
	// Shared completion handler for the final (mkdirp) step.
	const settle = ( createError ) => {
		if ( createError ) {
			reject( createError );
			return;
		}
		resolve();
	};

	rimraf( handbookRoot, ( removeError ) => {
		if ( removeError ) {
			reject( removeError );
			return;
		}
		mkdirp( handbookRoot, settle );
	} );
} );
/**
 * Make sure a directory exists, creating it (via mkdirp) when necessary.
 *
 * @param {String} path An absolute file system path
 * @return {Promise} Resolves (with no value) on success; rejects with the mkdirp error.
 */
const ensureExists = ( path ) => new Promise( ( resolve, reject ) => {
	log( `Ensuring path ${ path }` );
	mkdirp( path, ( error ) => {
		if ( error ) {
			reject( error );
			return;
		}
		resolve();
	} );
} );
/**
 * Save a string to a file as UTF-8 and report the write on the console.
 *
 * @param {String} filePath An absolute filesystem path to the output file
 * @param {String} content  The file content to write
 * @return {Promise} Resolves with `content` once the file is written; rejects with the fs error.
 */
const write = ( filePath, content ) => new Promise( ( resolve, reject ) => {
	const onWritten = ( error ) => {
		if ( error ) {
			reject( error );
			return;
		}
		console.log( `Wrote ${ filePath } to disk` );
		resolve( content );
	};

	fs.writeFile( filePath, content, { encoding: 'utf8' }, onWritten );
} );
/**
 * Download a file to disk.
 *
 * The returned promise settles only after the local file has been completely
 * written (or something failed), so callers can await it before moving on.
 * The HTTP status code is not inspected: whatever body the server sends is
 * what gets saved.
 *
 * @param {String} filePath An absolute path to which to save the file.
 * @param {String} fileURI  A remote file URI, fetched with the `https` module.
 * @return {Promise} Resolves (with no value) when the file is fully written;
 *                   rejects on a request, response or file system error.
 */
const download = ( filePath, fileURI ) => new Promise( ( resolve, reject ) => {
	log( `Downloading ${ fileURI }\n to ${ filePath }` );
	const writeToDisk = fs.createWriteStream( filePath );

	// pipe() does not forward errors between streams, so every stream gets
	// the same failure handler; it also releases the file handle.
	const fail = ( err ) => {
		writeToDisk.destroy();
		reject( err );
	};

	// 'finish' must be observed on the *file* stream: the request object
	// emits its own 'finish' as soon as the request has been sent, which is
	// long before the response body has reached the disk.
	writeToDisk
		.on( 'error', fail )
		.on( 'finish', () => {
			console.log( `Saved ${ filePath } to disk` );
			resolve();
		} );

	https
		.get( fileURI, ( response ) => {
			response.on( 'error', fail );
			response.pipe( writeToDisk );
		} )
		.on( 'error', fail );
} );
/**
 * Save markdown content as `filename` inside `path`, creating the directory
 * first if it does not exist yet.
 *
 * @param {String} path     An absolute filesystem path (the target directory)
 * @param {String} filename A string filename
 * @param {String} content  The markdown content to write
 * @return {Promise} Resolves with the written content (see write()).
 */
async function saveMarkdown( path, filename, content ) {
	const target = join( path, filename );
	log( `Saving ${ content.length } chars to ${ target }` );
	await ensureExists( path );
	return write( target, content );
}
/**
 * Download an image file into a handbook subdirectory.
 *
 * The local filename comes from imgFilename(), the same helper used when
 * image links in the markdown are rewritten, so the file on disk and the
 * link that points at it cannot drift apart.
 *
 * @param {String} subpath  A relative directory within the handbook
 * @param {String} imageURI A remote image URI
 * @return {Promise} Settles when the download started by download() settles.
 */
async function saveImage( subpath, imageURI ) {
	log( `Attempting to download ${ imageURI }` );
	const outputDir = join( handbookRoot, subpath );
	const filePath = join( outputDir, imgFilename( imageURI ) );
	// The target directory may not exist yet if this page has no siblings saved.
	await ensureExists( outputDir );
	return download( filePath, imageURI );
}
/**
 * Replace every occurrence of a literal substring within a string.
 *
 * The work is done in a single pass: text inserted by `replacement` is never
 * re-scanned, so the call terminates even when `replacement` itself contains
 * `substr`, and the call depth does not grow with the number of matches.
 * `replacement` is inserted verbatim (sequences such as `$&` have no special
 * meaning).
 *
 * @param {String} str         The text to search
 * @param {String} substr      The literal text to look for
 * @param {String} replacement The text to insert in place of each match
 * @return {String} `str` with all occurrences replaced; `str` unchanged when
 *                  `substr` is empty.
 */
const replace = ( str, substr, replacement ) => {
	// An empty needle "matches" everywhere; treat it as nothing to replace.
	if ( substr === '' ) {
		return str;
	}
	return str.split( substr ).join( replacement );
};
/**
 * Fetch one Gutenberg handbook page, store it as a markdown file (with Jekyll
 * front matter) under the handbook root, and download the images it embeds.
 *
 * @param {String} uri A handbook URI.
 * @return {Promise} Resolves with the array of link URIs found in the
 *                   page's table-of-contents menu.
 */
async function downloadPage( uri ) {
	log( `Downloading handbook page ${ uri }` );

	// Collect the trimmed value of `attr` from every `tag` inside `root`.
	const collectAttr = ( $, root, tag, attr ) => $( root )
		.find( tag )
		.map( ( index, node ) => $( node ).attr( attr ).trim() )
		.get();

	const data = await spider( uri, {
		links: ( $ ) => collectAttr( $, selectors.menu, 'a', 'href' ),
		title: ( $ ) => $( selectors.title ).text().trim(),
		content: ( $ ) => $( selectors.content ).html().trim(),
		images: ( $ ) => collectAttr( $, selectors.content, 'img', 'src' ),
	} );
	const { links, title, content, images } = data;

	const subpath = getSubpath( uri );
	// The root handbook page is stored as the index file.
	const isIndexPage = uri === HANDBOOK_URI;
	const filename = isIndexPage ? 'index.md' : getFilename( uri );
	let outputPath = handbookRoot;
	if ( subpath ) {
		outputPath = join( handbookRoot, subpath );
	}

	const imageCount = images.length;
	if ( imageCount ) {
		console.log( ` Found ${ imageCount } image${ imageCount > 1 ? 's' : '' }` );
	}

	// Fetch the images one at a time, in document order.
	for ( const imageURI of images ) {
		await saveImage( subpath, imageURI );
	}

	let body = turndown( content );

	// Point image links at the local copies saved next to the page.
	for ( const imageURI of images ) {
		body = replace( body, imageURI, `./${ imgFilename( imageURI ) }` );
	}

	// Turn absolute handbook links into paths relative to the handbook root.
	for ( const link of links ) {
		body = replace( body, link, link.replace( /^.*gutenberg\/handbook/, '' ) );
	}

	// Jekyll front matter, then the heading, then the converted content.
	const page = `---\ntitle: ${ title }\n---\n\n# ${ title }\n\n${ body }`;

	await saveMarkdown( outputPath, filename, page );
	return links;
}
/**
 * Pause for a number of milliseconds.
 *
 * @param {Number} delay How long to wait, in milliseconds
 * @return {Promise} Resolves (with no value) after the delay has elapsed.
 */
function wait( delay ) {
	return new Promise( ( done ) => {
		setTimeout( done, delay );
	} );
}
// Async IIFE to kick off the script: empty the output folder, save the index
// page, then save every page linked from its table of contents.
void ( async function main() {
	await emptyHandbookRoot();
	const links = await downloadPage( HANDBOOK_URI );

	// The index is already saved; a Set drops links that the menu repeats so
	// no page is fetched twice.
	const handbookPages = [ ...new Set( links ) ]
		.filter( ( link ) => link !== HANDBOOK_URI );

	// Strictly one page at a time, with a short pause between requests.
	for ( const uri of handbookPages ) {
		await wait( 200 );
		await downloadPage( uri );
	}

	console.log( '\nDownload complete!' );
}() ).catch( ( err ) => {
	// Without this handler a failure surfaces only as an unhandled rejection.
	console.error( '\nDownload failed:', err );
	process.exitCode = 1;
} );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.