Skip to content

Instantly share code, notes, and snippets.

@hughgrigg
Last active August 29, 2015 14:18
Show Gist options
  • Save hughgrigg/4c2449ebb15ecbf22e92 to your computer and use it in GitHub Desktop.
Save hughgrigg/4c2449ebb15ecbf22e92 to your computer and use it in GitHub Desktop.
Jekyll WordPress import post processing gulpfile for Hugo
// This is a throwaway / single-use gulpfile I used to process and tidy up the
// files resulting from using Jekyll's WordPress exporter. I wanted to move from
// WordPress to Hugo with a relatively large number of posts; this gulpfile let
// me quickly process all the content files and get on with building the site.
var
gulp = require('gulp'),
replace = require('gulp-replace'),
rename = require('gulp-rename'),
entities = require('gulp-html-entities'),
filter = require('gulp-filter'),
chalk = require('chalk'),
through = require('through-gulp');
/**
 * Return the last segment (the bare file name) of a file object's path.
 *
 * The path is split on both "/" and "\" so that paths written with Windows
 * separators also yield just the file name rather than the whole path.
 *
 * @param {{path: string}} file - object with a string `path` property (the
 *   file objects passed to the filter / through.map callbacks in this file).
 * @returns {string} the text after the final path separator, or the whole
 *   path when it contains no separator.
 */
function fileName(file) {
  var pathParts = file.path.split(/[\\/]/);
  return pathParts[pathParts.length - 1];
}
// Task "posts": reads every ./_posts/*.markdown file, drops very short files,
// moves each file into a <year>/<month>/ folder with a .md extension, rewrites
// the front matter, converts a subset of HTML to Markdown / Hugo shortcodes,
// and writes the results under ./processed/.
//
// The replace() steps are order-dependent: later patterns rely on the newlines
// and tag removals performed by earlier ones, so keep the sequence intact when
// editing.
gulp.task('posts', function() {
  return gulp.src('./_posts/*.markdown')
    // get rid of junk files and warn about short ones
    // (file.contents.length is reported as "chars" in the messages; presumably
    // contents is a Buffer here, in which case it is a byte count -- confirm)
    .pipe(filter(function(file) {
      if (file.contents.length < 2000) {
        var name = fileName(file);
        if (file.contents.length < 1500) {
          console.log(chalk.red(
            'Removed junk file ' + name +' (' + file.contents.length + ' chars)'
          ));
          // returning false drops the file from the stream
          return false;
        } else {
          console.log(chalk.yellow(
            name + ' is quite short (' + file.contents.length + ' chars)'
          ));
        }
      }
      return true;
    }))
    // organise into year-month folders
    // (the first 4 characters of the basename are used as the year and
    // characters 5-6 as the month, i.e. a "YYYY-MM-..." prefix is assumed)
    .pipe(rename(function(path) {
      var year, month;
      year = path.basename.slice(0,4);
      month = path.basename.slice(5,7);
      path.dirname += '/' + year + '/' + month + '/';
      path.extname = '.md';
    }))
    // save slug in front matter
    // (the slug is the file name minus its first 11 characters and its last 3;
    // after the rename above that is the "YYYY-MM-DD-" prefix and ".md")
    .pipe(through.map(function(file) {
      var name = fileName(file),
        slug = name.slice(11, name.length - 3);
      // NOTE(review): `new Buffer(string)` is deprecated in current Node.js in
      // favour of Buffer.from(string); it is kept here for compatibility with
      // the old runtime this gulp 3 setup targets.
      file.contents = new Buffer(file.contents.toString().replace(
        '---\nlayout: post\n',
        '---\nlayout: post\nslug: ' + slug + '\n'
      ));
      return file;
    }))
    // tidy up front matter
    .pipe(replace(/(wordpress_id|wordpress_url|author_login|author_email|author_url): .*?\n/g, ''))
    .pipe(replace(/\npublished: .*\n/, '\n'))
    .pipe(replace(/\nstatus: .*\n/, '\n'))
    // collapse the "author:" key and the four lines after it into one line
    .pipe(replace(/author:\n(.*?\n){4}/, 'author: Hugh Grigg\n'))
    .pipe(replace(/.*?: !binary \|-\n .*?\n/g, ''))
    // rewrite title / excerpt values as folded ">" scalars
    .pipe(replace(/\n(title|excerpt):[\s!]+(.*)\n/g, '\n$1: >\n $2\n'))
    // strip single quotes wrapping the title value
    .pipe(replace(/\ntitle: >\n '(.*?)'/, '\ntitle: >\n $1'))
    .pipe(replace(/\nexcerpt:/, '\ndescription:'))
    // decode wordpress' html entity encoding
    .pipe(entities('decode'))
    // fix urls (uploads first, so the more general site-root rule below does
    // not rewrite them to a different path)
    .pipe(replace('http://eastasiastudent.net/wp-content/uploads/', '/img/'))
    .pipe(replace('http://eastasiastudent.net/', '/'))
    // styled paragraphs -> hugo shortcodes
    // These two rules must run BEFORE the generic html -> markdown pass: that
    // pass inserts newlines around <p>...</p> and then strips every
    // <p style="...">, after which these single-line patterns could never match.
    .pipe(replace(/<p style="font-size: smaller;">(.*?)<\/p>/g, '\n{{< note >}}$1{{< /note >}}\n'))
    .pipe(replace(/<p style="font-size: larger;">(.*?)<\/p>/g, '{{< larger >}}$1{{< /larger >}}'))
    // apply some html -> markdown replacements
    .pipe(replace(/<br ?\/?>\n/g, '\n'))
    .pipe(replace(/<a href="(.*?)">(.*?)<\/a>/g, '[$2]($1)'))
    .pipe(replace(/<a href="(.*?)" title="(.*?)">(.*?)<\/a>/g, '[$3]($1 "$2")'))
    .pipe(replace(/<a title="(.*?)" href="(.*?)">(.*?)<\/a>/g, '[$3]($2 "$1")'))
    .pipe(replace(/<em>(.*?)<\/em>/g, '_$1_'))
    // strong emphasis is "**...**" in Markdown; a single "*" would render as
    // plain emphasis and lose the distinction from <em>
    .pipe(replace(/<strong>(.*?)<\/strong>/g, '**$1**'))
    // put block-level tags on their own lines before converting them
    .pipe(replace(/<h([0-9])>(.*?)<\/h([0-9])>/g, '\n<h$1>$2</h$3>\n'))
    .pipe(replace(/<(ul|ol|p|h[0-9]|blockquote)/g, '\n<$1'))
    .pipe(replace(/<\/(ul|ol|p|h[0-9]|blockquote)>/g, '\n</$1>\n'))
    .pipe(replace(/<h1>([\s\S]*?)<\/h1>/g, '# $1'))
    .pipe(replace(/<h2>([\s\S]*?)<\/h2>/g, '## $1'))
    .pipe(replace(/<h3>([\s\S]*?)<\/h3>/g, '### $1'))
    .pipe(replace(/<h4>([\s\S]*?)<\/h4>/g, '#### $1'))
    .pipe(replace(/<h5>([\s\S]*?)<\/h5>/g, '##### $1'))
    .pipe(replace(/<h6>([\s\S]*?)<\/h6>/g, '###### $1'))
    .pipe(replace(/<p( style=".*?")?>([\s\S]*?)<\/p>/g, '\n$2\n'))
    .pipe(replace(/<li>(.*?)<\/li>/g, ' - $1'))
    .pipe(replace(/<\/?[ou]l( style=.*?)?>/g, ''))
    .pipe(replace(/\n\s?<\/p>\n?/g, ''))
    .pipe(replace(/\n\s+\n/g, '\n\n'))
    // apply some custom hugo tags
    .pipe(replace(/<span class="chinese"( lang="zh")?>(.*?)<\/span>/g, '{{< hanzi >}}$2{{< /hanzi >}}'))
    .pipe(replace(/<span class="hanzi"( lang="zh")?>(.*?)<\/span>/g, '{{< hanzi >}}$2{{< /hanzi >}}'))
    .pipe(replace(/<span class="reading">(.*?)<\/span>/g, '{{< reading >}}$1{{< /reading >}}'))
    .pipe(replace(/<span class="pinyin">(.*?)<\/span>/g, '{{< reading >}}$1{{< /reading >}}'))
    .pipe(replace(/<span class="gloss">(.*?)<\/span>/g, '{{< gloss >}}$1{{< /gloss >}}'))
    .pipe(replace(/<span class="note">(.*?)<\/span>/g, '{{< note >}}$1{{< /note >}}'))
    .pipe(replace(/<span style="font-size: smaller;">(.*?)<\/span>/g, '{{< note >}}$1{{< /note >}}'))
    .pipe(replace(/<span style="color: #888888;">(.*?)<\/span>/g, '{{< note >}}$1{{< /note >}}'))
    // tidy up some leftovers
    // (no "g" flag on the next rule: only the first remaining occurrence in
    // each file is removed)
    .pipe(replace(/<p style="font-size: larger;">/, ''))
    .pipe(replace(/<span style="line-height: ?(.*?);?">(.*?)<\/span>/g, '$2'))
    .pipe(replace(/ ?wp-image-[0-9]+/g, ''))
    .pipe(replace(/ (width|height)="[0-9]+"/g, ''))
    .pipe(replace(/\n\n\n/g, '\n\n'))
    .pipe(replace(/\n\n$/, '\n'))
    // write
    .pipe(gulp.dest('./processed/'));
});
{
"name": "post-process-jekyll-wordpress",
"version": "0.0.0",
"description": "",
"main": "gulpfile.js",
"author": "",
"license": "BSD-2-Clause",
"devDependencies": {
"gulp-rename": "~1.2.2",
"gulp-replace": "~0.5.3",
"gulp": "~3.8.11",
"gulp-html-entities": "0.0.3",
"gulp-filter": "~2.0.2",
"chalk": "~1.0.0",
"through-gulp": "~0.3.8"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment