Skip to content

Instantly share code, notes, and snippets.

@jamiedumont
Last active October 5, 2021 21:49
Show Gist options
  • Save jamiedumont/f52399ff5f7fd0e75c40d85aa63d1476 to your computer and use it in GitHub Desktop.
Save jamiedumont/f52399ff5f7fd0e75c40d85aa63d1476 to your computer and use it in GitHub Desktop.
Custom WordPress to Statamic export script
// Grab all the dependencies we'll need for this.
var fs = require('fs'),
xml2js = require('xml2js'),
jsonfile = require('jsonfile'),
toMarkdown = require('to-markdown'),
_ = require('underscore'),
yaml = require('js-yaml'),
jsdom = require('jsdom'),
mkdirp = require('mkdirp'),
download = require('image-downloader');
const { JSDOM } = jsdom;
// Script-wide values: an output folder setting and the xml2js parser
// instance used to read the WordPress export.
const outputFolder = "./output/";
const parser = new xml2js.Parser();
// Read the XML file produced by Wordpress
fs.readFile(__dirname + '/wpexport.xml', function(err, data) {
parser.parseString(data, function (err, result) {
// All the posts
const posts = result.data.post;
// An empty Object that we'll push the finished posts too
const articles = {};
// Begin a huge loop over all the posts.
// This is where 90% of the hard work gets done.
posts.forEach((post) => {
// Start with the simple stuff.
// Grab the slug...
const slug = post.slug[0];
// ... the title, decoding every HTML-encoded ampersand. The earlier call
// replaced '&' with '&' (a no-op), and a string pattern would only touch the
// first occurrence anyway, so a global regex is used.
const title = post.title[0].replace(/&amp;/g, '&');
// ... and the date. Kept as `let` because draft handling further down
// overwrites it.
let date = post.date[0];
// Create an array of all the image ids related to this post
// (comma separated in the XML). Used later when gallery shortcodes
// are replaced.
const image_ids_str = post.image_id[0];
const image_ids = image_ids_str.split(',');
// Lookup table of authors: each key is a WordPress author id and each value
// is the matching Statamic id. All names bar mine removed.
const author_map = {
    "2": "df9649bd-82a8-43b9-83d4-ca1c28f08ca8",
    "6": "6624f5ee-0a6e-483e-83b5-34c588c6fcbf",
    "11": "ad250945-1538-4ce7-9282-e462f18e458b",
    "13": "11e3d834-5713-4094-ad27-f4b48c588112",
    "14": "7c789b99-0653-42ea-af2b-6541b989237d",
    "15": "59e5f70f-1942-4434-952d-ce90f85f240e",
    "16": "46aab519-0723-42dc-9c4a-51d321b03a49",
    "19": "633898d2-6477-4e2f-a4ad-c496becfd026",
    "20": "a21d452d-389e-4834-9bb7-45ba689500c1",
    "21": "f91ea037-fbe8-4e4c-bb0c-44821569b77d",
    "22": "580ed808-75b5-4510-986e-9462f67f6f44",
    "23": "9e61f6a5-3c52-44b1-8db0-5c82220012e0", // jamiedumont
    "24": "b9a7129f-c2c3-4f7e-a955-ababe24f8ac6",
    "25": "c7815f02-6166-4615-a93a-80245c8b14db",
    "26": "1eecf805-0f20-4de3-b7b5-2eae0c1e03f6",
    "27": "2f75983b-eb88-465e-9d4c-e6254ab9d3d3",
    "28": "0ed28477-1918-43c4-ba69-6c406e8670f5",
    "29": "f2eda8a7-b4ca-4e85-9024-833628f1400a",
    "30": "34246703-a3da-4085-ba8f-8ccd8f65ba3b",
    "31": "f89550ca-65dc-40af-ac52-48667411aa6f"
};
// Take the WordPress author id from the post...
const [author_id] = post.author;
// ...and translate it to the Statamic author id used when the entry is
// assembled. Ids missing from author_map yield undefined.
const author = author_map[author_id];
// Build the list of categories from the pipe-delimited XML string.
const category_str = post.category[0];
const category = category_str
    .split("|")
    // Decode every HTML-encoded ampersand. Replacing '&' with '&' did
    // nothing, and a string pattern only replaces the first occurrence.
    .map((cat) => cat.replace(/&amp;/g, '&'))
    // Drop "Uncategorized" and empty entries. The earlier test,
    // el !== ("Uncategorized" || undefined || null || ''), collapsed to
    // el !== "Uncategorized", so empty entries were never removed.
    .filter((cat) => cat !== '' && cat !== "Uncategorized");
// Repeat the same process for "tags": split the pipe-delimited string and
// drop every empty entry. Checking only tags[0] discarded all tags whenever
// the string merely started with a delimiter, and kept empty entries that
// appeared later in the list.
const tag_str = post.tag[0];
const tags = tag_str.split("|").filter((tag) => tag !== '');
// Every image used in the post, as URLs (comma separated in the XML).
const images = post.image_url[0].split(',');
// Absolute URL prefixes that get swapped for local paths, both for the
// downloaded files and for image references inside the content.
const urlsToReplace = [
    'http://www.bikesoup.com/magazine/wp-content/uploads',
    'http://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets',
    'https://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets'
];
// Filled in below with this post's images once their URLs
// have been made relative.
const allImages = [];
// Loop over all images in this post: work out each one's site-relative URL,
// download the original into the matching local folder, and record the
// relative URL in allImages.
images.forEach((image) => {
    // Splitting an empty field yields '' entries; there is nothing to fetch.
    if (!image) {
        return;
    }
    // Pick the one known prefix this URL actually contains. Looping over all
    // prefixes and recomputing from `image` each time meant only the last
    // entry of urlsToReplace could ever take effect.
    const matchedUrl = urlsToReplace.find((url) => image.includes(url));
    if (matchedUrl === undefined) {
        // Not hosted at a known location: keep the absolute URL for the post
        // and skip the download rather than deriving a folder name from a URL.
        console.error(`No known upload location in ${image}; image left as-is`);
        allImages.push(image);
        return;
    }
    // Site-relative URL written into the exported content.
    const imageOutput = image.replace(matchedUrl, '/assets/uploads');
    // Local file path for the download, and the folder that contains it.
    const fileDest = image.replace(matchedUrl, './uploads');
    const destDir = fileDest.substring(0, fileDest.lastIndexOf("/"));
    // Add the local location of the image to our array. This happens whether
    // or not the download succeeds.
    allImages.push(imageOutput);
    try {
        // Created synchronously so the folder is guaranteed to exist before
        // the download starts writing into it.
        mkdirp.sync(destDir);
    } catch (err) {
        console.error(err);
        return;
    }
    // Start the download exactly once, using the promise-based API only.
    download.image({ url: image, dest: destDir })
        .then(({ filename }) => {
            console.log(`File saved to: ${filename}`);
        })
        .catch((err) => {
            // Log instead of rethrowing: a throw inside a promise handler only
            // surfaces as an unhandled rejection.
            console.error(err);
        });
});
// The first image of the post serves as its lead image
// (used in the header of the new design).
const [lead_image] = allImages;
// Pair each image id with the relative URL found at the same position in
// allImages. Used later when gallery shortcodes are replaced.
const imageURLs = image_ids.reduce((byId, id, index) => {
    byId[id] = allImages[index];
    return byId;
}, {});
// Grab the body of the post. This is HTML + shortcodes.
// Now the fun really starts.
let body = post.body[0];
// Make sure strings have a replaceAll that swaps every occurrence, not just
// the first one found. The native method is used where the runtime provides
// it; the fallback is only installed when it is missing. Either way the
// search text is matched literally. Compiling it into a RegExp made
// characters such as '.' and '+' act as pattern syntax, so URLs could
// over-match or fail to match.
if (typeof String.prototype.replaceAll !== 'function') {
    /**
     * Replace every literal occurrence of `search` in the string.
     * @param {string} search - Text to look for (not a pattern).
     * @param {string} replacement - Text inserted in place of each occurrence.
     * @returns {string} The string with all occurrences replaced.
     */
    String.prototype.replaceAll = function(search, replacement) {
        return this.split(search).join(replacement);
    };
}
// Rewrite every absolute image URL in the content to its site-relative form,
// one known prefix at a time.
body = urlsToReplace.reduce(
    (text, url) => text.replaceAll(url, '/assets/uploads'),
    body
);
// Matches a complete gallery shortcode such as [gallery ids="1,2,3"]...
const galleryRegex = /\[gallery ids=\".*?\"\]/g;
// ... and captures the text between the first pair of double quotes.
const galleryImageUIDSRegex = /"([^"]+)"/;
/**
 * Extract the image UIDs from a gallery shortcode.
 * @param {string} match - Shortcode text as matched by galleryRegex.
 * @returns {string[]} The comma-separated ids found inside the quotes.
 * @throws {TypeError} If the text contains no quoted id list.
 */
function returnImageUIDs(match) {
    const quoted = galleryImageUIDSRegex.exec(match);
    return quoted[1].split(',');
}
/**
 * Look up the URL for each image UID of a gallery.
 * @param {string[]} uidArray - Image UIDs taken from a gallery shortcode.
 * @returns {Array} One imageURLs entry per UID, in the same order; a UID
 *   with no entry produces undefined at its position.
 */
function returnGallery(uidArray) {
    return uidArray.map((uid) => imageURLs[uid]);
}
// Gallery shortcodes found in the body are collected here, in order.
let galleries = [];
// Swap each gallery shortcode for a bare '[gallery]' marker and remember the
// original text, so a Statamic Replicator block can take its place later.
body = body.replace(galleryRegex, (shortcode) => {
    galleries.push(shortcode);
    return '[gallery]';
});
// Replace caption shortcodes with semantic <figure> elements.
// The second capture group is everything between [caption ...] and [/caption].
const caption_regex = /(\[caption.*?])(.*?)(\[\/caption\])/g;
body = body.replace(caption_regex, (match, p1, inner) => {
    // Parse the inner markup to locate the image it contains.
    const img = new JSDOM(inner).window.document.querySelector("img");
    if (!img) {
        // No image inside the shortcode: leave the text untouched instead of
        // throwing and aborting the whole export.
        return match;
    }
    // The caption is whatever follows the first "/>". Trimming copes with
    // captions that are not separated from the tag by exactly one space, and
    // a shortcode without any "/>" simply gets an empty caption.
    const tail = inner.match(/\/>(.*)/);
    const caption = tail ? tail[1].trim() : '';
    // Double quotes would end the alt attribute early, so encode them there.
    const altText = caption.replace(/"/g, '&quot;');
    return `<figure><img src="${img.src}" alt="${altText}"><figcaption>${caption}</figcaption></figure>`;
});
// Use the '[gallery]' markers left in the body to split the content into blocks.
let bodyArray = body.split('[gallery]');
// Custom converters handed to toMarkdown. Together they give the right
// structure (plenty of <p> tags) and keep the images of <figure> elements
// as HTML.

// <span> and <div> wrappers are dropped; only their content is kept.
const replaceSpanDiv = {
    filter: ['span', 'div'],
    replacement: (content) => content,
};
// An <img> whose direct parent is a <figure> is emitted as a plain <img> tag
// carrying only its src.
const preserveFigure = {
    filter: (node) => node.nodeName === 'IMG' && node.parentNode.nodeName === 'FIGURE',
    replacement: (innerHTML, node) => `<img src="${node.src}" />`,
};
// Turn each content block into a "markdown" Replicator block, converting its
// HTML with toMarkdown and the custom converters.
bodyArray = bodyArray.map((html) => ({
    type: "markdown",
    content: toMarkdown(html, { converters: [replaceSpanDiv, preserveFigure] }),
}));
// Turn each collected gallery shortcode into a "gallery" Replicator block
// holding the URLs of its images.
galleries = galleries.map((shortcode) => {
    const uids = returnImageUIDs(shortcode);
    return { type: "gallery", images: returnGallery(uids) };
});
// Build the complete Replicator field, called 'article_body' here: each
// content block is followed by the gallery block at the same index,
// when one exists.
const article_body = [];
bodyArray.forEach((block, i) => {
    article_body.push(block);
    if (galleries[i]) {
        article_body.push(galleries[i]);
    }
});
// Gather everything collected for this post into a single Object.
const toYAML = {
    title,
    content: "",
    categories: category,
    tags,
    top_story: false,
    author,
    description: post.meta_description[0],
    article_body,
};
// Only posts that actually have a lead image get the lead_image field.
if (lead_image) {
    toYAML.lead_image = lead_image;
}
// Handles unpublished entries: drafts get a fixed, underscore-prefixed date
// instead of their own.
// NOTE(review): presumably the leading underscore is how the importer
// recognises a draft; confirm against the Statamic import format.
// The status is read explicitly as the first element when it is an array
// (as with the other post fields) and compared strictly, rather than relying
// on an array being coerced to a string by `==`.
if ((Array.isArray(post.status) ? post.status[0] : post.status) === "draft") {
    date = '_2018-01-01';
}
// Posts without a slug are skipped (this removes the one article that
// doesn't have one). Every other post is stored in the shared 'articles'
// Object with its slug as the key.
if (slug) {
    articles[slug] = { order: date, data: toYAML };
}
}); // End of huge posts.forEach loop.
// The JSON structure that Statamic expects for an import.
const output = {
    // Here's all our posts.
    collections: { articles },
    pages: {},
    // Taxonomies are left empty during import; they will be sorted out
    // later within Statamic.
    taxonomies: { categories: [], tags: [] },
};
// Serialise the result (4-space indentation) and write it to disk,
// reporting either the failure or the success.
fs.writeFile("./bikesoup.json", JSON.stringify(output, null, 4), (err) => {
    if (err) {
        console.error(err);
    } else {
        console.log("File has been created");
    }
});
});
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment