-
-
Save jamiedumont/f52399ff5f7fd0e75c40d85aa63d1476 to your computer and use it in GitHub Desktop.
Custom WordPress to Statamic export script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Grab all the dependencies we'll need for this.
var fs = require('fs'),
    xml2js = require('xml2js'),
    jsonfile = require('jsonfile'),
    toMarkdown = require('to-markdown'),
    _ = require('underscore'),
    yaml = require('js-yaml'),
    jsdom = require('jsdom'),
    mkdirp = require('mkdirp'),
    download = require('image-downloader');
const { JSDOM } = jsdom;

// Setup some variables that we'll need at the end of the script.
// NOTE(review): outputFolder is not referenced anywhere in this script.
var outputFolder = "./output/";
var parser = new xml2js.Parser();

// Each of these map to an author, where the key is the WordPress
// id and the value is the Statamic id. All names bar mine removed.
const AUTHOR_MAP = {
  6: "6624f5ee-0a6e-483e-83b5-34c588c6fcbf",
  16: "46aab519-0723-42dc-9c4a-51d321b03a49",
  13: "11e3d834-5713-4094-ad27-f4b48c588112",
  30: "34246703-a3da-4085-ba8f-8ccd8f65ba3b",
  29: "f2eda8a7-b4ca-4e85-9024-833628f1400a",
  28: "0ed28477-1918-43c4-ba69-6c406e8670f5",
  2: "df9649bd-82a8-43b9-83d4-ca1c28f08ca8",
  19: "633898d2-6477-4e2f-a4ad-c496becfd026",
  23: "9e61f6a5-3c52-44b1-8db0-5c82220012e0", //jamiedumont
  22: "580ed808-75b5-4510-986e-9462f67f6f44",
  27: "2f75983b-eb88-465e-9d4c-e6254ab9d3d3",
  20: "a21d452d-389e-4834-9bb7-45ba689500c1",
  31: "f89550ca-65dc-40af-ac52-48667411aa6f",
  25: "c7815f02-6166-4615-a93a-80245c8b14db",
  15: "59e5f70f-1942-4434-952d-ce90f85f240e",
  14: "7c789b99-0653-42ea-af2b-6541b989237d",
  26: "1eecf805-0f20-4de3-b7b5-2eae0c1e03f6",
  11: "ad250945-1538-4ce7-9282-e462f18e458b",
  21: "f91ea037-fbe8-4e4c-bb0c-44821569b77d",
  24: "b9a7129f-c2c3-4f7e-a955-ababe24f8ac6"
};

// Absolute URL prefixes that are rewritten to local paths, both for the
// downloaded images and for references inside the post content.
const URLS_TO_REPLACE = [
  'http://www.bikesoup.com/magazine/wp-content/uploads',
  'http://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets',
  'https://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets'
];

// A carefully crafted RegEx that grabs a gallery shortcode...
const GALLERY_REGEX = /\[gallery ids=\".*?\"\]/g;
// ... and the ids within it.
const GALLERY_IDS_REGEX = /"([^"]+)"/;
// Matches a caption shortcode and captures its inner HTML.
const CAPTION_REGEX = /(\[caption.*?])(.*?)(\[\/caption\])/g;

// Custom filters for the toMarkdown function. These give us the right structure
// (plenty of <p> tags) and preserve the <figure>'s created from captions.
const MARKDOWN_CONVERTERS = [
  {
    // Unwrap <span>/<div>, keeping only their converted content.
    filter: ['span', 'div'],
    replacement: function(content) {
      return content;
    }
  },
  {
    // Keep images that live inside a <figure> as raw HTML.
    filter: function(node) {
      return node.nodeName === 'IMG' && !!node.parentNode && node.parentNode.nodeName === 'FIGURE';
    },
    replacement: function(content, node) {
      return `<img src="${node.src}" />`;
    }
  }
];

/**
 * Read the first value of a field on a parsed post.
 * The script indexes every field with [0], i.e. it expects arrays of values.
 *
 * @param {Object} post - One parsed <post> element.
 * @param {string} field - Name of the child element to read.
 * @returns {string} The first value, or '' when the field is missing or empty.
 */
function firstValue(post, field) {
  const values = post[field];
  if (!values || values.length === 0) {
    return '';
  }
  const value = values[0];
  if (typeof value === 'string') {
    return value;
  }
  // NOTE(review): assumes elements with attributes expose their text under "_"
  // (xml2js default charkey) - confirm against the export format.
  if (value && typeof value._ === 'string') {
    return value._;
  }
  return '';
}

/**
 * Replace every "&amp;" entity with a plain ampersand.
 *
 * @param {string} text
 * @returns {string}
 */
function decodeAmpersands(text) {
  return text.replace(/&amp;/g, '&');
}

/**
 * Split a delimited string into trimmed, non-empty items.
 *
 * @param {string} str - Delimited list, possibly empty.
 * @param {string} separator - Delimiter to split on.
 * @returns {string[]}
 */
function splitList(str, separator) {
  return str
    .split(separator)
    .map((item) => item.trim())
    .filter((item) => item !== '');
}

/**
 * Work out where an absolute image URL should live locally.
 *
 * @param {string} image - Absolute image URL.
 * @returns {?{output: string, dir: string}} `output` is the path used in the
 *   generated content, `dir` is the directory the file is downloaded into.
 *   Returns null when the URL does not start with a known prefix.
 */
function localizeImage(image) {
  // Pick the one prefix this URL actually starts with, rather than letting the
  // last prefix in the list win for every image.
  const prefix = URLS_TO_REPLACE.find((url) => image.startsWith(url));
  if (!prefix) {
    return null;
  }
  const rest = image.slice(prefix.length);
  const fileDest = `./uploads${rest}`;
  return {
    output: `/assets/uploads${rest}`,
    dir: fileDest.substring(0, fileDest.lastIndexOf("/"))
  };
}

/**
 * Make sure the destination directory exists, then download the image into it.
 * Failures are logged and do not abort the export.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} dir - Destination directory.
 */
function fetchImage(url, dir) {
  try {
    if (!fs.existsSync(dir)) {
      mkdirp.sync(dir);
      console.log(`${dir} created`);
    }
  } catch (err) {
    console.error(err);
    return;
  }
  // Download exactly once, and only after the directory exists.
  download.image({ url: url, dest: dir })
    .then(({ filename }) => {
      console.log(`File saved to: ${filename}`);
    })
    .catch((err) => {
      console.error(err);
    });
}

/**
 * Takes Regex match of gallery shortcode.
 * Returns array of Image UIDs (empty when the shortcode carries no ids).
 *
 * @param {string} match - A matched gallery shortcode.
 * @returns {string[]}
 */
function returnImageUIDs(match) {
  const ids = match.match(GALLERY_IDS_REGEX);
  if (!ids) {
    return [];
  }
  return splitList(ids[1], ',');
}

/**
 * Takes array of UIDs.
 * Returns the local image URLs for a gallery, skipping UIDs with no known image.
 *
 * @param {string[]} uidArray - Image UIDs from a gallery shortcode.
 * @param {Object} imageURLs - Mapping of image UID to local image URL.
 * @returns {string[]}
 */
function returnGallery(uidArray, imageURLs) {
  return uidArray
    .map((uid) => imageURLs[uid])
    // Unknown UIDs would otherwise be serialised as null entries in the JSON.
    .filter((url) => url !== undefined);
}

/**
 * Replacement callback for CAPTION_REGEX: turns a caption shortcode into a
 * semantic <figure> element.
 *
 * @param {string} match - The whole shortcode.
 * @param {string} openTag - The opening [caption ...] tag (unused).
 * @param {string} inner - The HTML between the opening and closing tags.
 * @returns {string} The <figure> markup, or `inner` unchanged when it contains no <img>.
 */
function captionToFigure(match, openTag, inner) {
  const img = new JSDOM(inner).window.document.querySelector("img");
  if (!img) {
    return inner;
  }
  // The caption text is whatever follows the self-closing "/>" of the image tag.
  const tail = inner.match(/\/>(.*)/);
  const caption = tail ? tail[1].trim() : '';
  // Keep a quote in the caption from terminating the alt attribute early.
  const alt = caption.replace(/"/g, '&quot;');
  return `<figure><img src="${img.src}" alt="${alt}"><figcaption>${caption}</figcaption></figure>`;
}

/**
 * Convert one parsed post into the structure stored in the Statamic import file.
 * Starts a download for every image attached to the post as a side effect.
 *
 * @param {Object} post - One parsed <post> element.
 * @returns {{slug: string, order: string, data: Object}}
 */
function convertPost(post) {
  // Start with the simple stuff: the slug, the title (cleaning up ampersands)
  // and the date.
  const slug = firstValue(post, 'slug');
  const title = decodeAmpersands(firstValue(post, 'title'));
  let date = firstValue(post, 'date');

  // Use the WordPress author id to find the Statamic author id.
  const author = AUTHOR_MAP[firstValue(post, 'author')];

  // Categories: clean up ampersands, drop empty entries and "Uncategorized".
  const category = splitList(firstValue(post, 'category'), "|")
    .map(decodeAmpersands)
    .filter((cat) => cat !== "Uncategorized");

  // Repeat the same process for "tags", dropping empty entries.
  const tags = splitList(firstValue(post, 'tag'), "|");

  // Image ids and image URLs are parallel comma-separated lists. They are not
  // filtered here so that positions keep lining up.
  const image_ids = firstValue(post, 'image_id').split(',');
  const images = firstValue(post, 'image_url').split(',');

  // All the images for this post after they've been made relative...
  const allImages = [];
  // ... and [uid, url] pairs used later when we replace gallery shortcodes.
  const imagePairs = [];

  images.forEach((rawImage, index) => {
    const image = rawImage.trim();
    if (!image) {
      return;
    }
    let imageOutput = image;
    const local = localizeImage(image);
    if (local) {
      fetchImage(image, local.dir);
      imageOutput = local.output;
    } else {
      // Unknown host: leave the absolute URL in place and do not download.
      console.warn(`No known URL prefix for image, leaving as-is: ${image}`);
    }
    allImages.push(imageOutput);
    const uid = (image_ids[index] || '').trim();
    if (uid) {
      imagePairs.push([uid, imageOutput]);
    }
  });

  // Grab the lead_image of each post. Used in header of new design.
  const lead_image = allImages[0];
  // Key:value mapping of Image UIDs to Image URLs.
  const imageURLs = _.object(imagePairs);

  // Grab the body of the post. This is HTML + shortcodes.
  let body = firstValue(post, 'body');

  // Weed out any absolute image URLs in the content. split/join replaces every
  // occurrence and treats the URL as literal text rather than a pattern.
  URLS_TO_REPLACE.forEach((url) => {
    body = body.split(url).join('/assets/uploads');
  });

  // Collect the gallery shortcodes, leaving an empty shortcode behind for us to
  // split on later.
  const galleryShortcodes = [];
  body = body.replace(GALLERY_REGEX, (match) => {
    galleryShortcodes.push(match);
    return '[gallery]';
  });

  // Replace caption shortcodes with semantic <figure> elements.
  body = body.replace(CAPTION_REGEX, captionToFigure);

  // Use the shortcode we put back to split the content into blocks, converting
  // each one to a markdown Replicator block.
  const textBlocks = body.split('[gallery]').map((html) => ({
    type: "markdown",
    content: toMarkdown(html, { converters: MARKDOWN_CONVERTERS })
  }));

  // Create Replicator blocks for each gallery.
  const galleryBlocks = galleryShortcodes.map((shortcode) => ({
    type: "gallery",
    images: returnGallery(returnImageUIDs(shortcode), imageURLs)
  }));

  // Insert a gallery block between each content block, giving us the complete
  // Replicator field, called 'article_body' here.
  const article_body = [];
  textBlocks.forEach((block, index) => {
    article_body.push(block);
    if (galleryBlocks[index]) {
      article_body.push(galleryBlocks[index]);
    }
  });

  // Arrange all the data for this post into an Object.
  const toYAML = {
    title: title,
    content: "",
    categories: category,
    tags: tags,
    top_story: false,
    author: author,
    description: firstValue(post, 'meta_description'),
    article_body: article_body
  };

  // Not all articles have lead images specified
  if (lead_image) {
    toYAML.lead_image = lead_image;
  }

  // Handles unpublished entries
  if (firstValue(post, 'status') === "draft") {
    date = `_2018-01-01`;
  }

  return { slug: slug, order: date, data: toYAML };
}

// Read the XML file produced by WordPress
fs.readFile(__dirname + '/wpexport.xml', function(err, data) {
  if (err) {
    console.error(err);
    process.exitCode = 1;
    return;
  }
  parser.parseString(data, function (err, result) {
    if (err) {
      console.error(err);
      process.exitCode = 1;
      return;
    }

    // All the posts
    const posts = (result && result.data && result.data.post) || [];
    // An empty Object that we'll add the finished posts to
    const articles = {};

    posts.forEach((post) => {
      const article = convertPost(post);
      // The check for "slug" removes the one article that doesn't have one!!!
      if (article.slug) {
        // Insert each article into the 'articles' Object with its slug as the key
        articles[article.slug] = {
          order: article.order,
          data: article.data
        };
      }
    });

    // Object to create the JSON format that Statamic expects
    const output = {
      collections: {
        // Here's all our posts.
        articles: articles
      },
      pages: {},
      // Not using taxonomies during import. Will be sorting those
      // later within Statamic
      taxonomies: {
        categories: [],
        tags: []
      }
    };

    // Write to file system.
    fs.writeFile("./bikesoup.json", JSON.stringify(output, null, 4), (err) => {
      if (err) {
        console.error(err);
        return;
      }
      console.log("File has been created");
    });
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment