jamiedumont/export.js Secret

## export.js
// Grab all the dependencies we'll need for this.
var fs = require('fs'),
xml2js = require('xml2js'),
jsonfile = require('jsonfile'),
toMarkdown = require('to-markdown'),
_ = require('underscore'),
yaml = require('js-yaml'),
jsdom = require('jsdom'),
mkdirp = require('mkdirp'),
download = require('image-downloader');

const { JSDOM } = jsdom;

// Setup some variables that we'll need at the end of the script
var outputFolder = "./output/";
var parser = new xml2js.Parser();

/**
 * Converts the Wordpress XML export (`wpexport.xml`, next to this script) into
 * the JSON file Statamic imports (`./bikesoup.json`) and downloads every image
 * the posts reference into `./uploads`.
 *
 * Lookup tables and helpers are kept inside this function so that only the
 * entry point is added to module scope.
 */
function exportWordpressToStatamic() {
  // Wordpress author id -> Statamic author id. All names bar mine removed.
  const AUTHOR_MAP = {
    6: "6624f5ee-0a6e-483e-83b5-34c588c6fcbf",
    16: "46aab519-0723-42dc-9c4a-51d321b03a49",
    13: "11e3d834-5713-4094-ad27-f4b48c588112",
    30: "34246703-a3da-4085-ba8f-8ccd8f65ba3b",
    29: "f2eda8a7-b4ca-4e85-9024-833628f1400a",
    28: "0ed28477-1918-43c4-ba69-6c406e8670f5",
    2: "df9649bd-82a8-43b9-83d4-ca1c28f08ca8",
    19: "633898d2-6477-4e2f-a4ad-c496becfd026",
    23: "9e61f6a5-3c52-44b1-8db0-5c82220012e0", //jamiedumont
    22: "580ed808-75b5-4510-986e-9462f67f6f44",
    27: "2f75983b-eb88-465e-9d4c-e6254ab9d3d3",
    20: "a21d452d-389e-4834-9bb7-45ba689500c1",
    31: "f89550ca-65dc-40af-ac52-48667411aa6f",
    25: "c7815f02-6166-4615-a93a-80245c8b14db",
    15: "59e5f70f-1942-4434-952d-ce90f85f240e",
    14: "7c789b99-0653-42ea-af2b-6541b989237d",
    26: "1eecf805-0f20-4de3-b7b5-2eae0c1e03f6",
    11: "ad250945-1538-4ce7-9282-e462f18e458b",
    21: "f91ea037-fbe8-4e4c-bb0c-44821569b77d",
    24: "b9a7129f-c2c3-4f7e-a955-ababe24f8ac6"
  };

  // Absolute locations the old site served uploads from. Every one of them is
  // rewritten to the new relative uploads folder.
  const UPLOAD_URL_PREFIXES = [
    'http://www.bikesoup.com/magazine/wp-content/uploads',
    'http://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets',
    'https://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets'
  ];

  const PUBLIC_UPLOADS = '/assets/uploads'; // path written into the content
  const LOCAL_UPLOADS = './uploads';        // where the files are saved
  const GALLERY_PLACEHOLDER = '[gallery]';
  const DRAFT_ORDER = '_2018-01-01';        // "order" given to unpublished entries

  // A gallery shortcode, and the quoted id list inside it.
  const GALLERY_REGEX = /\[gallery ids=\".*?\"\]/g;
  const GALLERY_IDS_REGEX = /"([^"]+)"/;
  // A caption shortcode: opening tag, inner HTML, closing tag.
  const CAPTION_REGEX = /(\[caption.*?])(.*?)(\[\/caption\])/g;

  // Custom converters for toMarkdown: unwrap <span>/<div>, and keep the <img>
  // of the <figure> elements we generate as raw HTML.
  const MARKDOWN_CONVERTERS = [
    {
      filter: ['span', 'div'],
      replacement: function(content) {
        return content;
      }
    },
    {
      filter: function(node) {
        return node.nodeName === 'IMG' && node.parentNode.nodeName === 'FIGURE';
      },
      replacement: function(innerHTML, node) {
        return `<img src="${node.src}" />`;
      }
    }
  ];

  /**
   * Returns the text of an xml2js field (an array holding the element's
   * text), or '' when the field is missing or is not plain text.
   */
  function textOf(field) {
    const value = Array.isArray(field) ? field[0] : field;
    return typeof value === 'string' ? value : '';
  }

  /** Replaces every literal occurrence of `search` in `text`. */
  function replaceEvery(text, search, replacement) {
    return text.split(search).join(replacement);
  }

  /** Turns every `&amp;` entity back into a plain ampersand. */
  function decodeAmpersands(text) {
    return replaceEvery(text, '&amp;', '&');
  }

  /** Splits a delimited string into trimmed, non-empty items. */
  function splitList(text, separator) {
    return text
      .split(separator)
      .map((item) => item.trim())
      .filter((item) => item !== '');
  }

  /**
   * Downloads `url` into the directory `destDir`, creating it first if needed.
   * Failures are logged and do not stop the export.
   */
  function downloadImage(url, destDir) {
    if (!fs.existsSync(destDir)) {
      try {
        mkdirp.sync(destDir);
        console.log(`${destDir} created`);
      } catch (err) {
        console.error(err);
        return;
      }
    }

    // Promise API when the installed image-downloader offers it...
    if (typeof download.image === 'function') {
      download.image({ url: url, dest: destDir }).then(({ filename }) => {
        console.log(`File saved to: ${filename}`);
      }).catch((err) => {
        console.error(err);
      });
      return;
    }

    // ...otherwise the older callback API.
    download({
      url: url,
      dest: destDir,
      done: function(err, filename) {
        if (err) {
          console.error(err);
          return;
        }
        console.log(`File saved to: ${filename}`);
      }
    });
  }

  /**
   * Starts the download of one post image and returns the URL the content
   * should use for it. URLs outside the known upload locations are left
   * untouched and are not downloaded.
   */
  function storeImage(url) {
    const prefix = UPLOAD_URL_PREFIXES.find((candidate) => url.startsWith(candidate));
    if (prefix === undefined) {
      console.warn(`Image is not in a known uploads location, left as-is: ${url}`);
      return url;
    }

    const relativePath = url.slice(prefix.length);
    const fileDest = LOCAL_UPLOADS + relativePath;
    downloadImage(url, fileDest.substring(0, fileDest.lastIndexOf('/')));
    return PUBLIC_UPLOADS + relativePath;
  }

  /**
   * Takes a gallery shortcode and the post's id -> URL mapping and returns
   * the URLs of the gallery's images. Ids without a known URL are dropped.
   */
  function galleryImages(shortcode, imageURLs) {
    const ids = shortcode.match(GALLERY_IDS_REGEX);
    if (!ids) {
      return [];
    }
    return ids[1]
      .split(',')
      .map((uid) => imageURLs[uid.trim()])
      .filter((url) => url !== undefined);
  }

  /**
   * Replacer for CAPTION_REGEX: turns a caption shortcode into a semantic
   * <figure>. The caption is whatever follows the self-closing tag. A
   * shortcode without an <img> is returned unchanged.
   */
  function captionToFigure(match, openTag, inner) {
    const img = new JSDOM(inner).window.document.querySelector('img');
    if (!img) {
      return match;
    }
    const tail = inner.match(/\/>(.*)/);
    const caption = tail ? tail[1].trim() : '';
    return `<figure><img src="${img.src}" alt="${caption}"><figcaption>${caption}</figcaption></figure>`;
  }

  /**
   * Converts one <post> node of the export into `{ slug, order, data }`,
   * where `data` is the entry Statamic stores. Starts the download of the
   * post's images as a side effect.
   */
  function convertPost(post) {
    const slug = textOf(post.slug);
    const title = decodeAmpersands(textOf(post.title));

    // Unpublished entries get the placeholder order instead of their date.
    const order = textOf(post.status) === 'draft' ? DRAFT_ORDER : textOf(post.date);

    // Undefined for unmapped authors, in which case the key is left out of the JSON.
    const author = AUTHOR_MAP[textOf(post.author)];

    // Categories without "Uncategorized" and without empty entries.
    const categories = splitList(textOf(post.category), '|')
      .map(decodeAmpersands)
      .filter((name) => name !== 'Uncategorized');
    const tags = splitList(textOf(post.tag), '|');

    // Download every image of the post and collect the relative URLs.
    const allImages = [];
    textOf(post.image_url).split(',').forEach((rawUrl) => {
      const url = rawUrl.trim();
      if (url) {
        allImages.push(storeImage(url));
      }
    });

    // Image id -> relative URL, used to resolve gallery shortcodes.
    const imageURLs = _.object(textOf(post.image_id).split(','), allImages);

    // The body is HTML + shortcodes. Weed out absolute image URLs first.
    let body = textOf(post.body);
    UPLOAD_URL_PREFIXES.forEach((prefix) => {
      body = replaceEvery(body, prefix, PUBLIC_UPLOADS);
    });

    // Pull each gallery out into its own Replicator block, leaving a
    // placeholder behind that the content is split on below.
    const galleries = [];
    body = body.replace(GALLERY_REGEX, (shortcode) => {
      galleries.push({
        type: "gallery",
        images: galleryImages(shortcode, imageURLs)
      });
      return GALLERY_PLACEHOLDER;
    });

    body = body.replace(CAPTION_REGEX, captionToFigure);

    // Insert a gallery block between each markdown block, giving us the
    // complete Replicator field.
    const article_body = [];
    body.split(GALLERY_PLACEHOLDER).forEach((html, index) => {
      article_body.push({
        type: "markdown",
        content: toMarkdown(html, { converters: MARKDOWN_CONVERTERS })
      });
      if (galleries[index]) {
        article_body.push(galleries[index]);
      }
    });

    const data = {
      title: title,
      content: "",
      categories: categories,
      tags: tags,
      top_story: false,
      author: author,
      description: textOf(post.meta_description),
      article_body: article_body
    };

    // Not all articles have lead images; the first image is used when present.
    if (allImages.length > 0) {
      data.lead_image = allImages[0];
    }

    return { slug: slug, order: order, data: data };
  }

  // Read the XML file produced by Wordpress
  fs.readFile(__dirname + '/wpexport.xml', function(readErr, xml) {
    if (readErr) {
      console.error(readErr);
      return;
    }

    parser.parseString(xml, function(parseErr, result) {
      if (parseErr) {
        console.error(parseErr);
        return;
      }

      const posts = (result && result.data && result.data.post) || [];

      // Finished posts, keyed by slug. Posts without a slug are left out.
      const articles = {};
      posts.forEach((post) => {
        const article = convertPost(post);
        if (article.slug) {
          articles[article.slug] = {
            order: article.order,
            data: article.data
          };
        }
      });

      // Object to create the JSON format that Statamic expects
      const output = {
        collections: {
          articles: articles
        },
        pages: {},
        // Not using taxonomies during import. Will be sorting those
        // later within Statamic
        taxonomies: {
          categories: [],
          tags: []
        }
      };

      // Write to file system.
      fs.writeFile("./bikesoup.json", JSON.stringify(output, null, 4), (err) => {
        if (err) {
          console.error(err);
          return;
        }
        console.log("File has been created");
      });
    });
  });
}

exportWordpressToStatamic();
	// Grab all the dependencies we'll need for this.
	var fs = require('fs'),
	xml2js = require('xml2js'),
	jsonfile = require('jsonfile'),
	toMarkdown = require('to-markdown'),
	_ = require('underscore'),
	yaml = require('js-yaml'),
	jsdom = require('jsdom'),
	mkdirp = require('mkdirp'),
	download = require('image-downloader');

	const { JSDOM } = jsdom;

	// Setup some variables that we'll need at the end of the script
	var outputFolder = "./output/";
	var parser = new xml2js.Parser();

	/**
	 * Converts the Wordpress XML export (`wpexport.xml`, next to this script) into
	 * the JSON file Statamic imports (`./bikesoup.json`) and downloads every image
	 * the posts reference into `./uploads`.
	 *
	 * Lookup tables and helpers are kept inside this function so that only the
	 * entry point is added to module scope.
	 */
	function exportWordpressToStatamic() {
	  // Wordpress author id -> Statamic author id. All names bar mine removed.
	  const AUTHOR_MAP = {
	    6: "6624f5ee-0a6e-483e-83b5-34c588c6fcbf",
	    16: "46aab519-0723-42dc-9c4a-51d321b03a49",
	    13: "11e3d834-5713-4094-ad27-f4b48c588112",
	    30: "34246703-a3da-4085-ba8f-8ccd8f65ba3b",
	    29: "f2eda8a7-b4ca-4e85-9024-833628f1400a",
	    28: "0ed28477-1918-43c4-ba69-6c406e8670f5",
	    2: "df9649bd-82a8-43b9-83d4-ca1c28f08ca8",
	    19: "633898d2-6477-4e2f-a4ad-c496becfd026",
	    23: "9e61f6a5-3c52-44b1-8db0-5c82220012e0", //jamiedumont
	    22: "580ed808-75b5-4510-986e-9462f67f6f44",
	    27: "2f75983b-eb88-465e-9d4c-e6254ab9d3d3",
	    20: "a21d452d-389e-4834-9bb7-45ba689500c1",
	    31: "f89550ca-65dc-40af-ac52-48667411aa6f",
	    25: "c7815f02-6166-4615-a93a-80245c8b14db",
	    15: "59e5f70f-1942-4434-952d-ce90f85f240e",
	    14: "7c789b99-0653-42ea-af2b-6541b989237d",
	    26: "1eecf805-0f20-4de3-b7b5-2eae0c1e03f6",
	    11: "ad250945-1538-4ce7-9282-e462f18e458b",
	    21: "f91ea037-fbe8-4e4c-bb0c-44821569b77d",
	    24: "b9a7129f-c2c3-4f7e-a955-ababe24f8ac6"
	  };

	  // Absolute locations the old site served uploads from. Every one of them is
	  // rewritten to the new relative uploads folder.
	  const UPLOAD_URL_PREFIXES = [
	    'http://www.bikesoup.com/magazine/wp-content/uploads',
	    'http://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets',
	    'https://s3-eu-west-1.amazonaws.com/bikesoup-magazine-image-assets'
	  ];

	  const PUBLIC_UPLOADS = '/assets/uploads'; // path written into the content
	  const LOCAL_UPLOADS = './uploads';        // where the files are saved
	  const GALLERY_PLACEHOLDER = '[gallery]';
	  const DRAFT_ORDER = '_2018-01-01';        // "order" given to unpublished entries

	  // A gallery shortcode, and the quoted id list inside it.
	  const GALLERY_REGEX = /\[gallery ids=\".*?\"\]/g;
	  const GALLERY_IDS_REGEX = /"([^"]+)"/;
	  // A caption shortcode: opening tag, inner HTML, closing tag.
	  const CAPTION_REGEX = /(\[caption.*?])(.*?)(\[\/caption\])/g;

	  // Custom converters for toMarkdown: unwrap <span>/<div>, and keep the <img>
	  // of the <figure> elements we generate as raw HTML.
	  const MARKDOWN_CONVERTERS = [
	    {
	      filter: ['span', 'div'],
	      replacement: function(content) {
	        return content;
	      }
	    },
	    {
	      filter: function(node) {
	        return node.nodeName === 'IMG' && node.parentNode.nodeName === 'FIGURE';
	      },
	      replacement: function(innerHTML, node) {
	        return `<img src="${node.src}" />`;
	      }
	    }
	  ];

	  /**
	   * Returns the text of an xml2js field (an array holding the element's
	   * text), or '' when the field is missing or is not plain text.
	   */
	  function textOf(field) {
	    const value = Array.isArray(field) ? field[0] : field;
	    return typeof value === 'string' ? value : '';
	  }

	  /** Replaces every literal occurrence of `search` in `text`. */
	  function replaceEvery(text, search, replacement) {
	    return text.split(search).join(replacement);
	  }

	  /** Turns every `&amp;` entity back into a plain ampersand. */
	  function decodeAmpersands(text) {
	    return replaceEvery(text, '&amp;', '&');
	  }

	  /** Splits a delimited string into trimmed, non-empty items. */
	  function splitList(text, separator) {
	    return text
	      .split(separator)
	      .map((item) => item.trim())
	      .filter((item) => item !== '');
	  }

	  /**
	   * Downloads `url` into the directory `destDir`, creating it first if needed.
	   * Failures are logged and do not stop the export.
	   */
	  function downloadImage(url, destDir) {
	    if (!fs.existsSync(destDir)) {
	      try {
	        mkdirp.sync(destDir);
	        console.log(`${destDir} created`);
	      } catch (err) {
	        console.error(err);
	        return;
	      }
	    }

	    // Promise API when the installed image-downloader offers it...
	    if (typeof download.image === 'function') {
	      download.image({ url: url, dest: destDir }).then(({ filename }) => {
	        console.log(`File saved to: ${filename}`);
	      }).catch((err) => {
	        console.error(err);
	      });
	      return;
	    }

	    // ...otherwise the older callback API.
	    download({
	      url: url,
	      dest: destDir,
	      done: function(err, filename) {
	        if (err) {
	          console.error(err);
	          return;
	        }
	        console.log(`File saved to: ${filename}`);
	      }
	    });
	  }

	  /**
	   * Starts the download of one post image and returns the URL the content
	   * should use for it. URLs outside the known upload locations are left
	   * untouched and are not downloaded.
	   */
	  function storeImage(url) {
	    const prefix = UPLOAD_URL_PREFIXES.find((candidate) => url.startsWith(candidate));
	    if (prefix === undefined) {
	      console.warn(`Image is not in a known uploads location, left as-is: ${url}`);
	      return url;
	    }

	    const relativePath = url.slice(prefix.length);
	    const fileDest = LOCAL_UPLOADS + relativePath;
	    downloadImage(url, fileDest.substring(0, fileDest.lastIndexOf('/')));
	    return PUBLIC_UPLOADS + relativePath;
	  }

	  /**
	   * Takes a gallery shortcode and the post's id -> URL mapping and returns
	   * the URLs of the gallery's images. Ids without a known URL are dropped.
	   */
	  function galleryImages(shortcode, imageURLs) {
	    const ids = shortcode.match(GALLERY_IDS_REGEX);
	    if (!ids) {
	      return [];
	    }
	    return ids[1]
	      .split(',')
	      .map((uid) => imageURLs[uid.trim()])
	      .filter((url) => url !== undefined);
	  }

	  /**
	   * Replacer for CAPTION_REGEX: turns a caption shortcode into a semantic
	   * <figure>. The caption is whatever follows the self-closing tag. A
	   * shortcode without an <img> is returned unchanged.
	   */
	  function captionToFigure(match, openTag, inner) {
	    const img = new JSDOM(inner).window.document.querySelector('img');
	    if (!img) {
	      return match;
	    }
	    const tail = inner.match(/\/>(.*)/);
	    const caption = tail ? tail[1].trim() : '';
	    return `<figure><img src="${img.src}" alt="${caption}"><figcaption>${caption}</figcaption></figure>`;
	  }

	  /**
	   * Converts one <post> node of the export into `{ slug, order, data }`,
	   * where `data` is the entry Statamic stores. Starts the download of the
	   * post's images as a side effect.
	   */
	  function convertPost(post) {
	    const slug = textOf(post.slug);
	    const title = decodeAmpersands(textOf(post.title));

	    // Unpublished entries get the placeholder order instead of their date.
	    const order = textOf(post.status) === 'draft' ? DRAFT_ORDER : textOf(post.date);

	    // Undefined for unmapped authors, in which case the key is left out of the JSON.
	    const author = AUTHOR_MAP[textOf(post.author)];

	    // Categories without "Uncategorized" and without empty entries.
	    const categories = splitList(textOf(post.category), '|')
	      .map(decodeAmpersands)
	      .filter((name) => name !== 'Uncategorized');
	    const tags = splitList(textOf(post.tag), '|');

	    // Download every image of the post and collect the relative URLs.
	    const allImages = [];
	    textOf(post.image_url).split(',').forEach((rawUrl) => {
	      const url = rawUrl.trim();
	      if (url) {
	        allImages.push(storeImage(url));
	      }
	    });

	    // Image id -> relative URL, used to resolve gallery shortcodes.
	    const imageURLs = _.object(textOf(post.image_id).split(','), allImages);

	    // The body is HTML + shortcodes. Weed out absolute image URLs first.
	    let body = textOf(post.body);
	    UPLOAD_URL_PREFIXES.forEach((prefix) => {
	      body = replaceEvery(body, prefix, PUBLIC_UPLOADS);
	    });

	    // Pull each gallery out into its own Replicator block, leaving a
	    // placeholder behind that the content is split on below.
	    const galleries = [];
	    body = body.replace(GALLERY_REGEX, (shortcode) => {
	      galleries.push({
	        type: "gallery",
	        images: galleryImages(shortcode, imageURLs)
	      });
	      return GALLERY_PLACEHOLDER;
	    });

	    body = body.replace(CAPTION_REGEX, captionToFigure);

	    // Insert a gallery block between each markdown block, giving us the
	    // complete Replicator field.
	    const article_body = [];
	    body.split(GALLERY_PLACEHOLDER).forEach((html, index) => {
	      article_body.push({
	        type: "markdown",
	        content: toMarkdown(html, { converters: MARKDOWN_CONVERTERS })
	      });
	      if (galleries[index]) {
	        article_body.push(galleries[index]);
	      }
	    });

	    const data = {
	      title: title,
	      content: "",
	      categories: categories,
	      tags: tags,
	      top_story: false,
	      author: author,
	      description: textOf(post.meta_description),
	      article_body: article_body
	    };

	    // Not all articles have lead images; the first image is used when present.
	    if (allImages.length > 0) {
	      data.lead_image = allImages[0];
	    }

	    return { slug: slug, order: order, data: data };
	  }

	  // Read the XML file produced by Wordpress
	  fs.readFile(__dirname + '/wpexport.xml', function(readErr, xml) {
	    if (readErr) {
	      console.error(readErr);
	      return;
	    }

	    parser.parseString(xml, function(parseErr, result) {
	      if (parseErr) {
	        console.error(parseErr);
	        return;
	      }

	      const posts = (result && result.data && result.data.post) || [];

	      // Finished posts, keyed by slug. Posts without a slug are left out.
	      const articles = {};
	      posts.forEach((post) => {
	        const article = convertPost(post);
	        if (article.slug) {
	          articles[article.slug] = {
	            order: article.order,
	            data: article.data
	          };
	        }
	      });

	      // Object to create the JSON format that Statamic expects
	      const output = {
	        collections: {
	          articles: articles
	        },
	        pages: {},
	        // Not using taxonomies during import. Will be sorting those
	        // later within Statamic
	        taxonomies: {
	          categories: [],
	          tags: []
	        }
	      };

	      // Write to file system.
	      fs.writeFile("./bikesoup.json", JSON.stringify(output, null, 4), (err) => {
	        if (err) {
	          console.error(err);
	          return;
	        }
	        console.log("File has been created");
	      });
	    });
	  });
	}

	exportWordpressToStatamic();