Created
July 12, 2017 21:08
-
-
Save karlwilcox/6ff5767a0fe7b4f8bbbdebd250565ab5 to your computer and use it in GitHub Desktop.
XSL Transform to Convert a WordPress Page/Post export XML to separate files and build a _data.json file (e.g. for converting sites to HarpJs / EJS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Converts a WordPress "export" file into a set of separate files, one for
  each WordPress Page (or Post), and writes a Harp-style _data.json index of
  those files to standard output.

  xsl:result-document and xsl:analyze-string are XSLT 2.0 instructions, so an
  XSLT 2.0 (or later) processor such as Michael Kay's SaxonHE is required.
  A typical command line would be:

    java -jar saxon9HE.jar <WORDPRESS-EXPORT-FILE>.xml convertWPtoEJS.xsl > _data.json

  The pattern used to pick the file name out of each item's link can be
  overridden without editing this file by passing the linkPattern stylesheet
  parameter to the processor.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:xs="http://www.w3.org/2001/XMLSchema"
                xmlns:content="http://purl.org/rss/1.0/modules/content/"
                xmlns:local="urn:convert-wp-to-ejs:functions"
                version="2.0">

  <!-- Both the JSON index and the page files are plain text. The text output
       method always writes string values unescaped, so content that was
       wrapped in CDATA in the export needs no special handling here. -->
  <xsl:output method="text" encoding="UTF-8"/>

  <!-- Regular expression applied to each item's link element. Capture group 1
       becomes the (unique) part of the output file name. This can be anything
       you want to pick out of the link, just make sure it will be unique.
       If the link element is not in a useful format, consider matching
       against the guid element instead. -->
  <xsl:param name="linkPattern" as="xs:string"
             select="'http://your-web-site.com/(.*)/$'"/>

  <!-- Returns $text escaped for use between the double quotes of a JSON
       string: backslash, double quote, line feed, carriage return and tab
       are replaced by their JSON escape sequences. An empty sequence yields
       the empty string. XML 1.0 allows no other control characters, so these
       are the only ones that can reach this function from a parsed export. -->
  <xsl:function name="local:jsonEscape" as="xs:string">
    <xsl:param name="text" as="xs:string?"/>
    <!-- Backslashes must be doubled first, otherwise the backslashes added by
         the later steps would themselves be doubled. -->
    <xsl:variable name="step1" select="replace(string($text), '\\', '\\\\')"/>
    <xsl:variable name="step2" select="replace($step1, '&quot;', '\\&quot;')"/>
    <xsl:variable name="step3" select="replace($step2, '&#10;', '\\n')"/>
    <xsl:variable name="step4" select="replace($step3, '&#13;', '\\r')"/>
    <xsl:sequence select="replace($step4, '&#9;', '\\t')"/>
  </xsl:function>

  <xsl:template match="/rss/channel">
    <xsl:text>{</xsl:text> <!-- start JSON standard output -->
    <!-- Only items whose link matches the pattern produce output. Filtering
         here (rather than inside the loop) makes position() count exactly the
         entries that are written, which is what the comma logic relies on. -->
    <xsl:for-each select="item[matches(link, $linkPattern)]">
      <xsl:variable name="titleString" select="string-join(title, ' ')"/>
      <!-- Collect capture group 1 of every match; the first one is used. -->
      <xsl:variable name="linkParts" as="xs:string*">
        <xsl:analyze-string select="link" regex="{$linkPattern}">
          <xsl:matching-substring>
            <xsl:sequence select="regex-group(1)"/>
          </xsl:matching-substring>
        </xsl:analyze-string>
      </xsl:variable>
      <!-- You can add other parts to the (unique) filename here -->
      <xsl:variable name="fileName" select="concat('_', $linkParts[1], '.ejs')"/>

      <!-- This section copies the WP page content to a separate file -->
      <xsl:result-document href="{$fileName}" method="text">
        <xsl:value-of select="content:encoded"/>
        <!-- You can also output other WordPress elements as required here,
             wrapping them in HTML if necessary, for example dc:creator or
             wp:post_date (declare their namespaces on xsl:stylesheet). -->
      </xsl:result-document>

      <!-- The following lines write a basic Harp JSON entry to standard
           output. Entries are separated by a comma written BEFORE every entry
           except the first, so there is no trailing comma to clean up. -->
      <xsl:if test="position() gt 1">
        <xsl:text>,</xsl:text>
      </xsl:if>
      <xsl:text>"</xsl:text>
      <xsl:value-of select="local:jsonEscape($fileName)"/>
      <xsl:text>": {</xsl:text>
      <xsl:text>"pageTitle": "</xsl:text>
      <xsl:value-of select="local:jsonEscape($titleString)"/>
      <xsl:text>"}</xsl:text>
    </xsl:for-each>
    <xsl:text>}</xsl:text> <!-- close JSON standard output -->
  </xsl:template>

</xsl:stylesheet>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment