Skip to content

Instantly share code, notes, and snippets.

Created June 6, 2024 15:02
Show Gist options
  • Save adamziel/bfd5f9ee971110c9085164d915fde971 to your computer and use it in GitHub Desktop.
Save adamziel/bfd5f9ee971110c9085164d915fde971 to your computer and use it in GitHub Desktop.
A CLI utility to download all the WXR assets and rewrite their URLs
* Rewrites URLs in a WXR file while keeping track of the URLs found.
* This is a huge deal! It unlocks fast, streamed, resumable, fault-tolerant
* WordPress data migrations through WXR files AND directly between sites.
* In particular, this script:
* * Lists all the URLs found in the XML document
* * Rewrites the domain found in each URL while considering the context
* in which it was found (text nodes, cdata, block attributes, HTML attributes, HTML text)
* With these, we can:
* * Stream-process the WXR export
* * Pipe from ZipStreamReader [1] to stream-read directly from a zip
* * Pipe from AsyncHttp\Client [3] to stream-read directly from a remote data source
* * Start downloading the assets upfront with a configurable degree of parallelization
* * Pipe the rewritten output to another WXR file
* * Pipe to ZipStreamWriter [2] to stream-write directly to a zip
* * Pipe to AsyncHttp\Client [3] to stream-write directly to a remote data source
* [1] ZipStreamReader
* [2] ZipStreamWriter:
* [3] AsyncHttpClient:
// Base directory that the require_once lines further down resolve the
// HTML API classes against.
define('WP_INCLUDES', __DIR__.'/..');
// Where to find the streaming WP_XML_Processor in WP_INCLUDES/html-api/
// Use a version from this PR:
// NOTE(review): the PR link is not present in this copy of the file.
define('WP_XML_API_PATH', __DIR__.'/../html-api');
// Don't change anything below
// These aren't supported yet but will be:
// NOTE(review): whatever the line above introduced is not present in this copy.
// Parse the --name=value command-line flags; everything below is derived from them.
$args = parseArguments($argv);
// Path of the WXR file to read.
define('WXR_PATH', $args['wxr']);
// Scheme and host substituted into non-asset URLs; defaults to the
// Playground origin when --new-origin is not given.
define('NEW_ORIGIN', $args['new-origin'] ?? 'https://playground.internal');
// Prefix prepended to an asset's file name to form its rewritten URL.
define('NEW_ASSETS_PREFIX', $args['new-assets-prefix']);
// Directory the selected assets are downloaded into.
define('WRITE_IMAGES_TO_DIRECTORY', $args['downloads-path']);
/**
 * Parses the `--name=value` command-line flags understood by this script.
 *
 * Recognized flags are --wxr, --new-origin, --new-assets-prefix and
 * --downloads-path; any other argument is ignored. The value offset is
 * derived from the flag's own length, so a flag name can never be
 * mis-measured (the previous hard-coded offset for --new-assets-prefix was
 * one character short and left a leading "=" in the value).
 *
 * Normalization applied to the values:
 *  - new-origin:        trailing slashes removed.
 *  - new-assets-prefix: exactly one trailing slash.
 *  - downloads-path:    exactly one trailing slash.
 *
 * Prints a usage message to STDERR and terminates with exit status 1 when
 * --wxr, --new-assets-prefix or --downloads-path is missing or empty.
 *
 * @param array $argv Raw argument vector, e.g. the global $argv.
 * @return array Map with the keys 'wxr', 'new-origin', 'new-assets-prefix'
 *               and 'downloads-path'; 'new-origin' is null when not given.
 */
function parseArguments($argv) {
    $options = [
        'wxr' => null,
        'new-origin' => null,
        'new-assets-prefix' => null,
        'downloads-path' => null,
    ];

    foreach ($argv as $arg) {
        foreach (array_keys($options) as $name) {
            $flag = '--' . $name . '=';
            if (strpos($arg, $flag) !== 0) {
                continue;
            }

            $value = (string) substr($arg, strlen($flag));
            if ($name === 'new-origin') {
                $value = rtrim($value, '/');
            } elseif ($name !== 'wxr') {
                // Directory-like values always end with a single slash.
                $value = rtrim($value, '/') . '/';
            }
            $options[$name] = $value;
            break;
        }
    }

    if (!$options['wxr'] || !$options['new-assets-prefix'] || !$options['downloads-path']) {
        fwrite(STDERR, "Usage: php preprocess-wxr.php --wxr=<path-to-wxr-file> --new-assets-prefix=<new-assets-prefix> --downloads-path=<downloads-path>\n");
        // Without the required values the rest of the script cannot run.
        exit(1);
    }

    return $options;
}
require_once WP_INCLUDES . "/html-api/class-wp-html-token.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-span.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-text-replacement.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-decoder.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-attribute-token.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-decoder.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-tag-processor.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-open-elements.php";
require_once WP_INCLUDES . "/class-wp-token-map.php";
require_once WP_INCLUDES . "/html-api/html5-named-character-references.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-active-formatting-elements.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-active-formatting-elements.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-processor-state.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-unsupported-exception.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-processor.php";
require_once WP_XML_API_PATH . "/class-wp-xml-decoder.php";
require_once WP_XML_API_PATH . "/class-wp-xml-tag-processor.php";
require_once WP_XML_API_PATH . "/class-wp-xml-processor.php";
// Gather all the URLs from a WXR file
// NOTE(review): this section is missing lines in this copy (the stream
// arguments and closing of both constructor calls, the bodies/closers of the
// guards below, and presumably the process() and download_assets() calls).
// The comments describe only what is visible — confirm against the original.
$input_stream = fopen(WXR_PATH, 'rb+');
// First pass writes to /dev/null: presumably only the collected URL list is
// of interest here, not the rewritten document.
$output_stream = fopen('/dev/null', 'wb+');
$normalizer = new WP_WXR_Normalizer(
// Identity callback: every URL is returned unchanged.
function ($url) { return $url; }
fwrite(STDERR, "Downloading assets...\n\n");
$urls = $normalizer->get_found_urls();
fwrite(STDERR, print_r($urls, true));
// Download the ones looking like assets
// Maps each selected URL to its extension, file name and local download path.
$assets_details = [];
foreach($urls as $url) {
$parsed = parse_url($url);
// NOTE(review): the body of this guard is not visible; presumably URLs
// without a path component are skipped.
if(!isset($parsed['path'])) {
// Falls back to an md5 of the URL when the path has no base name.
$filename = basename($parsed['path']) ?: md5($url);
$extension = pathinfo($filename, PATHINFO_EXTENSION);
// Only download paths whose extension looks like a static asset (images, CSS, JS).
// The comparison is case-sensitive, so e.g. "JPG" is not in the list.
// NOTE(review): the body of this guard is not visible either.
if(!in_array($extension, ['jpg', 'jpeg', 'png', 'gif', 'svg', 'css', 'js', 'webp'])) {
$assets_details[$url] = [
'url' => $url,
'extension' => $extension,
'filename' => $filename,
// NOTE(review): parseArguments() already appends "/" to the downloads
// path, so this yields a double slash; also two URLs sharing a base name
// map to the same local file.
'download_path' => WRITE_IMAGES_TO_DIRECTORY . '/' . $filename,
// URL => local path for the assets whose local file does not exist yet.
$url_to_path = [];
foreach($assets_details as $url => $details) {
if (!file_exists($details['download_path'])) {
$url_to_path[$url] = $details['download_path'];
// Rewrite the URLs in the WXR file
$input_stream = fopen(WXR_PATH, 'rb+');
// Second pass: the output goes to standard output.
$output_stream = fopen('php://stdout', 'wb+');
$normalizer = new WP_WXR_Normalizer(
function ($url) use($assets_details) {
// Selected assets are pointed at NEW_ASSETS_PREFIX + their file name.
if(isset($assets_details[$url])) {
return NEW_ASSETS_PREFIX . $assets_details[$url]['filename'];
$parsed = parse_url($url);
// URLs lacking a scheme or a host are returned untouched.
if(!isset($parsed['host']) || !isset($parsed['scheme'])) {
return $url;
// Everything else keeps its path/query/fragment but takes the scheme and
// host of NEW_ORIGIN.
$parsed_origin = parse_url(NEW_ORIGIN);
$parsed['scheme'] = $parsed_origin['scheme'];
$parsed['host'] = $parsed_origin['host'];
return serialize_url($parsed);
// Downloads each URL in $url_to_path to its mapped local file using the
// curl multi interface.
//
// $url_to_path: array of remote URL => local file path.
//
// NOTE(review): several lines of this function appear to be missing from this
// copy: closing braces, and nothing visible increments $active_handles, closes
// the $fp file pointers, or closes the curl handles. The comments below
// describe only what is visible — confirm against the original.
function download_assets($url_to_path) {
$mh = curl_multi_init();
// Transfers registered with $mh, keyed by (int) of their curl handle.
$handles = [];
// Number of queued transfers at which the queue is drained (see below).
$window_size = 10;
$active_handles = 0;
foreach ($url_to_path as $url => $local_path) {
// Initialize curl handle
$ch = curl_init($url);
// NOTE(review): the fopen() result is not checked before being handed to curl.
$fp = fopen($local_path, 'w');
// Write the response body straight into the local file, without headers.
curl_setopt($ch, CURLOPT_FILE, $fp);
curl_setopt($ch, CURLOPT_HEADER, 0);
// Add handle to multi-handle
curl_multi_add_handle($mh, $ch);
// NOTE(review): since PHP 8.0 curl_init() returns a CurlHandle object, and
// casting an object to int does not give a unique id — spl_object_id($ch)
// would; verify which PHP versions this must run on.
$handles[(int) $ch] = ['handle' => $ch, 'fp' => $fp];
// When window_size is reached, execute handles
if ($active_handles == $window_size) {
// Kick off the transfers.
do {
$execrun = curl_multi_exec($mh, $running);
} while ($execrun == CURLM_CALL_MULTI_PERFORM);
// Keep driving the transfers while any are still running.
while ($running && $execrun == CURLM_OK) {
if (curl_multi_select($mh) == -1) {
do {
$execrun = curl_multi_exec($mh, $running);
} while ($execrun == CURLM_CALL_MULTI_PERFORM);
// Detach every finished transfer from the multi handle and forget it.
while ($done = curl_multi_info_read($mh)) {
$handle = $done['handle'];
$fp = $handles[(int) $handle]['fp'];
curl_multi_remove_handle($mh, $handle);
unset($handles[(int) $handle]);
// Process any remaining handles
do {
$execrun = curl_multi_exec($mh, $running);
} while ($execrun == CURLM_CALL_MULTI_PERFORM);
while ($running && $execrun == CURLM_OK) {
if (curl_multi_select($mh) == -1) {
do {
$execrun = curl_multi_exec($mh, $running);
} while ($execrun == CURLM_CALL_MULTI_PERFORM);
while ($done = curl_multi_info_read($mh)) {
$handle = $done['handle'];
$fp = $handles[(int) $handle]['fp'];
curl_multi_remove_handle($mh, $handle);
unset($handles[(int) $handle]);
* WordPress compat
/**
 * Minimal stand-in for WordPress' esc_attr() so this script can run without
 * loading WordPress itself.
 *
 * Escapes &, <, > and both quote characters. ENT_QUOTES is required here:
 * with ENT_XML1 alone htmlspecialchars() leaves quotes untouched, and an
 * unescaped quote would terminate a quoted attribute value early.
 *
 * @param string $text Raw attribute value.
 * @return string Value that is safe to place inside a quoted attribute.
 */
function esc_attr($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_XML1, 'UTF-8');
}
/**
 * Reassembles a URL from the component array produced by parse_url().
 *
 * Every component is optional, including 'host' (previously read without a
 * guard), so a partial array such as ['path' => '/x'] serializes without an
 * "undefined array key" warning.
 *
 * @param array $parsedUrl Components keyed as in parse_url(): scheme, user,
 *                         pass, host, port, path, query, fragment.
 * @return string The reassembled URL.
 */
function serialize_url($parsedUrl) {
    $url = '';

    if (isset($parsedUrl['scheme'])) {
        $url .= $parsedUrl['scheme'] . '://';
    }

    if (isset($parsedUrl['user'])) {
        $url .= $parsedUrl['user'];
        // A password is only meaningful together with a user name.
        if (isset($parsedUrl['pass'])) {
            $url .= ':' . $parsedUrl['pass'];
        }
        $url .= '@';
    }

    if (isset($parsedUrl['host'])) {
        $url .= $parsedUrl['host'];
    }
    if (isset($parsedUrl['port'])) {
        $url .= ':' . $parsedUrl['port'];
    }
    if (isset($parsedUrl['path'])) {
        $url .= $parsedUrl['path'];
    }
    if (isset($parsedUrl['query'])) {
        $url .= '?' . $parsedUrl['query'];
    }
    if (isset($parsedUrl['fragment'])) {
        $url .= '#' . $parsedUrl['fragment'];
    }

    return $url;
}
// Streams a WXR document from an input stream to an output stream, recording
// the URLs it finds in the text it inspects and replacing each one with the
// value returned by a caller-supplied callback.
class WP_WXR_Normalizer
// Passed to WP_XML_Processor::stream_tokens() as its first argument.
private $input_stream;
// Passed to WP_XML_Processor::stream_tokens() as its second argument.
private $output_stream;
// Callable invoked with each matched URL; its return value replaces the URL.
private $rewrite_url_callback;
// URLs seen so far, stored as array keys so each distinct URL is kept once.
private $found_urls = array();
// NOTE(review): the parameter list is not visible in this copy; the
// assignments below imply $input_stream, $output_stream and
// $rewrite_url_callback — confirm their order against the original.
public function __construct(
) {
$this->input_stream = $input_stream;
$this->output_stream = $output_stream;
$this->rewrite_url_callback = $rewrite_url_callback;
/**
 * Lists the distinct URLs recorded so far.
 *
 * URLs are tracked as array keys internally, so the returned list contains
 * each one once.
 *
 * @return array List of URL strings.
 */
public function get_found_urls()
{
    $distinct_urls = array_keys($this->found_urls);

    return $distinct_urls;
}
// Walks the tokens of the input document and runs URL processing on the
// text and CDATA nodes found anywhere inside an <item> element.
//
// NOTE(review): the statement executed when the text changed, and all closing
// braces, are not visible in this copy — presumably the updated text is
// written back to the processor; confirm against the original.
public function process()
// NOTE(review): the meaning of the third argument (1000000) is not visible
// here — presumably a buffer/chunk size; check stream_tokens().
$tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000);
foreach ($tokens as $processor) {
if (
// Any token whose breadcrumbs contain 'item' qualifies; the narrower
// breadcrumb matches below are disabled.
in_array('item', $processor->get_breadcrumbs())
// $processor->matches_breadcrumbs(array('item', 'content:encoded')) ||
// $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) ||
// $processor->matches_breadcrumbs(array('wp:comment_content'))
) {
switch ($processor->get_token_type()) {
case '#text':
case '#cdata-section':
$text = $processor->get_modifiable_text();
$updated_text = $this->process_content_node($text);
// Only touch the node when processing actually changed its text.
if ($updated_text !== $text) {
/**
 * Rewrites the URLs in the text of a single content node.
 *
 * The text is first treated as HTML; when that yields nothing it is treated
 * as plain text. If neither attempt produces a string (for instance
 * preg_replace_callback() returns null on a PCRE failure), the original text
 * is returned unchanged so the caller never receives null or false in place
 * of the node's text.
 *
 * @param string $text Text of a text or CDATA node.
 * @return string The text with its URLs rewritten, or $text itself when it
 *                could not be processed.
 */
private function process_content_node($text)
{
    $as_html = $this->process_as_html($text);
    if (is_string($as_html)) {
        return $as_html;
    }

    $as_plaintext = $this->process_as_plaintext($text);
    if (is_string($as_plaintext)) {
        return $as_plaintext;
    }

    return $text;
}
/**
 * Rewrites the URLs found in an HTML fragment.
 *
 * Walks every token of the fragment and processes:
 *  - comments that are block delimiters carrying JSON attributes,
 *  - the string-valued attributes of tags,
 *  - text nodes.
 *
 * @param string $text HTML fragment.
 * @return string|false The updated HTML, or false when the fragment contains
 *                      no token at all.
 */
private function process_as_html($text) {
    $html = new WP_HTML_Tag_Processor($text);
    if (false === $html->next_token()) {
        return false;
    }

    do {
        switch ($html->get_token_type()) {
            case '#comment':
                $rewritten = $this->rewrite_block_comment($html->get_modifiable_text());
                if (null !== $rewritten) {
                    $this->set_modifiable_html_text($html, $rewritten);
                }
                break;

            case '#tag':
                $attribute_names = $html->get_attribute_names_with_prefix('');
                if (!$attribute_names) {
                    break;
                }
                foreach ($attribute_names as $attribute_name) {
                    $value = $html->get_attribute($attribute_name);
                    // Boolean attributes are reported as true; running them
                    // through the string rewriter would turn them into "1".
                    if (!is_string($value)) {
                        continue;
                    }
                    $updated = $this->process_as_plaintext($value);
                    if (is_string($updated) && $updated !== $value) {
                        $html->set_attribute($attribute_name, $updated);
                    }
                }
                break;

            case '#text':
                $node_text = $html->get_modifiable_text();
                $updated_text = $this->process_as_plaintext($node_text);
                if (is_string($updated_text) && $updated_text !== $node_text) {
                    $this->set_modifiable_html_text($html, $updated_text);
                }
                break;
        }
    } while ($html->next_token());

    return $html->get_updated_html();
}

/**
 * Rewrites the URLs inside the JSON attributes of a block delimiter comment.
 *
 * The block parser cannot be used here: it parses blocks but offers no way
 * to rewrite the markup, so the delimiter is taken apart by hand as
 * `wp:` + block name + JSON attributes + optional `/` void marker.
 *
 * The text before and after the JSON is preserved byte-for-byte; the block
 * grammar requires whitespace between the attributes and the end of the
 * comment, so dropping it would stop the comment from being a block.
 *
 * NOTE(review): attributes are decoded as associative arrays, so an empty
 * JSON object inside attributes that do change is re-encoded as `[]`.
 *
 * @param string $text Inner text of an HTML comment.
 * @return string|null The new comment text, or null when the comment is not
 *                     a block delimiter with attributes or nothing changed.
 */
private function rewrite_block_comment($text) {
    // Double quotes matter: in a single-quoted string "\t" would be a
    // backslash followed by the letter t, not a tab.
    $whitespace = " \t\f\r\n";
    // Includes "/" so namespaced names such as "wp:my-plugin/gallery" match.
    $block_name_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/';

    $at = strspn($text, $whitespace);
    if ($at + 3 >= strlen($text) || 'wp:' !== substr($text, $at, 3)) {
        return null;
    }
    $at += 3;
    $at += strspn($text, $block_name_chars, $at); // Block name
    $at += strspn($text, $whitespace, $at);       // Whitespace again

    // Find where the JSON ends: before trailing whitespace and, for void
    // blocks, before the "/" marker and the whitespace preceding it.
    $json_end = strlen(rtrim($text, $whitespace));
    if ($json_end > $at && '/' === $text[$json_end - 1]) {
        $json_end = strlen(rtrim(substr($text, 0, $json_end - 1), $whitespace));
    }
    if ($json_end <= $at) {
        // There were no attributes, or this wasn't a block.
        return null;
    }

    $attributes = json_decode(substr($text, $at, $json_end - $at), true);
    if (null === $attributes) {
        // This wasn't a block after all.
        return null;
    }

    $new_attributes = $this->process_block_attributes($attributes);
    if ($new_attributes === $attributes) {
        // Nothing was rewritten; keep the original serialization untouched.
        return null;
    }

    $encoded = json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP);
    if (false === $encoded) {
        return null;
    }

    return substr($text, 0, $at) . $encoded . substr($text, $json_end);
}
/**
 * Recursively rewrites the URLs inside decoded block attributes.
 *
 * Strings are run through the plain-text URL rewriter, arrays are processed
 * element by element with their keys preserved, and every other value is
 * returned as it is.
 *
 * @param mixed $attributes Decoded attribute value (scalar, array or null).
 * @return mixed The same structure with its strings rewritten.
 */
private function process_block_attributes($attributes)
{
    if (is_string($attributes)) {
        return $this->process_as_plaintext($attributes);
    }

    if (!is_array($attributes)) {
        return $attributes;
    }

    // With a single input array, array_map() keeps the original keys.
    return array_map(array($this, 'process_block_attributes'), $attributes);
}
* @TODO: Investigate how bad this is – would it stand the test of time, or do we need
* a proper URL-matching state machine?
// Pattern body (without delimiters) for substrings that look like URLs: an
// http://, https:// or www. prefix followed by host-like and path-like
// characters, anchored on word boundaries.
const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b';
// Records every URL-looking match in $this->found_urls and replaces it with
// the value returned by the rewrite callback.
//
// NOTE(review): the pattern and subject arguments of preg_replace_callback()
// are not visible in this copy — presumably a delimited self::URL_REGEXP and
// $text; confirm against the original.
private function process_as_plaintext($text) {
return preg_replace_callback(
function ($matches) {
// Stored as a key so a URL that occurs many times is recorded once.
$this->found_urls[$matches[0]] = true;
$replacer = $this->rewrite_url_callback;
return $replacer($matches[0]);
// Replaces the text of the token $p currently points at, by queueing a
// WP_HTML_Text_Replacement in the processor's lexical_updates property, which
// is reached through reflection.
//
// Returns true when a replacement was queued and false when it was refused.
//
// NOTE(review): many lines of this method are missing from this copy: the
// offset/length arguments of both WP_HTML_Text_Replacement calls, the `if (`
// openers, the call that wraps the __() messages, and closing braces. The
// comments describe only what is visible — confirm against the original.
private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value) {
$reflection = new ReflectionClass('WP_HTML_Tag_Processor');
// Handles for the processor properties named text_starts_at, text_length and
// lexical_updates.
$accessible_text_starts_at = $reflection->getProperty('text_starts_at');
$accessible_text_length = $reflection->getProperty('text_length');
$lexical_updates = $reflection->getProperty('lexical_updates');
switch ( $p->get_token_type() ) {
case '#text':
// Read the queued updates, append one, and store the list back.
$lexical_updates_now = $lexical_updates->getValue($p);
$lexical_updates_now[] = new WP_HTML_Text_Replacement(
// Text nodes are escaped before being written into the markup.
htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
$lexical_updates->setValue($p, $lexical_updates_now);
return true;
case '#comment':
case '#cdata-section':
// Refuse comment text that contains a comment closer.
$p->get_token_type() === '#comment' && (
strpos($new_value, '-->') !== false ||
strpos($new_value, '--!>') !== false
) {
// NOTE(review): __() is not defined anywhere in the visible part of this
// file; verify it exists before this branch can run.
__( 'Cannot set a comment closer as a text of an HTML comment.' ),
return false;
// Refuse CDATA-lookalike text that contains ">".
$p->get_token_type() === '#cdata-section' &&
strpos($new_value, '>') !== false
) {
__( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ),
return false;
$lexical_updates_now = $lexical_updates->getValue($p);
$lexical_updates_now[] = new WP_HTML_Text_Replacement(
$lexical_updates->setValue($p, $lexical_updates_now);
return true;
// Reached for token types not handled above.
__( 'Cannot set text content on a non-text node.' ),
return false;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment