Skip to content

Instantly share code, notes, and snippets.

@adamziel
Last active June 6, 2024 14:39
Show Gist options
  • Save adamziel/ebe64e6ec6195e7f1c8d100e2f206f91 to your computer and use it in GitHub Desktop.
Save adamziel/ebe64e6ec6195e7f1c8d100e2f206f91 to your computer and use it in GitHub Desktop.
A foundation for a better WXR importer
<?php
/**
* Rewrites URLs in a WXR file while keeping track of the URLs found.
*
* This is a huge deal! It unlocks fast, streamed, resumable, fault-tolerant
* WordPress data migrations through WXR files AND directly between sites.
*
* In particular, this script:
*
* * Lists all the URLs found in the XML document
* * Rewrites the domain found in each URL while considering the context
* in which it was found (text nodes, cdata, block attributes, HTML attributes, HTML text)
*
* With these, we can:
*
* * Stream-process the WXR export
* * Pipe from ZipStreamReader [1] to stream-read directly from a zip
* * Pipe from AsyncHttp\Client [3] to stream-read directly from a remote data source
* * Start downloading the assets upfront with a configurable degree of parallelization
* * Pipe the rewritten output to another WXR file
* * Pipe to ZipStreamWriter [2] to stream-write directly to a zip
* * Pipe to AsyncHttp\Client [3] to stream-write directly to a remote data source
*
* [1] ZipStreamReader https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamReader.php
* [2] ZipStreamWriter: https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamWriter.php
* [3] AsyncHttpClient: https://github.com/WordPress/blueprints-library/blob/trunk/src/WordPress/AsyncHttp/Client.php
*/
// Base directory of the WordPress `wp-includes` sources. The require_once
// statements in this file load the HTML API classes relative to it.
define('WP_INCLUDES', __DIR__.'/..');
// Where to find the streaming WP_XML_Processor in WP_INCLUDES/html-api/
// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43
define('WP_XML_API_PATH', __DIR__.'/../html-api');
// Don't change anything below
// This one isn't supported yet but will be:
// define('REPLACE_ALL_DOMAINS_WITH', 'https://stylish-press.wordpress.org');
// URL prefix that replaces the location of every URL recognized as an asset:
// the rewritten URL is this prefix followed by the asset's file name.
define('NEW_ASSETS_PREFIX', 'https://raw.githubusercontent.com/wordpress/blueprints/stylish-press/blueprints/stylish-press/images/');
// Local directory the discovered assets are downloaded into.
define('WRITE_IMAGES_TO_DIRECTORY', __DIR__ . '/downloads');
require_once WP_INCLUDES . "/html-api/class-wp-html-token.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-span.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-text-replacement.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-decoder.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-attribute-token.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-decoder.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-tag-processor.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-open-elements.php";
require_once WP_INCLUDES . "/class-wp-token-map.php";
require_once WP_INCLUDES . "/html-api/html5-named-character-references.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-active-formatting-elements.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-active-formatting-elements.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-processor-state.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-unsupported-exception.php";
require_once WP_INCLUDES . "/html-api/class-wp-html-processor.php";
require_once WP_XML_API_PATH . "/class-wp-xml-decoder.php";
require_once WP_XML_API_PATH . "/class-wp-xml-tag-processor.php";
require_once WP_XML_API_PATH . "/class-wp-xml-processor.php";
// Sample WXR document used as the input of both passes below.
// It deliberately carries URLs in several different contexts: plain XML text
// nodes (<link>, <guid>), and – inside the CDATA of <content:encoded> – a block
// comment whose JSON attributes use \uXXXX escapes, an HTML `style` attribute
// and an `src` attribute written with numeric character references that lack
// the trailing semicolon, and an HTML text node.
$wxr = <<<WXR
<?xml version="1.0" encoding="UTF-8"?>
<!-- generator="WordPress/3.0.4" created="2011-01-25 06:54"-->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
>
<channel>
<link>http://longbets.org</link>
<item>
<title>Connect</title>
<link>http://www.old-site.com/contact-us</link>
<guid isPermaLink="false">http://www.oldsite.wordpress.org/?p=5</guid>
<content:encoded><![CDATA[
<!-- wp:cover {"url":"https:\/\/pd\u002Ew\u002Eorg\u002F2024\/06\/398665facdd5a0ea5.69441088-1536x1370.jpeg--\u003e","dimRatio":50,"overlayColor":"black"} -->
<div class="wp-block-cover" style="background-image:url(https:&#47&#47pd&#46w&#46org/2024/06/3986&amp5facdd5a0ea5.69441088-1536x1370.jpeg);background-size:cover;height:400px;">
https:&#47&#47pd&#46w&#46org/test.jpg?a&ampbcdefg
<img src="https:&#47&#47pd&#46w&#46org/2024/06/398665fa.jpg" aria-hidden="true" class="wp-block-cover__background has-black-background-color has-background-dim"></span>
<span aria-hidden="true" class="wp-block-cover__background has-black-background-color has-background-dim"></span>
<div class="wp-block-cover__inner-container">
<!-- wp:heading {"textAlign":"center","level":1,"style":{"typography":{"fontSize":"48px"}}} -->
<h1 class="has-text-align-center" style="font-size:48px;color:#fff">Connect with Us</h1>
<!-- /wp:heading -->
</div>
</div>
]]></content:encoded>
</item>
</channel>
</rss>
WXR;
/*
Output:
<?xml version="1.0" encoding="UTF-8"?>
<!-- generator="WordPress/3.0.4" created="2011-01-25 06:54"-->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
>
<channel>
<link>http://stylish-press.wordpress.org</link>
<item>
<title>Connect</title>
<link>http://stylish-press.wordpress.org/contact-us</link>
<guid isPermaLink="false">http://stylish-press.wordpress.org/?p=5</guid>
<content:encoded><![CDATA[
<!-- wp:cover {"url":"https:\/\/stylish-press.wordpress.org\/2024\/06\/398665facdd5a0ea5.69441088-1536x1370.jpeg","dimRatio":50,"overlayColor":"black"}-->
<div class="wp-block-cover" style="background-image:url(https://stylish-press.wordpress.org/2024/06/3986&amp;amp5facdd5a0ea5.69441088-1536x1370.jpeg);background-size:cover;height:400px;">
https://stylish-press.wordpress.org/test.jpg?a&amp;bcdefg
<img src="https://stylish-press.wordpress.org/2024/06/398665fa.jpg" aria-hidden="true" class="wp-block-cover__background has-black-background-color has-background-dim"></span>
<span aria-hidden="true" class="wp-block-cover__background has-black-background-color has-background-dim"></span>
<div class="wp-block-cover__inner-container">
<!-- wp:heading {"textAlign":"center","level":1,"style":{"typography":{"fontSize":"48px"}}}-->
<h1 class="has-text-align-center" style="font-size:48px;color:#fff">Connect with Us</h1>
<!-- /wp:heading -->
</div>
</div>
]]></content:encoded>
</item>
</channel>
</rss>
Array
(
[http://longbets.org] => 1
[http://www.old-site.com/contact-us] => 1
[http://www.oldsite.wordpress.org/?p=5] => 1
[https://pd.w.org/2024/06/398665facdd5a0ea5.69441088-1536x1370.jpeg] => 1
[https://pd.w.org/2024/06/3986&amp5facdd5a0ea5.69441088-1536x1370.jpeg] => 1
[https://pd.w.org/test.jpg?a&bcdefg] => 1
[https://pd.w.org/2024/06/398665fa.jpg] => 1
)
*/
// ---------------------------------------------------------------------------
// Pass 1: gather all the URLs from the WXR document.
//
// The rewrite callback returns every URL unchanged and the output goes to a
// throwaway in-memory stream, so this pass only records which URLs exist.
// ---------------------------------------------------------------------------
$input_stream = fopen('php://memory', 'rwb+');
fwrite($input_stream, $wxr);
rewind($input_stream);
// $output_stream = fopen('php://stdout', 'wb+');
$output_stream = fopen('php://memory', 'wb+');
$normalizer = new WP_WXR_Normalizer(
    $input_stream,
    $output_stream,
    function ($url) { return $url; }
);
$normalizer->process();
fclose($input_stream);
fclose($output_stream);

echo "\n\n";
$urls = $normalizer->get_found_urls();
print_r($urls);

// ---------------------------------------------------------------------------
// Download the URLs that look like static assets.
// ---------------------------------------------------------------------------
// File extensions treated as assets: images plus stylesheets and scripts.
$asset_extensions = ['jpg', 'jpeg', 'png', 'gif', 'svg', 'css', 'js', 'webp'];
$assets_details = [];
foreach ($urls as $url) {
    $parsed = parse_url($url);
    if (!isset($parsed['path'])) {
        continue;
    }
    $filename = basename($parsed['path']) ?: md5($url);
    $extension = pathinfo($filename, PATHINFO_EXTENSION);
    // Compare case-insensitively so that e.g. "photo.JPG" is recognized too.
    if (!in_array(strtolower($extension), $asset_extensions, true)) {
        continue;
    }
    $assets_details[$url] = [
        'url' => $url,
        'extension' => $extension,
        'filename' => $filename,
        'download_path' => WRITE_IMAGES_TO_DIRECTORY . '/' . $filename,
    ];
}

// Only fetch what is not on disk yet, which makes re-running the script cheap.
$url_to_path = [];
foreach ($assets_details as $url => $details) {
    if (!file_exists($details['download_path'])) {
        $url_to_path[$url] = $details['download_path'];
    }
}
// The target directory does not exist on a fresh checkout; without it every
// download would fail when its destination file is opened.
if ($url_to_path && !is_dir(WRITE_IMAGES_TO_DIRECTORY)) {
    mkdir(WRITE_IMAGES_TO_DIRECTORY, 0777, true);
}
download_assets($url_to_path);

// ---------------------------------------------------------------------------
// Pass 2: rewrite the WXR document and print it to stdout.
//
// URLs recognized as assets are pointed at NEW_ASSETS_PREFIX; every other URL
// is left as it was.
// ---------------------------------------------------------------------------
$input_stream = fopen('php://memory', 'rwb+');
fwrite($input_stream, $wxr);
rewind($input_stream);
$output_stream = fopen('php://stdout', 'wb+');
$normalizer = new WP_WXR_Normalizer(
    $input_stream,
    $output_stream,
    function ($url) use ($assets_details) {
        if (isset($assets_details[$url])) {
            return NEW_ASSETS_PREFIX . $assets_details[$url]['filename'];
        }
        return $url;
    }
);
$normalizer->process();
fclose($input_stream);
fclose($output_stream);
/**
 * Downloads a set of URLs to local files, running a bounded number of
 * transfers in parallel through the curl "multi" interface.
 *
 * Transfers are processed in batches: once `$window_size` handles are queued,
 * the function waits for all of them to finish before queueing more.
 *
 * Destinations whose parent directory cannot be created or whose file cannot
 * be opened are skipped (PHP emits the corresponding warning). A transfer that
 * fails – including an HTTP status >= 400 – has its partial file removed so
 * that a later run does not mistake it for a completed download.
 *
 * @param array $url_to_path Map of source URL => destination file path.
 * @param int   $window_size Maximum number of concurrent transfers (at least 1).
 * @return void
 */
function download_assets($url_to_path, $window_size = 10) {
    $window_size = max(1, (int) $window_size);
    $mh = curl_multi_init();
    $handles = [];

    // curl handles are resources before PHP 8 and objects from PHP 8 on.
    // Casting an object to int does not give a unique id, so derive the
    // bookkeeping key in a way that works for both representations.
    $handle_key = function ($ch) {
        return is_object($ch) ? spl_object_hash($ch) : (int) $ch;
    };

    // Runs every queued transfer to completion and releases its resources.
    $drain = function () use ($mh, &$handles, $handle_key) {
        do {
            $execrun = curl_multi_exec($mh, $running);
        } while ($execrun == CURLM_CALL_MULTI_PERFORM);
        while ($running && $execrun == CURLM_OK) {
            if (curl_multi_select($mh) == -1) {
                // select() could not wait; back off briefly instead of spinning.
                usleep(100);
            }
            do {
                $execrun = curl_multi_exec($mh, $running);
            } while ($execrun == CURLM_CALL_MULTI_PERFORM);
        }
        while ($done = curl_multi_info_read($mh)) {
            $handle = $done['handle'];
            $key = $handle_key($handle);
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
            if (!isset($handles[$key])) {
                continue;
            }
            fclose($handles[$key]['fp']);
            if ($done['result'] !== CURLE_OK && file_exists($handles[$key]['path'])) {
                // Don't leave a truncated or empty file behind.
                unlink($handles[$key]['path']);
            }
            unset($handles[$key]);
        }
    };

    foreach ($url_to_path as $url => $local_path) {
        $directory = dirname($local_path);
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            continue;
        }
        $fp = fopen($local_path, 'w');
        if (false === $fp) {
            continue;
        }
        // Initialize curl handle
        $ch = curl_init($url);
        if (false === $ch) {
            fclose($fp);
            unlink($local_path);
            continue;
        }
        curl_setopt($ch, CURLOPT_FILE, $fp);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        // Treat HTTP errors as failures rather than saving the error page
        // under the asset's file name.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        // Add handle to multi-handle
        curl_multi_add_handle($mh, $ch);
        $handles[$handle_key($ch)] = ['handle' => $ch, 'fp' => $fp, 'path' => $local_path];

        // When window_size is reached, execute handles
        if (count($handles) >= $window_size) {
            $drain();
        }
    }

    // Process any remaining handles
    $drain();
    curl_multi_close($mh);
}
/**
* WordPress compat
*/
/**
 * Minimal stand-in for WordPress' esc_attr(): escapes a string for use inside
 * a quoted HTML/XML attribute value.
 *
 * ENT_QUOTES must be part of the flags: ENT_XML1 on its own leaves both quote
 * characters untouched, so a value containing `"` would terminate the
 * attribute it is written into.
 *
 * @param string $text Raw (decoded) attribute value.
 * @return string Value with &, <, >, " and ' replaced by character references.
 */
function esc_attr($text) {
    return htmlspecialchars($text, ENT_QUOTES | ENT_XML1, 'UTF-8');
}
/**
 * Reassembles a URL string from the components produced by parse_url().
 *
 * Every component is optional, including `host`: parse_url() omits it for
 * relative URLs, so it has to be guarded like the others to avoid an
 * undefined-index warning.
 *
 * @param array $parsedUrl Components keyed as in parse_url(): scheme, user,
 *                         pass, host, port, path, query, fragment.
 * @return string The components joined back into a URL.
 */
function serialize_url($parsedUrl) {
    $url = '';
    if (isset($parsedUrl['scheme'])) {
        $url .= $parsedUrl['scheme'] . '://';
    }
    if (isset($parsedUrl['user'])) {
        $url .= $parsedUrl['user'];
        // A password is only meaningful together with a user name.
        if (isset($parsedUrl['pass'])) {
            $url .= ':' . $parsedUrl['pass'];
        }
        $url .= '@';
    }
    if (isset($parsedUrl['host'])) {
        $url .= $parsedUrl['host'];
    }
    if (isset($parsedUrl['port'])) {
        $url .= ':' . $parsedUrl['port'];
    }
    if (isset($parsedUrl['path'])) {
        $url .= $parsedUrl['path'];
    }
    if (isset($parsedUrl['query'])) {
        $url .= '?' . $parsedUrl['query'];
    }
    if (isset($parsedUrl['fragment'])) {
        $url .= '#' . $parsedUrl['fragment'];
    }
    return $url;
}
/**
 * Copies a WXR document from an input stream to an output stream while
 * recording every URL found in it and passing each one through a
 * caller-supplied rewrite callback.
 *
 * Text and CDATA nodes below <channel> are first treated as HTML: URLs are
 * looked for in block comment attributes (JSON), in HTML attribute values and
 * in HTML text. Content that yields no HTML token is scanned as plain text.
 */
class WP_WXR_Normalizer
{
    /**
     * @TODO: Investigate how bad this is – would it stand the test of time, or do we need
     * a proper URL-matching state machine?
     */
    const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b';

    /** @var resource Stream the WXR document is read from. */
    private $input_stream;

    /** @var resource Stream the (possibly rewritten) document is written to. */
    private $output_stream;

    /** @var callable Receives a URL string and returns its replacement. */
    private $rewrite_url_callback;

    /** @var array Set of URLs seen so far, stored as keys. */
    private $found_urls = array();

    /**
     * @param resource $input_stream         Stream to read the WXR document from.
     * @param resource $output_stream        Stream to write the result to.
     * @param callable $rewrite_url_callback function (string $url): string
     */
    public function __construct(
        $input_stream,
        $output_stream,
        $rewrite_url_callback
    ) {
        $this->input_stream = $input_stream;
        $this->output_stream = $output_stream;
        $this->rewrite_url_callback = $rewrite_url_callback;
    }

    /**
     * @return array List of the distinct URLs encountered so far, in order of
     *               first appearance.
     */
    public function get_found_urls()
    {
        return array_keys($this->found_urls);
    }

    /**
     * Walks the tokens of the input document and rewrites the URLs found in
     * every text / CDATA node that has <channel> among its ancestors.
     *
     * @return void
     */
    public function process()
    {
        $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000);
        foreach ($tokens as $processor) {
            // A narrower selection was considered but is not enabled:
            // $processor->matches_breadcrumbs(array('item', 'content:encoded')) ||
            // $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) ||
            // $processor->matches_breadcrumbs(array('wp:comment_content'))
            if (!in_array('channel', $processor->get_breadcrumbs())) {
                continue;
            }
            switch ($processor->get_token_type()) {
                case '#text':
                case '#cdata-section':
                    $text = $processor->get_modifiable_text();
                    $updated_text = $this->process_content_node($text);
                    if ($updated_text !== $text) {
                        $processor->set_modifiable_text($updated_text);
                    }
                    break;
            }
        }
    }

    /**
     * Rewrites the URLs in the content of a single XML text / CDATA node.
     *
     * @param string $text Decoded node content.
     * @return string The content with URLs rewritten; identical to `$text`
     *                when nothing had to change.
     */
    private function process_content_node($text)
    {
        $result = $this->process_as_html($text);
        if (false !== $result) {
            return $result;
        }
        // No HTML token was found: fall back to a plain-text scan, which
        // always yields a string.
        return $this->process_as_plaintext($text);
    }

    /**
     * Treats `$text` as HTML and rewrites the URLs found in its block
     * comments, tag attributes and text nodes.
     *
     * @param string $text HTML markup.
     * @return string|false Updated markup, or false when the markup contains
     *                      no token at all.
     */
    private function process_as_html($text)
    {
        $html = new WP_HTML_Tag_Processor($text);
        if (false === $html->next_token()) {
            return false;
        }
        do {
            switch ($html->get_token_type()) {
                case '#comment':
                    $this->rewrite_block_comment($html);
                    break;
                case '#tag':
                    $this->rewrite_tag_attributes($html);
                    break;
                case '#text':
                    $node_text = $html->get_modifiable_text();
                    $updated_text = $this->process_as_plaintext($node_text);
                    if ($updated_text !== $node_text) {
                        $this->set_modifiable_html_text($html, $updated_text);
                    }
                    break;
            }
        } while ($html->next_token());
        return $html->get_updated_html();
    }

    /**
     * Rewrites URLs inside the JSON attributes of a block opener comment such
     * as `<!-- wp:cover {"url":"..."} -->` or `<!-- wp:image {...} /-->`.
     *
     * The block parser won't cut it here because while it can parse blocks,
     * it has no semantics for rewriting the block markup, so the comment is
     * picked apart by hand. Comments that are not block openers, have no
     * attributes, or whose attributes are not valid JSON are left alone, and
     * the comment is only touched when a URL actually changed.
     *
     * @param WP_HTML_Tag_Processor $html Processor paused on a #comment token.
     * @return void
     */
    private function rewrite_block_comment(WP_HTML_Tag_Processor $html)
    {
        // Must be double-quoted: in single quotes "\t" would be the two
        // characters backslash and "t" rather than a tab.
        $whitespace = " \t\f\r\n";
        $text = $html->get_modifiable_text();
        $length = strlen($text);

        $at = strspn($text, $whitespace);
        if (!($at + 3 < $length && 'wp:' === substr($text, $at, 3))) {
            return;
        }
        $at += 3;
        // Block name, including the "namespace/" part of third-party blocks.
        $at += strspn($text, 'abcdefghijklmnopqrstuwxvyzABCDEFGHIJKLMNOPRQSTUWXVYZ0123456789_-/', $at);
        $at += strspn($text, $whitespace, $at);
        if ($at >= $length) {
            // Oh, there were no attributes or this wasn't a block.
            // Either way, we have nothing more to do here.
            return;
        }

        // Split what follows the name into the JSON document and a suffix:
        // trailing whitespace plus the optional "/" that marks a void block.
        // The suffix is re-attached verbatim, which keeps the whitespace
        // WordPress' block grammar expects between "}" and the comment closer.
        $rest = substr($text, $at);
        $json_maybe = rtrim($rest, $whitespace);
        if ('/' === substr($json_maybe, -1)) {
            $json_maybe = rtrim(substr($json_maybe, 0, -1), $whitespace);
        }
        $suffix = (string) substr($rest, strlen($json_maybe));

        // Decode to objects (not associative arrays) so that "{}" and "[]"
        // stay distinguishable when the attributes are encoded again.
        $attributes = json_decode($json_maybe);
        if (null === $attributes) {
            // This wasn't a block after all, let's move on
            return;
        }

        $flags = JSON_HEX_TAG | JSON_HEX_AMP | JSON_PRESERVE_ZERO_FRACTION;
        $before = json_encode($attributes, $flags);
        $after = json_encode($this->process_block_attributes($attributes), $flags);
        if (false === $before || false === $after || $before === $after) {
            // Either encoding failed or no URL changed: keep the original bytes.
            return;
        }
        $this->set_modifiable_html_text($html, substr($text, 0, $at) . $after . $suffix);
    }

    /**
     * Rewrites URLs found in the attribute values of the current tag.
     *
     * @param WP_HTML_Tag_Processor $html Processor paused on a #tag token.
     * @return void
     */
    private function rewrite_tag_attributes(WP_HTML_Tag_Processor $html)
    {
        $attribute_names = $html->get_attribute_names_with_prefix('');
        if (!$attribute_names) {
            return;
        }
        foreach ($attribute_names as $attribute_name) {
            $value = $html->get_attribute($attribute_name);
            if (!is_string($value)) {
                // Boolean attributes come back as `true`; running them through
                // the text rewriter would turn them into the string "1".
                continue;
            }
            $updated = $this->process_as_plaintext($value);
            if ($updated !== $value) {
                $html->set_attribute($attribute_name, $updated);
            }
        }
    }

    /**
     * Recursively rewrites URLs in decoded block attributes.
     *
     * @param mixed $attributes A decoded JSON value: string, array, object or
     *                          any other scalar.
     * @return mixed A value of the same shape with URLs in strings rewritten.
     *               Objects are rebuilt rather than modified in place.
     */
    private function process_block_attributes($attributes)
    {
        if (is_string($attributes)) {
            return $this->process_as_plaintext($attributes);
        }
        if (is_array($attributes)) {
            $new_attributes = array();
            foreach ($attributes as $key => $value) {
                $new_attributes[$key] = $this->process_block_attributes($value);
            }
            return $new_attributes;
        }
        if (is_object($attributes)) {
            $new_attributes = array();
            foreach (get_object_vars($attributes) as $key => $value) {
                $new_attributes[$key] = $this->process_block_attributes($value);
            }
            return (object) $new_attributes;
        }
        return $attributes;
    }

    /**
     * Records every URL found in `$text` and replaces it with the result of
     * the rewrite callback.
     *
     * @param string $text Plain text to scan.
     * @return string The text with URLs replaced. If the regular expression
     *                engine reports an error the input is returned unchanged.
     */
    private function process_as_plaintext($text)
    {
        $result = preg_replace_callback(
            '~' . self::URL_REGEXP . '~',
            function ($matches) {
                $this->found_urls[$matches[0]] = true;
                $replacer = $this->rewrite_url_callback;
                return $replacer($matches[0]);
            },
            $text
        );
        // preg_replace_callback() yields null on failure (e.g. hitting the
        // backtrack limit); never hand that on as if it were content.
        return null === $result ? $text : $result;
    }

    /**
     * Replaces the text of the current #text, #comment or #cdata-section token
     * of an HTML tag processor.
     *
     * WP_HTML_Tag_Processor exposes no setter for this, so the replacement is
     * queued by appending to the processor's non-public `lexical_updates`
     * list through reflection.
     *
     * @param WP_HTML_Tag_Processor $p         Processor paused on the token.
     * @param string                $new_value New (decoded) text.
     * @return bool True when the update was queued, false when it was refused.
     */
    private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value)
    {
        switch ($p->get_token_type()) {
            case '#text':
                $replacement = htmlspecialchars($new_value, ENT_XML1, 'UTF-8');
                break;
            case '#comment':
                if (
                    strpos($new_value, '-->') !== false ||
                    strpos($new_value, '--!>') !== false
                ) {
                    $this->report_misuse(
                        __METHOD__,
                        'Cannot set a comment closer as a text of an HTML comment.'
                    );
                    return false;
                }
                $replacement = $new_value;
                break;
            case '#cdata-section':
                if (strpos($new_value, '>') !== false) {
                    $this->report_misuse(
                        __METHOD__,
                        'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.'
                    );
                    return false;
                }
                $replacement = $new_value;
                break;
            default:
                $this->report_misuse(
                    __METHOD__,
                    'Cannot set text content on a non-text node.'
                );
                return false;
        }

        $reflection = new ReflectionClass('WP_HTML_Tag_Processor');
        $text_starts_at = $reflection->getProperty('text_starts_at');
        $text_starts_at->setAccessible(true);
        $text_length = $reflection->getProperty('text_length');
        $text_length->setAccessible(true);
        $lexical_updates = $reflection->getProperty('lexical_updates');
        $lexical_updates->setAccessible(true);

        $pending_updates = $lexical_updates->getValue($p);
        $pending_updates[] = new WP_HTML_Text_Replacement(
            $text_starts_at->getValue($p),
            $text_length->getValue($p),
            $replacement
        );
        $lexical_updates->setValue($p, $pending_updates);
        return true;
    }

    /**
     * Reports incorrect usage through WordPress' _doing_it_wrong() when it is
     * available, and through a plain PHP notice when this file runs outside
     * of WordPress, where that function (and `__()`) is not defined.
     *
     * @param string $method  Name of the method that was misused.
     * @param string $message Explanation of what went wrong.
     * @return void
     */
    private function report_misuse($method, $message)
    {
        if (function_exists('_doing_it_wrong')) {
            _doing_it_wrong(
                $method,
                function_exists('__') ? __($message) : $message,
                'WP_VERSION'
            );
            return;
        }
        trigger_error($method . ': ' . $message, E_USER_NOTICE);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment