Skip to content

Instantly share code, notes, and snippets.

@vishalkakadiya
Last active October 27, 2023 05:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vishalkakadiya/30aac46bcd3342e88ef8382ed041f048 to your computer and use it in GitHub Desktop.
Save vishalkakadiya/30aac46bcd3342e88ef8382ed041f048 to your computer and use it in GitHub Desktop.
WordPress Migration: Convert HTML doms into the common Gutenberg blocks
<?php
/**
* This snippet is useful when migrating content into WordPress.
* It converts HTML DOM nodes into the common Gutenberg block markup,
* so a tag such as a paragraph is wrapped as "<!-- wp:paragraph -->SOME DATA<!-- /wp:paragraph -->".
*
* This script supports paragraphs, headings, lists, and many more tags.
*/
/**
 * Parse the post content to convert to Gutenberg blocks.
 *
 * Iterates over the direct children of the given node and builds a string of
 * Gutenberg block markup. Elements that have children are handled by a
 * recursive call, passing their own opening/closing markup as the parent tag
 * strings so the recursive call wraps the converted children in it.
 *
 * @param \DOMNode $dom_node         DOMNode object whose child nodes are converted.
 * @param string   $parent_tag_start Parent tag content start string, prepended to the result.
 * @param string   $parent_tag_end   Parent tag content end string, appended to the result.
 *
 * @return string Converted markup; an empty string when the node has no child node list.
 */
function parse_dom_node( \DOMNode $dom_node, $parent_tag_start = '', $parent_tag_end = '' ) {
	$content = '';

	// Set when the parent tags were already placed around $content by a branch
	// below, so the final wrap at the end of the function must not run again.
	$skip_content = false;

	// phpcs:disable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
	if ( ! empty( $dom_node->childNodes ) ) {
		foreach ( $dom_node->childNodes as $node ) {
			if ( in_array( $node->nodeName, [ 'html', 'body', 'head' ], true ) ) {
				// Document scaffolding: descend without emitting any tag.
				$content .= $this->parse_dom_node( $node );
			} elseif ( '#text' === $node->nodeName && '' !== (string) $node->nodeValue ) {
				// Compare against the empty string instead of relying on truthiness,
				// otherwise a text node containing just "0" is silently dropped.
				// NOTE(review): nodeValue is entity-decoded text and is emitted as-is.
				$content .= $node->nodeValue;
			} elseif ( $node->hasChildNodes() ) {
				$block_start = '';
				$block_end   = '';
				$tag_attrs   = '';

				// Identify the tag names and convert to Gutenberg block markup.
				if ( 'p' === $node->nodeName ) {
					if ( 'blockquote' === $node->parentNode->nodeName && 'twitter-tweet' === $node->parentNode->getAttribute( 'class' ) ) {
						// Append (not assign) so content collected before this
						// paragraph is kept, as in every other pass-through branch.
						$content .= $this->parse_dom_node( $node );
						continue;
					} elseif ( 'blockquote' !== $node->parentNode->nodeName ) {
						$block_start = '<!-- wp:paragraph -->';
						$block_end   = '<!-- /wp:paragraph -->';
					}
				} elseif ( in_array( $node->nodeName, [ 'ol', 'ul' ], true ) ) {
					$block_start = '<!-- wp:list -->';
					$block_end   = '<!-- /wp:list -->';
				} elseif ( in_array( $node->nodeName, [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ], true ) ) {
					// The digit after "h" is the heading level; h2 gets the bare
					// heading comment without a level attribute.
					$level = (int) substr( $node->nodeName, 1 );

					if ( 2 !== $level ) {
						$block_start = '<!-- wp:heading {"level":' . $level . '} -->';
					} else {
						$block_start = '<!-- wp:heading -->';
					}

					$block_end = '<!-- /wp:heading -->';
				} elseif ( 'a' === $node->nodeName ) {
					$link = $node->getAttribute( 'href' );

					if ( 'p' === $node->parentNode->nodeName && 'img' === $node->firstChild->nodeName ) {
						// Linked image inside a paragraph: close the paragraph
						// around what was collected so far.
						$content      = $parent_tag_start . $content . $parent_tag_end;
						$skip_content = true;
					}

					if ( ! empty( $link ) ) {
						// Links with an href are flattened to their text content.
						$content = sprintf( '%1$s <a href="%2$s">%3$s</a>', $content, esc_url( $link ), $node->nodeValue );
						continue;
					}

					// Only reached for anchors without an href.
					if ( '_blank' === $node->getAttribute( 'target' ) ) {
						$tag_attrs .= ' rel="noreferrer noopener" aria-label=" (opens in a new tab)"';
					}
				} elseif ( 'blockquote' === $node->nodeName ) {
					if ( ! $node->nodeValue ) {
						$skip_content = true;
						continue;
					} elseif ( 'twitter-tweet' === $node->getAttribute( 'class' ) ) {
						// Tweet embeds are unwrapped: only their inner content is kept.
						$content .= $this->parse_dom_node( $node );
						continue;
					}

					$block_start = '<!-- wp:quote -->';
					$block_end   = '<!-- /wp:quote -->';
					$tag_attrs   = ' class="wp-block-quote"';
				} elseif ( 'pre' === $node->nodeName ) {
					$block_start = '<!-- wp:preformatted -->';
					$block_end   = '<!-- /wp:preformatted -->';
					$tag_attrs   = ' class="wp-block-preformatted"';
				} elseif ( 'object' === $node->nodeName ) {
					// Drop the <object> wrapper, keep whatever its children produce.
					$content .= $this->parse_dom_node( $node );
					continue;
				} elseif ( 'strong' === $node->nodeName ) {
					if ( 'p' === $node->firstChild->nodeName ) {
						// <strong> wrapping a paragraph: unwrap it and convert the paragraph.
						$content .= $this->parse_dom_node( $node );
						continue;
					}
				} elseif ( 'div' === $node->nodeName ) {
					$container_classes = $node->getAttribute( 'class' );
					$div_start         = '';
					$div_end           = '';

					if ( preg_match( '~(field-name-field-see-related)~', $container_classes ) ) {
						$div_start = '<!-- wp:paragraph -->';
						$div_end   = '<!-- /wp:paragraph -->';
					}

					// The <div> tag itself is never emitted.
					$content .= $this->parse_dom_node( $node, $div_start, $div_end );
					continue;
				}

				// Final content string.
				// NOTE(review): <ol> is emitted as <ul>, so list ordering is lost;
				// kept as in the original because it may be deliberate.
				$tag_name  = ( 'ol' === $node->nodeName ) ? 'ul' : $node->nodeName;
				$tag_start = "{$block_start}<{$tag_name}{$tag_attrs}>";
				$tag_end   = "</{$tag_name}>{$block_end}";

				$content .= $this->parse_dom_node( $node, $tag_start, $tag_end );
			} else {
				// Childless elements.
				$block_start = '';
				$block_end   = '';

				if ( 'script' === $node->nodeName ) {
					$src = $node->getAttribute( 'src' );

					// Only scripts whose src contains playbuzz.com are kept.
					if ( false !== strpos( $src, 'playbuzz.com' ) ) {
						// Emit an explicit </script> (a self-closing script tag is
						// not valid HTML) and close the wp:html block.
						$content .= '<!-- wp:html -->';
						$content .= '<script type="text/javascript" src="' . esc_url( $src ) . '"></script>';
						$content .= '<!-- /wp:html -->';
					}
				} elseif ( 'param' === $node->nodeName ) {
					continue;
				} elseif ( 'img' === $node->nodeName ) {
					$src            = $node->getAttribute( 'src' );
					$parsed_url     = wp_parse_url( $src );
					$img_url        = isset( $parsed_url['path'] ) ? $parsed_url['path'] : '';
					$download_image = [];

					// Only images whose src contains /media/ are downloaded.
					if ( false !== strpos( $src, '/media/' ) ) {
						$img_url        = str_replace( '/media/', $this->original_site_domain . '/media/', $img_url );
						$download_image = $this->download_inline_image( $img_url );
					}

					if ( ! empty( $download_image['url'] ) ) {
						// Escape attribute values so quotes in alt text or the URL
						// cannot break out of the attribute.
						$tag_attrs = ' src="' . esc_url( $download_image['url'] ) . '" alt="' . esc_attr( $node->getAttribute( 'alt' ) ) . '"';

						$block_start  = '<!-- wp:image -->';
						$block_start .= '<figure class="wp-block-image">';

						if ( 'p' === $node->parentNode->nodeName ) {
							// Close the surrounding paragraph before the image block.
							$content      = $parent_tag_start . $content . $parent_tag_end;
							$skip_content = true;
						} elseif ( 'a' === $node->parentNode->nodeName ) {
							// Keep the parent link markup inside the figure.
							$block_start .= $parent_tag_start;
							$block_end   .= $parent_tag_end;
							$skip_content = true;
						}

						$block_end .= '</figure>';
						$block_end .= '<!-- /wp:image -->';

						$content .= "{$block_start}<{$node->nodeName}{$tag_attrs} />{$block_end}";
					}
				} elseif ( 'hr' === $node->nodeName ) {
					$block_start = '<!-- wp:separator -->';
					$block_end   = '<!-- /wp:separator -->';
					$tag_attrs   = ' class="wp-block-separator"';

					$content .= "{$block_start}<{$node->nodeName}{$tag_attrs} />{$block_end}";
				}
			}
		}

		if ( ! $skip_content ) {
			$content = $parent_tag_start . $content . $parent_tag_end;
		}
	}
	// phpcs:enable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase

	return $content;
}
/**
 * Download inline image and return new path.
 *
 * Reuses an existing attachment when one is found for the URL; otherwise
 * downloads the file, sideloads it into the media library and records the
 * original URL as attachment meta.
 *
 * @param string $url The image URL.
 *
 * @return array Array with 'url' and 'id' keys on success, empty array on failure.
 */
function download_inline_image( $url ) {
	$url       = $this->get_img_url( $url );
	$attach_id = $this->get_media_id_from_url( $url );

	if ( ! $attach_id ) {
		// Download the image (15 second timeout).
		$tmp_file = download_url( $url, 15 );

		if ( is_wp_error( $tmp_file ) ) {
			\WP_CLI::log( "\nERROR: Unable to download image: {$url}" );

			return [];
		}

		$file = [
			'name'     => basename( $url ),
			'tmp_name' => $tmp_file,
			'error'    => 0,
			'size'     => filesize( $tmp_file ),
		];

		// Pass the post ID explicitly (0 = not attached to a post); the
		// parameter only became optional in WordPress 5.3.
		$attach_id = media_handle_sideload( $file, 0 );

		if ( is_wp_error( $attach_id ) ) {
			// The temp file is left behind when sideloading fails; remove it
			// so a long migration run does not fill the temp directory.
			if ( file_exists( $tmp_file ) ) {
				unlink( $tmp_file );
			}

			\WP_CLI::log( "\nERROR: Unable to sideload image: {$url} (" . $attach_id->get_error_message() . ')' );

			return [];
		}

		if ( empty( $attach_id ) ) {
			return [];
		}

		// Only newly created attachments get the original URL recorded.
		$this->set_media_meta( $attach_id, $url );
	}

	return [
		'url' => get_post_field( 'guid', $attach_id ),
		'id'  => $attach_id,
	];
}
/**
 * Store the original image URL on an attachment.
 *
 * Writes the URL to the `_original_image_url` post meta key of the attachment.
 *
 * @param int    $attach_id Attachment id.
 * @param string $url       Original URL.
 */
function set_media_meta( $attach_id, $url ) {
	$meta_key = '_original_image_url';

	update_post_meta( $attach_id, $meta_key, $url );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment