greg-randall/gtn-brgr-upgrdr.php

## gtn-brgr-upgrdr.php
<?php
/**
 * Plugin Name:     Gutenburger Upgrader
 * Author:          Greg R.
 * Version:         0.0.1
 */

/*

This plugin tries to clean html up so that it can be directly converted into gutenberg as blocks.
You'll lose formatting but generally you can fix that quickly after the cleaning.

This plugin edits pages directly, on the fly. Pull backups first, and DO NOT USE ON A PRODUCTION SITE.

Use:
Make a new page, add the shortcode '[list_gutenberg_pages]' to the page.
Load the page, open each page and edit link in new tabs.
Check to make sure the page isn't super broken, if it is fix it in the edit link.

Then click 'Classic' at the top of the editor screen, then click 'Convert to blocks', then 'Update'.

Enjoy your Gutenberged Page!

*/


/*
 ini_set('display_errors', 1);
 ini_set('display_startup_errors', 1);
 error_reporting(E_ALL);
*/


/**
 * Shortcode handler for [list_gutenberg_pages].
 *
 * Walks every page, runs the content of each non-block page through
 * gtn_brgr_cleaner(), saves the result, and returns an HTML report of the
 * pages that were converted. Conversion stops once the per-request time
 * budget is used up; pages that still need converting are counted so the
 * shortcode page can simply be reloaded to continue.
 *
 * @return string HTML report: timing summary, remaining count, list of converted pages.
 */
function list_gutenberg_pages(){

    // This handler rewrites stored content, so it must not run for visitors who cannot edit pages.
    if ( !current_user_can( 'edit_pages' ) ) {
        return "You do not have permission to convert pages.";
    }

    $time_budget = 5; // seconds of conversion work allowed per page load

    $args = array(
        'post_type' => array( 'page' ),
        'order' => 'ASC',
        'orderby' => 'title',
        "posts_per_page" => -1
        );
    $the_query = new WP_Query( $args ); // get all the pages

    $output = "<ul>\n";

    $total_time = microtime(true); // keeping track of total duration of processing
    $page_count = 0; // pages that still need converting after this run

    while ( $the_query->have_posts() ) { // loop through all the pages
        $the_query->the_post(); // set the page up for processing

        $content = get_the_content();

        // skip block pages, trashed pages, and pages with (almost) no content
        if ( has_blocks() || get_post_status() === 'trash' || strlen(trim($content)) <= 2 ) {
            continue;
        }

        // out of time: leave this page for the next reload, but count it as remaining
        if ( (microtime(true) - $total_time) > $time_budget ) {
            $page_count++;
            continue;
        }

        $start = microtime(true); // keeping track of the duration of processing for each page
        $cleaned = gtn_brgr_cleaner($content);

        // never overwrite a page with nothing if the cleaning failed
        if ( !is_string($cleaned) || trim($cleaned) === '' ) {
            $page_count++;
            continue;
        }

        // Only the content is sent: wp_update_post() keeps every other field as stored, so the
        // display-filtered title is not written back. wp_slash() is needed because
        // wp_update_post() expects slashed data and would otherwise strip backslashes.
        wp_update_post( array(
            'ID' => get_the_ID(),
            'post_content' => wp_slash( $cleaned ),
        ) );

        // making an easy to read list of the pages that were updated
        $edit_url = admin_url( 'post.php?post=' . get_the_ID() . '&action=edit' );
        $output .= '<li><a href="' . esc_url( get_permalink() ) . '">' . esc_html( get_the_title() ) . "</a> ("
            . round((microtime(true) - $start), 2) . "s)<br>&nbsp;&nbsp;&nbsp;<a href=\"" . esc_url( $edit_url ) . "\">Edit</a></li>\n";
    }

    // the shortcode runs inside another page's loop; hand the global post back to it
    wp_reset_postdata();

    $output .= "</ul>";
    return ("Total Time: ".round((microtime(true) - $total_time),2)." seconds<br>\nTotal Non-Gutenberg Pages Remaining: $page_count<br>\n<hr>". $output); // return the list of pages updated
}
add_shortcode('list_gutenberg_pages', 'list_gutenberg_pages');


////////////////////////////////
// cleaner
////////////////////////////////


//mostly scooped from  https://gist.github.com/greg-randall/05adf2268c82c89543c159bc2742fce7
/**
 * Strips legacy HTML down to plain structural markup so the Gutenberg
 * "Convert to blocks" action can digest it.
 *
 * Steps, in order: normalise fancy quotes/dashes, repair escaped comment
 * markers, run wpautop(), entity-encode text through DOMDocument, run the
 * remote beautifier, normalise exotic spaces, then (in the DOM) drop comments,
 * drop every attribute not on the allow-list, unwrap layout-only tags, prune
 * empty elements, and finally demote h1 to h2 and beautify again.
 *
 * If beautify_html() does not return a usable string, the locally cleaned
 * markup is kept instead of being replaced by nothing. At the last step that
 * fallback is DOMDocument's full-document serialisation.
 *
 * @param string $html Markup to clean (a fragment or a full document).
 * @return string Cleaned markup.
 */
function gtn_brgr_cleaner($html){

    /* configuration */
    $allowed_attribute = [
        // attributes to keep on the html i.e. <a href="www.asdf.com">
        "content", "http-equiv", "src", "href", "alt", "colspan", "rowspan", "id",
    ];
    $tags_to_remove = [
        // tags to unwrap (their children are kept); aside & article trip up the automatic import in wordpress gutenberg
        "div", "span", "figure", "font", "section", "aside", "article", "header",
    ];

    $remove_fancy_quotes = true; // changes curly quotes and some similar stuff to ' and "
    $remove_fancy_spaces = true; // changes &nbsp; &thinsp; etc to a regular space.
    $remove_fancy_dashes = true; // changes EM dashes, EN dashes, etc to regular dashes
    $remove_empy_td = false; // keeps or removes empty table cells
    $convert_chars_to_entities = true; // converts characters to their html entity equivalent i.e. & to &amp;
    $run_wordpress_paragraph_tag_adder = true;

    /* end configuration */

    $html = (string) $html;
    if (trim($html) === '') {
        return $html; // nothing to clean, and DOMDocument::loadHTML() rejects empty input
    }

    // Literal characters are written as UTF-8 byte escapes so this file survives copy/paste and re-encoding.
    // Replacements are case-sensitive: &prime; (single) and &Prime; (double) are different entities.
    if ($remove_fancy_quotes) {
        $html = str_replace(
            [   "&iuml;&iquest;&frac12;", // sometimes apostrophes turn into this mojibake
                "&lsquo;", "&rsquo;", "&#8216;", "&#8217;", "&apos;", "&prime;", "&#8242;",
                "\xE2\x80\x99", // U+2019 right single quote
                "\xE2\x80\x98", // U+2018 left single quote
                "\xE2\x80\xB2", // U+2032 prime
                "`",],
            "'",
            $html
        ); // change curly single quotes to regular
        $html = str_replace(
            [   "&ldquo;", "&rdquo;", "&#8220;", "&#8221;", "&quot;", "&QUOT;", "&Prime;", "&#8243;",
                "\xE2\x80\x9D", // U+201D right double quote
                "\xE2\x80\x9C", // U+201C left double quote
                "\xE2\x80\xB3", // U+2033 double prime
                "''",],
            '"',
            $html
        ); // change curly double quotes to regular
    }
    if ($remove_fancy_dashes) {
        $html = str_replace(
            [   "&#8208;", "&#8209;", "&#8210;", "&#8211;", "&#8212;", "&#8213;", "&ndash;", "&mdash;",
                "\xE2\x80\x90", // U+2010 hyphen
                "\xE2\x80\x91", // U+2011 non-breaking hyphen
                "\xE2\x80\x92", // U+2012 figure dash
                "\xE2\x80\x93", // U+2013 en dash
                "\xE2\x80\x94", // U+2014 em dash
                "\xE2\x80\x95",], // U+2015 horizontal bar
            "-",
            $html
        ); // change fancy dashes to a regular hyphen
    }

    // escaped or texturized comment markers -> real comment markers (the comments are deleted in the DOM pass)
    $comment_search = array("&lt;!--", "--&gt;", "<!\xE2\x80\x93", "\xE2\x80\x93>", "&lt;!&#8211;", "&#8211;&gt;");
    $comment_replace = array("<!--", "-->", "<!--", "-->", "<!--", "-->");
    $html = str_ireplace($comment_search, $comment_replace, $html);

    $html = str_ireplace("<p>&nbsp;</p>", "<br>", $html); // change empty paragraphs to a line break

    // determine if the input is a full html document or not, gets passed to the dirtymarkup cleaning below
    $html_fragment = (strpos($html, "<html") !== false) ? "full" : "fragment";

    if ($run_wordpress_paragraph_tag_adder) {
        $html = wpautop($html);
    }

    if ($convert_chars_to_entities) {
        // encodes characters into html entities, but only in the text
        // DOMDocument is noisy on real-world markup; collect its errors internally instead of
        // lowering error_reporting() for the rest of the request.
        $libxml_previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        if ($doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD)) {
            $encoded = $doc->saveHTML();
            if (is_string($encoded) && trim($encoded) !== '') {
                $html = $encoded;
            }
        }
        libxml_clear_errors();
        libxml_use_internal_errors($libxml_previous);
    }

    // run the html cleaner before processing -- it fixes some html errors that would make the cleaning less effective
    $beautified = beautify_html($html, $html_fragment);
    if (is_string($beautified) && trim($beautified) !== '') {
        $html = $beautified;
    }

    // the empty-tag removal below treats '<p></p>' and '<p> </p>' as empty but not '<p>&nbsp;</p>';
    // turning every exotic space into a plain one first solves that
    // note we won't remove tags like '<a href="www.asdf.com"></a>'
    if ($remove_fancy_spaces) {
        $html = str_replace(
            [   "&#8192;", "&#8193;", "&#8194;", "&#8195;", "&#8196;", "&#8197;", "&#8198;",
                "&#8199;", "&#8200;", "&#8201;", "&#8202;", "&#8203;", "&#160;",
                "&ensp;", "&emsp;", "&thinsp;", "&nbsp;",
                "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", // U+2000 - U+2003
                "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", // U+2004 - U+2007
                "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\x8B", // U+2008 - U+200B
                "\xC2\xA0",], // U+00A0 no-break space
            " ",
            $html
        ); // change spaces to regular spaces.
        // catches any stragglers, and collapses runs of whitespace the previous step may have created
        $collapsed = preg_replace("/\s+/", " ", $html);
        if ($collapsed !== null) {
            $html = $collapsed;
        }
    }

    // this is a bit kludgy: the empty-tag removal below would also delete empty table cells,
    // so they get a placeholder. '~~..~~' is used since it's unlikely to appear in actual text
    if (!$remove_empy_td) {
        $marked = preg_replace('/> ?<\/(td|th)>/', '>~~..~~</$1>', $html);
        if ($marked !== null) {
            $html = $marked;
        }
    }

    $libxml_previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    if (!$dom->loadHTML($html)) {
        // unparseable: hand back what we have rather than an empty document
        libxml_clear_errors();
        libxml_use_internal_errors($libxml_previous);
        return str_replace("~~..~~", "", $html);
    }
    $xpath = new DOMXPath($dom);

    // remove comments
    foreach ($xpath->query('//comment()') as $comment) {
        $comment->parentNode->removeChild($comment);
    }

    // strip every attribute that is not on the allow-list
    foreach ($xpath->query("//*") as $element) {
        // loops through the attributes backwards because removing one shifts the ones after it
        for ($i = $element->attributes->length; --$i >= 0; ) {
            $name = $element->attributes->item($i)->name;
            if (!in_array($name, $allowed_attribute, true)) {
                $element->removeAttribute($name);
            }
        }
    }

    // builds one xpath union such as "//div | //span | ..." from the configured tag names
    $unwrap_paths = array();
    foreach ($tags_to_remove as $tag) {
        $unwrap_paths[] = "//" . $tag;
    }
    $unwrap_query = implode(" | ", $unwrap_paths);

    // unwrap the layout-only tags: move their children up in front of them, then delete the empty shell
    foreach ($xpath->query($unwrap_query) as $remove) {
        while ($remove->hasChildNodes()) {
            $child = $remove->removeChild($remove->firstChild);
            $remove->parentNode->insertBefore($child, $remove);
        }
        $remove->parentNode->removeChild($remove);
    }

    // removes empty tags, repeating until none are left (removing a child can empty its parent)
    // not(*) does not have children elements
    // not(@*) does not have attributes
    // not(text()[normalize-space()]) has no text other than whitespace
    // br and hr are empty by nature and are kept; the empty paragraphs were turned into <br> on purpose
    $empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br) and not(self::hr)]";
    while (
        ($node_list = $xpath->query($empty_query)) &&
        $node_list->length
    ) {
        foreach ($node_list as $node) {
            $node->parentNode->removeChild($node);
        }
    }

    $clean = $dom->saveHTML();
    libxml_clear_errors();
    libxml_use_internal_errors($libxml_previous);

    // this removes the placeholder text in empty th/td
    if (!$remove_empy_td) {
        $clean = str_replace("~~..~~", "", $clean);
    }

    $clean = str_ireplace($comment_search, $comment_replace, $clean); // change escaped comment markers to regular comments

    $clean = str_ireplace(array("<h1", "h1>"), array("<h2", "h2>"), $clean); // demote every h1 to h2

    $beautified = beautify_html($clean, $html_fragment);
    return (is_string($beautified) && trim($beautified) !== '') ? $beautified : $clean;
}
/**
 * Sends markup to the remote dirtymarkup HTML API and returns the tidied result.
 *
 * The last element of the decoded JSON response is returned. If the request
 * fails, or the response is not a JSON array ending in a non-empty string,
 * the input is returned unchanged so callers never receive null.
 *
 * @param string $html          Markup to tidy.
 * @param string $html_fragment Value for the API's "output" field; the caller passes "full" or "fragment".
 * @return string Tidied markup, or $html unchanged when the service could not be used.
 */
function beautify_html($html, $html_fragment)
{
    $url = "https://www.10bestdesign.com/dirtymarkup/api/html";
    $context = stream_context_create([
        "http" => [
            "method" => "POST",
            "header" => "Content-type: application/x-www-form-urlencoded",
            "content" => http_build_query([
                "code" => $html,
                "output" => $html_fragment,
            ]),
            "timeout" => 60,
        ],
    ]);

    $resp = file_get_contents($url, false, $context);
    if ($resp === false) {
        return $html; // network error, timeout, or allow_url_fopen disabled
    }

    $decoded = json_decode($resp, true);
    if (!is_array($decoded) || count($decoded) === 0) {
        return $html; // not the JSON payload we expect
    }

    $tidied = array_pop($decoded);
    return (is_string($tidied) && trim($tidied) !== '') ? $tidied : $html;
}
	<?php
	/**
	* Plugin Name: Gutenburger Upgrader
	* Author: Greg R.
	* Version: 0.0.1
	*/

	/*

	This plugin tries to clean html up so that it can be directly converted into gutenberg as blocks.
	You'll lose formatting but generally you can fix that quickly after the cleaning.

	This plugin edits pages directly, on the fly. Pull backups first, and DO NOT USE ON A PRODUCTION SITE.

	Use:
	Make a new page, add the shortcode '[list_gutenberg_pages]' to the page.
	Load the page, open each page and edit link in new tabs.
	Check to make sure the page isn't super broken, if it is fix it in the edit link.

	Then click 'Classic' at the top of the editor screen, then click 'Convert to blocks', then 'Update'.

	Enjoy your Gutenberged Page!

	*/


	/*
	ini_set('display_errors', 1);
	ini_set('display_startup_errors', 1);
	error_reporting(E_ALL);
	*/


	/**
	 * Shortcode handler for [list_gutenberg_pages].
	 *
	 * Walks every page, runs the content of each non-block page through
	 * gtn_brgr_cleaner(), saves the result, and returns an HTML report of the
	 * pages that were converted. Conversion stops once the per-request time
	 * budget is used up; pages that still need converting are counted so the
	 * shortcode page can simply be reloaded to continue.
	 *
	 * @return string HTML report: timing summary, remaining count, list of converted pages.
	 */
	function list_gutenberg_pages(){

	    // This handler rewrites stored content, so it must not run for visitors who cannot edit pages.
	    if ( !current_user_can( 'edit_pages' ) ) {
	        return "You do not have permission to convert pages.";
	    }

	    $time_budget = 5; // seconds of conversion work allowed per page load

	    $args = array(
	        'post_type' => array( 'page' ),
	        'order' => 'ASC',
	        'orderby' => 'title',
	        "posts_per_page" => -1
	        );
	    $the_query = new WP_Query( $args ); // get all the pages

	    $output = "<ul>\n";

	    $total_time = microtime(true); // keeping track of total duration of processing
	    $page_count = 0; // pages that still need converting after this run

	    while ( $the_query->have_posts() ) { // loop through all the pages
	        $the_query->the_post(); // set the page up for processing

	        $content = get_the_content();

	        // skip block pages, trashed pages, and pages with (almost) no content
	        if ( has_blocks() || get_post_status() === 'trash' || strlen(trim($content)) <= 2 ) {
	            continue;
	        }

	        // out of time: leave this page for the next reload, but count it as remaining
	        if ( (microtime(true) - $total_time) > $time_budget ) {
	            $page_count++;
	            continue;
	        }

	        $start = microtime(true); // keeping track of the duration of processing for each page
	        $cleaned = gtn_brgr_cleaner($content);

	        // never overwrite a page with nothing if the cleaning failed
	        if ( !is_string($cleaned) || trim($cleaned) === '' ) {
	            $page_count++;
	            continue;
	        }

	        // Only the content is sent: wp_update_post() keeps every other field as stored, so the
	        // display-filtered title is not written back. wp_slash() is needed because
	        // wp_update_post() expects slashed data and would otherwise strip backslashes.
	        wp_update_post( array(
	            'ID' => get_the_ID(),
	            'post_content' => wp_slash( $cleaned ),
	        ) );

	        // making an easy to read list of the pages that were updated
	        $edit_url = admin_url( 'post.php?post=' . get_the_ID() . '&action=edit' );
	        $output .= '<li><a href="' . esc_url( get_permalink() ) . '">' . esc_html( get_the_title() ) . "</a> ("
	            . round((microtime(true) - $start), 2) . "s)<br>&nbsp;&nbsp;&nbsp;<a href=\"" . esc_url( $edit_url ) . "\">Edit</a></li>\n";
	    }

	    // the shortcode runs inside another page's loop; hand the global post back to it
	    wp_reset_postdata();

	    $output .= "</ul>";
	    return ("Total Time: ".round((microtime(true) - $total_time),2)." seconds<br>\nTotal Non-Gutenberg Pages Remaining: $page_count<br>\n<hr>". $output); // return the list of pages updated
	}
	add_shortcode('list_gutenberg_pages', 'list_gutenberg_pages');




	////////////////////////////////
	// cleaner
	////////////////////////////////



	//mostly scooped from https://gist.github.com/greg-randall/05adf2268c82c89543c159bc2742fce7
	/**
	 * Strips legacy HTML down to plain structural markup so the Gutenberg
	 * "Convert to blocks" action can digest it.
	 *
	 * Steps, in order: normalise fancy quotes/dashes, repair escaped comment
	 * markers, run wpautop(), entity-encode text through DOMDocument, run the
	 * remote beautifier, normalise exotic spaces, then (in the DOM) drop comments,
	 * drop every attribute not on the allow-list, unwrap layout-only tags, prune
	 * empty elements, and finally demote h1 to h2 and beautify again.
	 *
	 * If beautify_html() does not return a usable string, the locally cleaned
	 * markup is kept instead of being replaced by nothing. At the last step that
	 * fallback is DOMDocument's full-document serialisation.
	 *
	 * @param string $html Markup to clean (a fragment or a full document).
	 * @return string Cleaned markup.
	 */
	function gtn_brgr_cleaner($html){

	    /* configuration */
	    $allowed_attribute = [
	        // attributes to keep on the html i.e. <a href="www.asdf.com">
	        "content", "http-equiv", "src", "href", "alt", "colspan", "rowspan", "id",
	    ];
	    $tags_to_remove = [
	        // tags to unwrap (their children are kept); aside & article trip up the automatic import in wordpress gutenberg
	        "div", "span", "figure", "font", "section", "aside", "article", "header",
	    ];

	    $remove_fancy_quotes = true; // changes curly quotes and some similar stuff to ' and "
	    $remove_fancy_spaces = true; // changes &nbsp; &thinsp; etc to a regular space.
	    $remove_fancy_dashes = true; // changes EM dashes, EN dashes, etc to regular dashes
	    $remove_empy_td = false; // keeps or removes empty table cells
	    $convert_chars_to_entities = true; // converts characters to their html entity equivalent i.e. & to &amp;
	    $run_wordpress_paragraph_tag_adder = true;

	    /* end configuration */

	    $html = (string) $html;
	    if (trim($html) === '') {
	        return $html; // nothing to clean, and DOMDocument::loadHTML() rejects empty input
	    }

	    // Literal characters are written as UTF-8 byte escapes so this file survives copy/paste and re-encoding.
	    // Replacements are case-sensitive: &prime; (single) and &Prime; (double) are different entities.
	    if ($remove_fancy_quotes) {
	        $html = str_replace(
	            [   "&iuml;&iquest;&frac12;", // sometimes apostrophes turn into this mojibake
	                "&lsquo;", "&rsquo;", "&#8216;", "&#8217;", "&apos;", "&prime;", "&#8242;",
	                "\xE2\x80\x99", // U+2019 right single quote
	                "\xE2\x80\x98", // U+2018 left single quote
	                "\xE2\x80\xB2", // U+2032 prime
	                "`",],
	            "'",
	            $html
	        ); // change curly single quotes to regular
	        $html = str_replace(
	            [   "&ldquo;", "&rdquo;", "&#8220;", "&#8221;", "&quot;", "&QUOT;", "&Prime;", "&#8243;",
	                "\xE2\x80\x9D", // U+201D right double quote
	                "\xE2\x80\x9C", // U+201C left double quote
	                "\xE2\x80\xB3", // U+2033 double prime
	                "''",],
	            '"',
	            $html
	        ); // change curly double quotes to regular
	    }
	    if ($remove_fancy_dashes) {
	        $html = str_replace(
	            [   "&#8208;", "&#8209;", "&#8210;", "&#8211;", "&#8212;", "&#8213;", "&ndash;", "&mdash;",
	                "\xE2\x80\x90", // U+2010 hyphen
	                "\xE2\x80\x91", // U+2011 non-breaking hyphen
	                "\xE2\x80\x92", // U+2012 figure dash
	                "\xE2\x80\x93", // U+2013 en dash
	                "\xE2\x80\x94", // U+2014 em dash
	                "\xE2\x80\x95",], // U+2015 horizontal bar
	            "-",
	            $html
	        ); // change fancy dashes to a regular hyphen
	    }

	    // escaped or texturized comment markers -> real comment markers (the comments are deleted in the DOM pass)
	    $comment_search = array("&lt;!--", "--&gt;", "<!\xE2\x80\x93", "\xE2\x80\x93>", "&lt;!&#8211;", "&#8211;&gt;");
	    $comment_replace = array("<!--", "-->", "<!--", "-->", "<!--", "-->");
	    $html = str_ireplace($comment_search, $comment_replace, $html);

	    $html = str_ireplace("<p>&nbsp;</p>", "<br>", $html); // change empty paragraphs to a line break

	    // determine if the input is a full html document or not, gets passed to the dirtymarkup cleaning below
	    $html_fragment = (strpos($html, "<html") !== false) ? "full" : "fragment";

	    if ($run_wordpress_paragraph_tag_adder) {
	        $html = wpautop($html);
	    }

	    if ($convert_chars_to_entities) {
	        // encodes characters into html entities, but only in the text
	        // DOMDocument is noisy on real-world markup; collect its errors internally instead of
	        // lowering error_reporting() for the rest of the request.
	        $libxml_previous = libxml_use_internal_errors(true);
	        $doc = new DOMDocument();
	        if ($doc->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD)) {
	            $encoded = $doc->saveHTML();
	            if (is_string($encoded) && trim($encoded) !== '') {
	                $html = $encoded;
	            }
	        }
	        libxml_clear_errors();
	        libxml_use_internal_errors($libxml_previous);
	    }

	    // run the html cleaner before processing -- it fixes some html errors that would make the cleaning less effective
	    $beautified = beautify_html($html, $html_fragment);
	    if (is_string($beautified) && trim($beautified) !== '') {
	        $html = $beautified;
	    }

	    // the empty-tag removal below treats '<p></p>' and '<p> </p>' as empty but not '<p>&nbsp;</p>';
	    // turning every exotic space into a plain one first solves that
	    // note we won't remove tags like '<a href="www.asdf.com"></a>'
	    if ($remove_fancy_spaces) {
	        $html = str_replace(
	            [   "&#8192;", "&#8193;", "&#8194;", "&#8195;", "&#8196;", "&#8197;", "&#8198;",
	                "&#8199;", "&#8200;", "&#8201;", "&#8202;", "&#8203;", "&#160;",
	                "&ensp;", "&emsp;", "&thinsp;", "&nbsp;",
	                "\xE2\x80\x80", "\xE2\x80\x81", "\xE2\x80\x82", "\xE2\x80\x83", // U+2000 - U+2003
	                "\xE2\x80\x84", "\xE2\x80\x85", "\xE2\x80\x86", "\xE2\x80\x87", // U+2004 - U+2007
	                "\xE2\x80\x88", "\xE2\x80\x89", "\xE2\x80\x8A", "\xE2\x80\x8B", // U+2008 - U+200B
	                "\xC2\xA0",], // U+00A0 no-break space
	            " ",
	            $html
	        ); // change spaces to regular spaces.
	        // catches any stragglers, and collapses runs of whitespace the previous step may have created
	        $collapsed = preg_replace("/\s+/", " ", $html);
	        if ($collapsed !== null) {
	            $html = $collapsed;
	        }
	    }

	    // this is a bit kludgy: the empty-tag removal below would also delete empty table cells,
	    // so they get a placeholder. '~~..~~' is used since it's unlikely to appear in actual text
	    if (!$remove_empy_td) {
	        $marked = preg_replace('/> ?<\/(td|th)>/', '>~~..~~</$1>', $html);
	        if ($marked !== null) {
	            $html = $marked;
	        }
	    }

	    $libxml_previous = libxml_use_internal_errors(true);
	    $dom = new DOMDocument();
	    if (!$dom->loadHTML($html)) {
	        // unparseable: hand back what we have rather than an empty document
	        libxml_clear_errors();
	        libxml_use_internal_errors($libxml_previous);
	        return str_replace("~~..~~", "", $html);
	    }
	    $xpath = new DOMXPath($dom);

	    // remove comments
	    foreach ($xpath->query('//comment()') as $comment) {
	        $comment->parentNode->removeChild($comment);
	    }

	    // strip every attribute that is not on the allow-list
	    foreach ($xpath->query("//*") as $element) {
	        // loops through the attributes backwards because removing one shifts the ones after it
	        for ($i = $element->attributes->length; --$i >= 0; ) {
	            $name = $element->attributes->item($i)->name;
	            if (!in_array($name, $allowed_attribute, true)) {
	                $element->removeAttribute($name);
	            }
	        }
	    }

	    // builds one xpath union such as "//div | //span | ..." from the configured tag names
	    $unwrap_paths = array();
	    foreach ($tags_to_remove as $tag) {
	        $unwrap_paths[] = "//" . $tag;
	    }
	    $unwrap_query = implode(" | ", $unwrap_paths);

	    // unwrap the layout-only tags: move their children up in front of them, then delete the empty shell
	    foreach ($xpath->query($unwrap_query) as $remove) {
	        while ($remove->hasChildNodes()) {
	            $child = $remove->removeChild($remove->firstChild);
	            $remove->parentNode->insertBefore($child, $remove);
	        }
	        $remove->parentNode->removeChild($remove);
	    }

	    // removes empty tags, repeating until none are left (removing a child can empty its parent)
	    // not(*) does not have children elements
	    // not(@*) does not have attributes
	    // not(text()[normalize-space()]) has no text other than whitespace
	    // br and hr are empty by nature and are kept; the empty paragraphs were turned into <br> on purpose
	    $empty_query = "//*[not(*) and not(@*) and not(text()[normalize-space()]) and not(self::br) and not(self::hr)]";
	    while (
	        ($node_list = $xpath->query($empty_query)) &&
	        $node_list->length
	    ) {
	        foreach ($node_list as $node) {
	            $node->parentNode->removeChild($node);
	        }
	    }

	    $clean = $dom->saveHTML();
	    libxml_clear_errors();
	    libxml_use_internal_errors($libxml_previous);

	    // this removes the placeholder text in empty th/td
	    if (!$remove_empy_td) {
	        $clean = str_replace("~~..~~", "", $clean);
	    }

	    $clean = str_ireplace($comment_search, $comment_replace, $clean); // change escaped comment markers to regular comments

	    $clean = str_ireplace(array("<h1", "h1>"), array("<h2", "h2>"), $clean); // demote every h1 to h2

	    $beautified = beautify_html($clean, $html_fragment);
	    return (is_string($beautified) && trim($beautified) !== '') ? $beautified : $clean;
	}
	/**
	 * Sends markup to the remote dirtymarkup HTML API and returns the tidied result.
	 *
	 * The last element of the decoded JSON response is returned. If the request
	 * fails, or the response is not a JSON array ending in a non-empty string,
	 * the input is returned unchanged so callers never receive null.
	 *
	 * @param string $html          Markup to tidy.
	 * @param string $html_fragment Value for the API's "output" field; the caller passes "full" or "fragment".
	 * @return string Tidied markup, or $html unchanged when the service could not be used.
	 */
	function beautify_html($html, $html_fragment)
	{
	    $url = "https://www.10bestdesign.com/dirtymarkup/api/html";
	    $context = stream_context_create([
	        "http" => [
	            "method" => "POST",
	            "header" => "Content-type: application/x-www-form-urlencoded",
	            "content" => http_build_query([
	                "code" => $html,
	                "output" => $html_fragment,
	            ]),
	            "timeout" => 60,
	        ],
	    ]);

	    $resp = file_get_contents($url, false, $context);
	    if ($resp === false) {
	        return $html; // network error, timeout, or allow_url_fopen disabled
	    }

	    $decoded = json_decode($resp, true);
	    if (!is_array($decoded) || count($decoded) === 0) {
	        return $html; // not the JSON payload we expect
	    }

	    $tidied = array_pop($decoded);
	    return (is_string($tidied) && trim($tidied) !== '') ? $tidied : $html;
	}