Skip to content

Instantly share code, notes, and snippets.

@zesda
Forked from lyquix-owner/cleanhtml.php
Created September 28, 2022 14:20
Show Gist options
  • Save zesda/b49b455badb2df46e28bd54c097da679 to your computer and use it in GitHub Desktop.
Save zesda/b49b455badb2df46e28bd54c097da679 to your computer and use it in GitHub Desktop.
PHP script to automatically clean dirty HTML. Removes unnecessary attributes (e.g. style, id, dir), replaces deprecated tags with valid ones (e.g. <b> to <strong>), and strips undesirable tags (e.g. <font>). We have used this script to safely clean hundreds of blog posts that were littered with inline styling.
<?php
// Configuration for the cleaning pipeline at the bottom of this script.
//
// List of tags to be replaced and their replacement.
// Keys are the tag names to look for, values are the tag names to emit instead.
$replace_tags = [
'i' => 'em',
'b' => 'strong'
];
// List of tags to be stripped. Text and children tags will be preserved.
// Note: 'b' and 'i' also appear in $replace_tags; the script calls replaceTags()
// before stripTags(), so those tags are renamed before this list is applied.
$remove_tags = [
'acronym',
'applet',
'b',
'basefont',
'big',
'bgsound',
'blink',
'center',
'del',
'dir',
'font',
'frame',
'frameset',
'hgroup',
'i',
'ins',
'kbd',
'marquee',
'nobr',
'noframes',
'plaintext',
'samp',
'small',
'span',
'strike',
'tt',
'u',
'var'
];
// List of attributes to remove. Applied to all tags.
// Each entry is an attribute name; any element carrying it loses that attribute.
$remove_attribs = [
'class',
'style',
'lang',
'width',
'height',
'align',
'hspace',
'vspace',
'dir'
];
// Your HTML code: the fragment that will be cleaned and echoed.
// NOTE(review): the sample contains a stray comma after the class attribute —
// presumably intentional, to simulate dirty input; confirm before "fixing" it.
$html = '<p class="large-font", style="color: red"><b>Hello</b> <span style="margin-left: 1em">world!</span><br>How are you doing?</p>';
/**
 * Rename HTML elements according to a tag map (e.g. <b> becomes <strong>).
 *
 * The fragment is parsed with DOMDocument and matching elements are replaced
 * by a new element of the target name that keeps the original attributes and
 * children. Working on the DOM (rather than on the serialized string) means
 * tag-like text inside scripts, comments or attribute values is never touched.
 *
 * @param string $html HTML fragment to process; it is decoded as UTF-8.
 * @param array  $tags Map of tag name to search for => replacement tag name.
 *                     Entries are applied in array order.
 * @return string The fragment with the listed tags renamed.
 */
function replaceTags($html, $tags) {
    // The <div> wrapper lets us get the fragment back without the <html>,
    // <head> and <body> tags DOMDocument adds. The <meta> declares the
    // charset so libxml decodes the input as UTF-8 instead of guessing.
    $source = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>';

    // Dirty or HTML5 markup makes libxml report parse errors as PHP warnings;
    // collect them internally instead and restore the caller's setting after.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $wrapper = $dom->getElementsByTagName('div')->item(0);

    foreach ($tags as $search => $replace) {
        // Searching from the wrapper only returns its descendants, so the
        // wrapper itself is never renamed. The node list is live, hence the
        // snapshot before the tree is modified.
        $matches = iterator_to_array($wrapper->getElementsByTagName((string) $search), false);

        foreach ($matches as $old) {
            $new = $dom->createElement($replace);

            foreach ($old->attributes as $attr) {
                $new->setAttribute($attr->nodeName, $attr->nodeValue);
            }

            // Moving (not cloning) children keeps nested matches valid.
            while ($old->firstChild !== null) {
                $new->appendChild($old->firstChild);
            }

            $old->parentNode->replaceChild($new, $old);
        }
    }

    // Drop the leading "<div>" and trailing "</div>" of the wrapper.
    return substr($dom->saveHTML($wrapper), strlen('<div>'), -strlen('</div>'));
}
/**
 * Strip the given tags from an HTML fragment, keeping their text and children.
 *
 * Each matching element is "unwrapped": its child nodes are moved in front of
 * it and the now-empty element (with all its attributes) is removed. Working
 * on the DOM means tag-like text inside scripts or comments is left alone.
 *
 * @param string $html HTML fragment to process; it is decoded as UTF-8.
 * @param array  $tags List of tag names to strip.
 * @return string The fragment without the listed tags.
 */
function stripTags($html, $tags) {
    // The <div> wrapper lets us get the fragment back without the <html>,
    // <head> and <body> tags DOMDocument adds. The <meta> declares the
    // charset so libxml decodes the input as UTF-8 instead of guessing.
    $source = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>';

    // Dirty or HTML5 markup makes libxml report parse errors as PHP warnings;
    // collect them internally instead and restore the caller's setting after.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $wrapper = $dom->getElementsByTagName('div')->item(0);

    foreach ($tags as $tag) {
        // Searching from the wrapper only returns its descendants, so the
        // wrapper survives even when 'div' is in the list. The node list is
        // live, hence the snapshot before the tree is modified.
        $matches = iterator_to_array($wrapper->getElementsByTagName((string) $tag), false);

        foreach ($matches as $node) {
            $parent = $node->parentNode;

            // Hoist the children into the parent, preserving their order.
            while ($node->firstChild !== null) {
                $parent->insertBefore($node->firstChild, $node);
            }

            $parent->removeChild($node);
        }
    }

    // Drop the leading "<div>" and trailing "</div>" of the wrapper.
    return substr($dom->saveHTML($wrapper), strlen('<div>'), -strlen('</div>'));
}
/**
 * Remove the given attributes from every element of an HTML fragment.
 *
 * @param string $html    HTML fragment to process; it is decoded as UTF-8.
 * @param array  $attribs List of attribute names to remove from all tags.
 * @return string The fragment with those attributes removed.
 */
function stripAttributes($html, $attribs) {
    // The <div> wrapper lets us get the fragment back without the <html>,
    // <head> and <body> tags DOMDocument adds. The <meta> declares the
    // charset so libxml decodes the input as UTF-8 instead of guessing.
    $source = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
        . '<div>' . $html . '</div>';

    // Dirty or HTML5 markup makes libxml report parse errors as PHP warnings;
    // collect them internally instead and restore the caller's setting after.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument;
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $wrapper = $dom->getElementsByTagName('div')->item(0);

    // Walk every element of the fragment and drop the unwanted attributes.
    // Checking by name avoids building an XPath expression from the attribute
    // name, which would fail on names that are not valid XPath name tests.
    foreach ($wrapper->getElementsByTagName('*') as $element) {
        foreach ($attribs as $attrib) {
            $name = (string) $attrib;
            if ($element->hasAttribute($name)) {
                $element->removeAttribute($name);
            }
        }
    }

    // Drop the leading "<div>" and trailing "</div>" of the wrapper.
    return substr($dom->saveHTML($wrapper), strlen('<div>'), -strlen('</div>'));
}
// Cleaning pipeline, innermost call first: rename deprecated tags, strip the
// unwanted tags, then drop the unwanted attributes. The result is printed.
$html = stripAttributes(
    stripTags(
        replaceTags($html, $replace_tags),
        $remove_tags
    ),
    $remove_attribs
);
echo $html;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment