Skip to content

Instantly share code, notes, and snippets.

@kingozorg
Forked from lyquix-owner/cleanhtml.php
Created August 10, 2022 03:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kingozorg/968b79ab81fce1a0d78616105374ed85 to your computer and use it in GitHub Desktop.
Save kingozorg/968b79ab81fce1a0d78616105374ed85 to your computer and use it in GitHub Desktop.
PHP script to automatically clean dirty HTML. Removes unnecessary attributes (e.g. style, id, dir), replaces deprecated tags with valid ones (e.g. <b> to <strong>), and strips undesirable tags (e.g. <font>). We have used this script to safely clean hundreds of blog posts that were littered with inline styling.
<?php
// List of tags to be replaced and their replacement
$replace_tags = [
'i' => 'em',
'b' => 'strong'
];
// List of tags to be stripped. Text and children tags will be preserved.
$remove_tags = [
'acronym',
'applet',
'b',
'basefont',
'big',
'bgsound',
'blink',
'center',
'del',
'dir',
'font',
'frame',
'frameset',
'hgroup',
'i',
'ins',
'kbd',
'marquee',
'nobr',
'noframes',
'plaintext',
'samp',
'small',
'span',
'strike',
'tt',
'u',
'var'
];
// List of attributes to remove. Applied to all tags.
$remove_attribs = [
'class',
'style',
'lang',
'width',
'height',
'align',
'hspace',
'vspace',
'dir'
];
// Your HTML code
$html = '<p class="large-font", style="color: red"><b>Hello</b> <span style="margin-left: 1em">world!</span><br>How are you doing?</p>';
function replaceTags($html, $tags) {
// Clean the HTML
$html = '<div>' . $html . '</div>'; // Workaround to get the HTML back from DOMDocument without the <html><head> and <body> tags
$dom = new DOMDocument;
$dom->loadHTML($html);
$html = substr($dom->saveHTML($dom->getElementsByTagName('div')->item(0)), 5, -6);
// Use simple string replace to replace tags
foreach($tags as $search => $replace) {
$html = str_replace('<' . $search . '>', '<' . $replace . '>', $html);
$html = str_replace('<' . $search . ' ', '<' . $replace . ' ', $html);
$html = str_replace('</' . $search . '>', '</' . $replace . '>', $html);
}
return $html;
}
function stripTags($html, $tags) {
// Remove all attributes from tags to be removed
$html = '<div>' . $html . '</div>';
$dom = new DOMDocument;
$dom->loadHTML($html);
foreach($tags as $tag){
$nodes = $dom->getElementsByTagName($tag);
foreach($nodes as $node) {
// Remove attributes
while($node->attributes->length) {
$node->removeAttribute($node->attributes->item(0)->name);
}
}
}
$html = substr($dom->saveHTML($dom->getElementsByTagName('div')->item(0)), 5, -6);
// Strip tags using string replace
foreach($tags as $tag){
$html = str_replace('<' . $tag . '>', '', $html);
$html = str_replace('</' . $tag . '>', '', $html);
}
return $html;
}
function stripAttributes($html, $attribs) {
// Find all nodes that contain the attribute and remove it
$html = '<div>' . $html . '</div>';
$dom = new DOMDocument;
$dom->loadHTML($html);
$xPath = new DOMXPath($dom);
foreach($attribs as $attrib) {
$nodes = $xPath->query('//*[@' . $attrib . ']');
foreach($nodes as $node) $node->removeAttribute($attrib);
}
return substr($dom->saveHTML($dom->getElementsByTagName('div')->item(0)), 5, -6);
}
$html = replaceTags($html, $replace_tags);
$html = stripTags($html, $remove_tags);
$html = stripAttributes($html, $remove_attribs);
echo $html;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment