Skip to content

Instantly share code, notes, and snippets.

@gaffling
Last active February 7, 2020 11:59
Show Gist options
  • Save gaffling/ae9929668c5dc5d57ee08151978b092e to your computer and use it in GitHub Desktop.
Save gaffling/ae9929668c5dc5d57ee08151978b092e to your computer and use it in GitHub Desktop.
[HTML fixer] PHP class that cleans & repairs HTML code, unofficial copy of http://barattalo.it/html-fixer/ #php #class #HTMLrepair
<?PHP
/* -------------------------------------------------------------------------------------------------------------------------------- */
/* [HTML fixer] PHP class that clean & repair html code, unofficial copy of http://barattalo.it/html-fixer/ #php #class #HTMLrepair */
/* -------------------------------------------------------------------------------------------------------------------------------- */
class HtmlFixer {
    /* USAGE:
    $class = new HtmlFixer();
    $clean_html = $class->getFixedHtml($dirty_html); */

    /** Maximum number of parse/repair passes performed by getFixedHtml(). */
    const MAX_PASSES = 10;

    /** Maximum number of regex clean-up rounds performed by removeSpacesAndBadTags(). */
    const MAX_CLEAN_ROUNDS = 10;

    /** @var string Input of the current pass (overwritten with the intermediate result on later passes). */
    public $dirtyhtml;

    /** @var string Result of the most recent getFixedHtml() call. */
    public $fixedhtml;

    /**
     * @var string Copy of $fixedhtml.
     * @deprecated Older revisions stored the result under this (undeclared) name;
     *             it is kept so that existing readers keep working.
     */
    public $fixedxhtml;

    /** @var array Inline styles array of allowed css (if empty means ALL allowed). */
    public $allowed_styles;

    /** @var array Flat node list; each node has the keys pre, tag, post, parentTag and tagType. */
    private $matrix;

    /** @var bool When true, diagnostic HTML is echoed while fixing. */
    public $debug;

    /** @var string Debug rendering of the node tree (only filled when $debug is true). */
    private $fixedhtmlDisplayCode;

    public function __construct() {
        $this->dirtyhtml = "";
        $this->fixedhtml = "";
        $this->fixedxhtml = "";
        $this->debug = false;
        $this->fixedhtmlDisplayCode = "";
        $this->allowed_styles = array();
        $this->matrix = array();
    }

    /**
     * Clean and repair an HTML fragment.
     *
     * The input is parsed and rebuilt up to MAX_PASSES times; iteration stops as
     * soon as a pass reports no nesting corrections.
     *
     * @param string $dirtyhtml HTML to repair.
     * @return string Repaired HTML ("" for empty input).
     */
    public function getFixedHtml($dirtyhtml) {
        $this->dirtyhtml = (string)$dirtyhtml;
        // Reset every piece of per-call state so a reused instance never
        // returns the result of a previous call.
        $this->fixedhtml = "";
        $this->fixedxhtml = "";
        $this->fixedhtmlDisplayCode = "";
        $this->matrix = array();

        for ($pass = 0; $pass < self::MAX_PASSES; $pass++) {
            // iterations, every time it's getting better...
            if ($pass > 0) {
                $this->dirtyhtml = $this->fixedhtml;
            }
            $errorsFound = $this->charByCharJob();
            // Drop the synthetic wrapper added by charByCharJob().
            $this->fixedhtml = str_replace(array('<root>', '</root>'), '', $this->fixedhtml);
            $this->fixedhtml = $this->removeSpacesAndBadTags($this->fixedhtml);
            if (!$errorsFound) {
                break; // if no corrections made, stops iteration
            }
        }

        $this->fixedxhtml = $this->fixedhtml;
        return $this->fixedhtml;
    }

    /**
     * Lowercase the attribute-name part of one tag chunk.
     *
     * @param string $m A chunk of a tag, e.g. `HREF="Foo"` or `<IMG`.
     * @return string The chunk with everything before the first "=" lowercased
     *                (the whole chunk when it contains no "=").
     */
    private function fixStrToLower($m) {
        $eq = strpos($m, '=');
        if ($eq === false) {
            return strtolower($m);
        }
        return strtolower(substr($m, 0, $eq)).substr($m, $eq);
    }

    /**
     * Make sure the value of a single `name=value` chunk is quoted on both sides.
     *
     * A trailing ">" or "/>" (the end of the tag) is split off first and put back
     * after the closing quote. The chunk is rebuilt from its parts instead of
     * being patched with str_replace(), so a value that also occurs inside the
     * attribute name (e.g. `class=s`) cannot corrupt the name.
     *
     * @param string $s Chunk to fix.
     * @return string Chunk with a quoted value; returned unchanged when there is
     *                no "=" or nothing follows it.
     */
    private function fixQuotes($s) {
        $eq = strpos($s, '=');
        if ($eq === false) {
            return $s;
        }
        $name = substr($s, 0, $eq);
        $value = trim((string)substr($s, $eq + 1));
        if ($value === "") {
            return $s;
        }

        // Separate the tag terminator from the value.
        $tail = "";
        if (substr($value, -2) === "/>") {
            $tail = "/>";
            $value = (string)substr($value, 0, -2);
        } elseif (substr($value, -1) === ">") {
            $tail = ">";
            $value = (string)substr($value, 0, -1);
        }
        if ($value === "") {
            // `name=>`: nothing but the terminator followed the "=".
            return $name.'=""'.$tail;
        }

        $first = $value[0];
        $last = $value[strlen($value) - 1];
        $q = "\"";
        if ($first === "\"" || $first === "'") {
            $q = $first; // reuse the opening quote style for a missing closing quote
        } else {
            $value = "\"".$value; // missing " or ' at the beginning
        }
        if ($last !== "\"" && $last !== "'") {
            $value .= $q; // missing " or ' at the end
        }
        return $name."=".$value.$tail;
    }

    /**
     * Normalise one tag: strip border/borderColor attributes, lowercase the tag
     * and attribute names and quote attribute values.
     *
     * The tag is split on spaces that are outside a complete quoted string, so
     * values such as `style="color: red"` stay in one piece. A quote that is never
     * closed does not protect the spaces after it; such a value still ends at the
     * next space.
     *
     * @param string $t Raw tag including "<" and ">".
     * @return string Normalised tag.
     */
    private function fixTag($t) {
        // remove non standard attributes
        $t = preg_replace(
            array(
                '/borderColor=([^ >])*/i',
                '/border=([^ >])*/i'
            ),
            '',
            $t
        );

        if (preg_match_all('/(?:"[^"]*"|\'[^\']*\'|[^ ])+/', $t, $found)) {
            $chunks = $found[0];
        } else {
            $chunks = explode(" ", $t); // regex failure: fall back to a plain split
        }

        $nt = "";
        foreach ($chunks as $chunk) {
            $chunk = $this->fixStrToLower($chunk);
            if (strpos($chunk, "=") !== false) {
                $chunk = $this->fixQuotes($chunk);
            }
            $nt .= $chunk." ";
        }
        $nt = preg_replace("/<( )*/i", "<", $nt);
        $nt = preg_replace("/( )*>/i", ">", $nt);
        return trim($nt);
    }

    /**
     * Extract the text between the first $tag1 and the next $tag2 (case-insensitive).
     *
     * @param string $tag1  Opening delimiter.
     * @param string $tag2  Closing delimiter.
     * @param string $tutto Text to search.
     * @return string Text between the delimiters, or '' when either one is missing.
     */
    private function extractChars($tag1, $tag2, $tutto) {
        $start = stripos($tutto, $tag1);
        if ($start === false) {
            return '';
        }
        $rest = (string)substr($tutto, $start + strlen($tag1));
        $end = stripos($rest, $tag2);
        if ($end === false) {
            return '';
        }
        return (string)substr($rest, 0, $end);
    }

    /**
     * Merge every `style="..."` attribute of a tag into a single one, placed
     * where the first one was, optionally keeping only the allowed declarations.
     *
     * Attributes are located and removed by offset, which guarantees progress on
     * every iteration (matching is case-insensitive, so a text-based removal
     * could miss what the search found).
     *
     * @param string $s Tag to process.
     * @return string Tag with at most one merged style attribute; if a style
     *                value has no closing quote, one is added before the end of
     *                the tag.
     */
    private function mergeStyleAttributes($s) {
        $needle = "style=\"";
        $merged = "";
        $found = 0;
        $insertAt = 0;
        $unterminated = false;

        while (($pos = stripos($s, $needle)) !== false) {
            $valueStart = $pos + strlen($needle);
            $valueEnd = strpos($s, "\"", $valueStart);
            if ($valueEnd === false) {
                // missing closing quote! handled after the loop.
                $unterminated = true;
                break;
            }
            $value = (string)substr($s, $valueStart, $valueEnd - $valueStart);
            // Cut the whole attribute out of the tag.
            $s = substr($s, 0, $pos).(string)substr($s, $valueEnd + 1);
            $insertAt = ($found === 0) ? $pos : min($insertAt, $pos);
            if ($value !== "") {
                if (substr($value, -1) !== ";") {
                    $value .= ";";
                }
                $merged .= $value;
            }
            $found++;
        }

        if (is_array($this->allowed_styles) && count($this->allowed_styles) > 0) {
            // keep only allowed styles by Martin Vool 2010-04-19
            $kept = "";
            foreach (explode(';', $merged) as $declaration) {
                if ($declaration === "") {
                    continue;
                }
                foreach ($this->allowed_styles as $allowed) {
                    if (stripos($declaration, $allowed) !== false) {
                        $kept .= $declaration.';';
                        break;
                    }
                }
            }
            $merged = $kept;
        }

        if ($found > 0) {
            $s = substr($s, 0, $insertAt)."style=\"".$merged."\"".(string)substr($s, $insertAt);
        }
        if ($unterminated) {
            // add missing quote.
            $s = preg_replace("/(\/)?>/i", "\"\\1>", $s);
        }
        return $s;
    }

    /**
     * Turn `<br>`, `<img ...>`, `<hr>` and `<input ...>` into their `/>` form.
     *
     * @param string $tag  Tag text.
     * @param string $tipo Tag type as returned by getTypeOfTag().
     * @return string The tag, self-closed when $tipo is one of the four types above.
     */
    private function fixAutoclosingTags($tag, $tipo = "") {
        if (in_array($tipo, array("img", "input", "br", "hr"), true)) {
            if (stripos($tag, '/>') === false) {
                $tag = str_replace('>', '/>', $tag);
            }
        }
        return $tag;
    }

    /**
     * Return the tag type: the first space-separated word of the tag once every
     * "<", ">" and "/" has been removed.
     *
     * @param string $tag Tag text.
     * @return string Tag type, e.g. "p" for both `<p class="a">` and `</p>`.
     */
    private function getTypeOfTag($tag) {
        $tag = trim(str_replace(array(">", "<", "/"), "", $tag));
        $a = explode(" ", $tag);
        return $a[0];
    }

    /**
     * Look for tags nested inside a parent they may not live in and repair the
     * tree by removing that parent.
     *
     * The offending parent's tag is replaced by a comment and its type is
     * cleared (so no closing tag is written for it); the offending node and
     * every later node with the same parent are re-attached to the grandparent.
     *
     * @return int Number of errors found.
     */
    private function checkTree() {
        $inline = array("p", "b", "i", "font", "u", "small", "strong", "em");
        // tag type => parent types it cannot stay inside
        $forbiddenParents = array(
            "div"    => $inline,
            "p"      => $inline,
            "table"  => array_merge($inline, array("tr", "table")),
            "b"      => array("b", "strong"),
            "strong" => array("b", "strong"),
            "i"      => array("i", "em"),
            "em"     => array("i", "em"),
        );

        $errorsCounter = 0;
        $total = count($this->matrix);
        for ($i = 1; $i < $total; $i++) {
            $type = $this->matrix[$i]["tagType"];
            if (!isset($forbiddenParents[$type])) {
                continue;
            }
            $parentIndex = $this->matrix[$i]["parentTag"];
            $parentType = $this->matrix[$parentIndex]["tagType"];
            if (!in_array($parentType, $forbiddenParents[$type], true)) {
                continue;
            }

            $errorsCounter++;
            if ($this->debug) echo "<div style='color:#ff0000'>Found a <b>".$type."</b> tag inside a <b>".htmlspecialchars($parentType)."</b> tag at node $i: MOVED</div>";
            $swap = $this->matrix[$parentIndex]["parentTag"];
            if ($this->debug) echo "<div style='color:#ff0000'>Every node that has parent ".$parentIndex." will have parent ".$swap."</div>";

            $this->matrix[$parentIndex]["tag"] = "<!-- T A G \"".$parentType."\" R E M O V E D -->";
            $this->matrix[$parentIndex]["tagType"] = "";
            // Only node $i and the nodes after it move; earlier children stay
            // under the (now commented-out) parent.
            for ($j = $total - 1; $j >= $i; $j--) {
                if ($this->matrix[$j]["parentTag"] == $parentIndex) {
                    $this->matrix[$j]["parentTag"] = $swap;
                }
            }
        }
        return $errorsCounter;
    }

    /**
     * Build the repaired HTML for all children of a node, recursively.
     *
     * @param int|string $parentTag Index of the parent node in the matrix.
     * @return string HTML of the children, including their closing tags.
     */
    private function findSonsOf($parentTag) {
        $out = "";
        $total = count($this->matrix);
        for ($i = 1; $i < $total; $i++) {
            $node = $this->matrix[$i];
            if ($node["parentTag"] != $parentTag) {
                continue;
            }
            $hasTag = ($node["tag"] != "");
            $out .= $node["pre"];
            if ($hasTag) {
                $out .= $node["tag"];
            }
            $out .= $node["post"];
            if (!$hasTag) {
                continue; // closing-tag nodes only carry text
            }
            $out .= $this->findSonsOf($i);
            // write the closing tag (not for removed tags or self-closing ones)
            if ($node["tagType"] != "" && !in_array($node["tagType"], array("br", "img", "hr", "input"), true)) {
                $out .= "</".$node["tagType"].">";
            }
        }
        return $out;
    }

    /**
     * Debug counterpart of findSonsOf(): renders the subtree as escaped,
     * indented HTML with node numbers.
     *
     * @param int|string $parentTag Index of the parent node in the matrix.
     * @return string HTML markup visualising the subtree.
     */
    private function findSonsOfDisplayCode($parentTag) {
        $out = "";
        $total = count($this->matrix);
        for ($i = 1; $i < $total; $i++) {
            $node = $this->matrix[$i];
            if ($node["parentTag"] != $parentTag) {
                continue;
            }
            $hasTag = ($node["tag"] != "");
            $out .= "<div style=\"padding-left:15\"><span style='float:left;background-color:#FFFF99;color:#000;'>{$i}:</span>";
            if ($node["pre"] != "") {
                $out .= htmlspecialchars($node["pre"])."<br>";
            }
            if ($hasTag) {
                $out .= "".htmlspecialchars($node["tag"])."<span style='background-color:red; color:white'>{$i} <em>".$node["tagType"]."</em></span>";
            }
            $out .= htmlspecialchars($node["post"]);
            if ($hasTag) {
                $out .= "<div>".$this->findSonsOfDisplayCode($i)."</div>\n";
                if ($node["tagType"] != "" && !in_array($node["tagType"], array("br", "img", "hr", "input"), true)) {
                    $out .= "<div style='color:red'>".htmlspecialchars("</".$node["tagType"].">")."{$i} <em>".$node["tagType"]."</em></div>";
                }
            }
            $out .= "</div>\n";
        }
        return $out;
    }

    /**
     * Replace line breaks with spaces and strip empty p/span/strong/em/font/small
     * elements, MS-Office leftovers, DOCTYPE declarations, comments and
     * processing instructions.
     *
     * The replacements are repeated (at most MAX_CLEAN_ROUNDS times) until the
     * text stops changing, because removing one empty element can leave its
     * parent empty.
     *
     * @param string $s HTML to clean.
     * @return string Cleaned HTML. If PCRE reports an error the last good value
     *                is returned instead of losing the whole document.
     */
    private function removeSpacesAndBadTags($s) {
        $rules = array(
            array('/[\r\n]/i', ' '),
            // NOTE(review): as written this replaces a space with a space (no-op);
            // presumably meant to collapse double spaces - confirm against upstream.
            array('/ /i', ' '),
            array('/<p([^>])*>(&nbsp;)*\s*<\/p>/i', ''),
            array('/<span([^>])*>(&nbsp;)*\s*<\/span>/i', ''),
            array('/<strong([^>])*>(&nbsp;)*\s*<\/strong>/i', ''),
            array('/<em([^>])*>(&nbsp;)*\s*<\/em>/i', ''),
            array('/<font([^>])*>(&nbsp;)*\s*<\/font>/i', ''),
            array('/<small([^>])*>(&nbsp;)*\s*<\/small>/i', ''),
            array('/<\?xml:namespace([^>])*><\/\?xml:namespace>/i', ''),
            array('/<\?xml:namespace([^>])*\/>/i', ''),
            array('/class=\"MsoNormal\"/i', ''),
            array('/<o:p><\/o:p>/i', ' '),
            array('/<!DOCTYPE([^>])*>/i', ''),
            // `.` with the s modifier matches the same text as `(.|\s)` without
            // creating a backtracking point per character.
            array('/<!--.*?-->/s', ''),
            array('/<\?.*?\?>/s', ''),
        );
        $patterns = array();
        $replacements = array();
        foreach ($rules as $rule) {
            $patterns[] = $rule[0];
            $replacements[] = $rule[1];
        }

        $s = (string)$s;
        for ($round = 0; $round < self::MAX_CLEAN_ROUNDS; $round++) {
            $cleaned = preg_replace($patterns, $replacements, trim($s));
            if ($cleaned === null) {
                break; // PCRE error: keep what we have
            }
            if ($cleaned === $s) {
                break; // fixed point reached, further rounds cannot change anything
            }
            $s = $cleaned;
        }
        return $s;
    }

    /**
     * One parse/repair pass: read $this->dirtyhtml, build the node matrix,
     * repair the nesting and store the rebuilt HTML (still wrapped in
     * <root>...</root>) in $this->fixedhtml.
     *
     * Node layout: an opening or self-closing tag becomes a node holding the tag
     * and the text that preceded it ("pre"); a closing tag becomes a tag-less
     * node, attached to the element currently on top of the open-element stack,
     * that carries the text preceding it ("post").
     *
     * @return int Number of nesting errors corrected (0 for empty input).
     */
    private function charByCharJob() {
        $s = $this->removeSpacesAndBadTags($this->dirtyhtml);
        if ($s === "") {
            $this->fixedhtml = "";
            return 0;
        }
        $s = "<root>".$s."</root>";
        $length = strlen($s);

        // Start every pass from a fresh matrix; node 0 is the virtual parent of <root>.
        $this->matrix = array(
            0 => array("tagType" => "", "tag" => "", "parentTag" => "0", "pre" => "", "post" => ""),
        );
        $j = 0;               // index of the last node created
        $padri = array("0");  // stack of open elements (node indexes)
        $tags = array();      // tag type => number of currently open tags
        $ns = "";             // text collected since the last tag
        $i = 0;

        while ($i < $length) {
            if ($s[$i] != "<") {
                // content of the tag
                $ns .= $s[$i];
                $i++;
                continue;
            }

            // found a tag: take everything up to the next ">". One always
            // exists because $s ends with "</root>".
            $contenuto = $ns;
            $ns = "";
            $end = strpos($s, ">", $i);
            if ($end === false) {
                $end = $length - 1;
            }
            $tag = substr($s, $i, $end - $i + 1);
            $i = $end + 1;

            // $tag contains a tag <...chars...> so let's clean it!
            $tag = $this->fixTag($tag);
            $tagType = $this->getTypeOfTag($tag);
            $tag = $this->fixAutoclosingTags($tag, $tagType);
            $tag = $this->mergeStyleAttributes($tag);
            if (!isset($tags[$tagType])) {
                $tags[$tagType] = 0;
            }
            $isClosing = (stripos($tag, '/'.$tagType.'>') !== false);

            if ($isClosing && $tags[$tagType] == 0) {
                // there is a close tag without any open tag, I delete it
                if ($this->debug) echo "<div style='color:#ff0000'>Found a closing tag <b>".htmlspecialchars($tag)."</b> at char $end without open tag: REMOVED</div>";
                // Only the tag is dropped: hand the text before it back to the buffer.
                $ns = $contenuto;
                continue;
            }

            $j++;
            if ($isClosing) {
                // it's the closing tag
                $ind = array_pop($padri);
                $this->matrix[$j] = array("pre" => "", "post" => $contenuto, "parentTag" => $ind, "tag" => "", "tagType" => "");
                $tags[$tagType]--;
                continue;
            }

            $indexparentTag = end($padri); // current top of the stack, left in place
            $this->matrix[$j] = array("pre" => $contenuto, "post" => "", "parentTag" => $indexparentTag, "tag" => $tag, "tagType" => $tagType);
            if (strpos($tag, "/>") === false) {
                // it's a open tag: it becomes the parent of what follows
                $tags[$tagType]++;
                array_push($padri, $j);
            }
            // otherwise it's a autoclosing tag and the stack stays as it is
        }

        $errorsCounter = $this->checkTree(); // errorsCounter contains the number of removed tags
        $this->fixedhtml = $this->findSonsOf(0); // build html fixed

        if ($this->debug) {
            $this->fixedhtmlDisplayCode = $this->findSonsOfDisplayCode(0);
            echo "<table border=1 cellspacing=0 cellpadding=0>";
            echo "<tr><th>node id</th>";
            echo "<th>pre</th>";
            echo "<th>tag</th>";
            echo "<th>post</th>";
            echo "<th>parentTag</th>";
            echo "<th>tipo</th></tr>";
            for ($k = 0; $k <= $j; $k++) {
                echo "<tr><td>$k</td>";
                echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["pre"])."</td>";
                echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["tag"])."</td>";
                echo "<td>&nbsp;".htmlspecialchars($this->matrix[$k]["post"])."</td>";
                echo "<td>&nbsp;".$this->matrix[$k]["parentTag"]."</td>";
                echo "<td>&nbsp;<i>".htmlspecialchars($this->matrix[$k]["tagType"])."</i></td></tr>";
            }
            echo "</table>";
            echo "<hr/>{$j}<hr/>\n\n\n\n".$this->fixedhtmlDisplayCode;
        }
        return $errorsCounter;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment