Skip to content

Instantly share code, notes, and snippets.

@bes-internal
Last active January 25, 2019 13:54
Show Gist options
  • Save bes-internal/4f9133f57e9f531c3f70616b02ffd1bd to your computer and use it in GitHub Desktop.
Save bes-internal/4f9133f57e9f531c3f70616b02ffd1bd to your computer and use it in GitHub Desktop.
corpsite cleaner
# Shield every <z>...<z> region of the input from the cleaning passes.
#
# The inner text of each region is appended to the global @gtextcleanersave
# and replaced in the returned string by "<z>gtextcleanersaveN<z>".  N comes
# from the global counter $savei (read as 0 while it is unset/false), which is
# incremented after every region.  gtextcleaner_end() turns the placeholders
# back into the stored text.
#
# Takes the text as its first argument and returns the text with placeholders.
sub gtextcleaner_start {
    my ($text) = @_;

    # /s lets a protected region span several lines; /e runs the block below
    # once per region and uses its last value as the replacement.
    $text =~ s{<z>(.*?)<z>}{
        push @gtextcleanersave, $1;
        my $slot = $savei || 0;
        $savei++;
        "<z>gtextcleanersave${slot}<z>";
    }gsie;

    return $text;
}
# Final tidy-up pass over already-simplified HTML, then restore the regions
# that gtextcleaner_start() replaced by placeholders.
#
# Takes the text as its first argument and returns the cleaned text.  Reads the
# global @gtextcleanersave to resolve "<z>gtextcleanersaveN<z>" placeholders.
# The substitutions are order-dependent: later rules rely on the
# normalisation done by earlier ones.
sub gtextcleaner_end {
    my ($text) = @_;

    for ($text) {
        # Normalise tag spelling so the literal "<b>", "<i>", "<br>" patterns
        # below can match.
        s/<b +>/<b>/gi;
        s/<i +>/<i>/gi;
        s/<br.*?>/<br>/gi;

        # Bold directly inside a heading is redundant.
        s/(<h\d>)<b>/$1/gi;
        s/<\/b>(<\/h\d>)/$1/gi;

        # Drop spaces between consecutive line breaks.  The look-ahead keeps
        # the right-hand <br> available as the start of the next match, so a
        # single pass handles chains of any length.
        s/<br> +(?=<br>)/<br>/g;

        # Move line breaks that directly follow <b> in front of it (a run of
        # breaks collapses to one, because $1 holds only the last repetition).
        s/<b> *(<br>)+/$1<b>/g;

        # No breaks or padding around table row/cell tags.
        s/(?:<br>)+<tr/<tr/gi;
        s/(?:<br>)+<td/<td/gi;
        s/ *(<t[rd].*?>)/$1/gi;
        s/(<td[^>]*>) *(?:<br>)+/$1/gi;
        s/<tr> *(?:<br>)+/<tr>/gi;

        # No padding or breaks just inside or around headings.
        s/(<h\d>) +/$1/gi;
        s/ +(<\/h\d>)/$1/gi;
        s/(?:<br>)+ *(<h\d>)/$1/gi;
        s/(<h\d>)(?:<br>)+/$1/gi;
        s/(?:<br>)+(<\/?h\d.*?>)/$1/gi;
        s/(<\/?h\d>) *(?:<br>)+/$1/gi;

        # Remove elements that contain nothing but spaces and line breaks.
        s/<b> *(?:<br>)* *<\/b>//gi;
        s/<i> *(?:<br>)* *<\/i>//gi;
        # The heading rule must end on the matching *closing* tag; with an
        # opening tag there it never removed empty headings and instead ate
        # two adjacent opening heading tags.
        s/<(h\d)> *(?:<br>)* *<\/\1>//gi;
        s/<table> *(?:<br>)* *<\/table>//gi;
        s/<center> *(?:<br>)* *<\/center>//gi;
        s/<span> *(?:<br>)* *<\/span>//gi;
        s/<a> *(?:<br>)* *<\/a>//gi;

        # At most two consecutive line breaks anywhere.
        s/(<br>){2,}/<br><br>/gi;

        # Trim spaces and line breaks from both ends; the second space trim
        # catches spaces exposed by removing leading/trailing breaks.
        s/^ +//;
        s/ +$//;
        s/^(?:<br>)+//;
        s/(?:<br>)+$//;
        s/^(?: )+//;
        s/(?: )+$//;

        # Put the protected text back.  This is a scalar element lookup; an
        # index with no stored entry interpolates as an empty string.
        s/<z>gtextcleanersave(\d+)<z>/<z>$gtextcleanersave[$1]<z>/g;
    }

    return $text;
}
# "Lite" HTML clean-up: strips document-level markup, editor/Office junk
# attributes and namespaced tags, rewrites absolute links to the own site,
# turns paragraphs into <br><br>, and maps <strong>/<em> to <b>/<i>.
#
# Takes the HTML as its first argument and returns the cleaned HTML.
# <z>...<z> regions are shielded via gtextcleaner_start() before the rules run
# and restored by gtextcleaner_end() afterwards.
#
# Relies on names defined outside this sub: $shortdomain, $cookpath,
# $h1cleanheythisishardcodewtfareyoudoingstopstop, slurp_dataurl() and
# downloadimg().  The substitutions are order-dependent.
sub gtextcleanerlite {
    my ($text) = @_;
    $text = gtextcleaner_start($text);

    # Attributes removed together with their value (quoted or bare).  They
    # are processed one after another, in this order.
    my @junk_attrs = (
        'on\w+', 'lang', 'v', 'nowrap', 'face', 'valign', 'sdval', 'sdnum',
        'rollapp-href', 'sizcache', 'sizset', 'classname', 'scope',
        'internalinstanceid',
    );

    for ($text) {
        # HTML comments go, except one immediately preceded by "<style>".
        s/(?<!<style>)<!--.*?-->//gs;
        s/\&amp;/&/gi;
        # Flatten newlines: most patterns below use '.' without /s, so a tag
        # spanning several lines only matches after this step.
        s/\n/ /gs; #!!!

        # Make links to the own site relative.
        # NOTE(review): $shortdomain/$cookpath are interpolated as regex
        # source; if they are plain strings they should be wrapped in \Q...\E
        # -- confirm against where they are set.
        s/https?:\/\/(www\.)?$shortdomain$cookpath(\/)?//gsi;
        s/https?:\/\/index\.pl/index.pl/gsi;
        s/https?:\/\/files/files/gsi;
        s/<a href=""/<a href="$cookpath\/"/g;
        s/((?:src|href)=".*?")/slurp_dataurl($1)/ge;

        # Document-level markup.
        s/<\/?html.*?>//gi;
        s/<title>.*?<\/title>//gi;
        s/<head.*?>.*?<\/head>//gi;
        s/<\/?body.*?>//gi;
        s/<meta.*?>//gi;
        s/<link.*?>//gi;

        # tags and params with ":"
        s/(<\/?\w+:.*?>)//gi;
        s/ \w+:\w+=(["']).*?\1//gi;
        s/mso\-.*?(?=;|"|')//gi;
        # mailto links keep only their link text.
        s/<a href=['"]?mailto.*?>(.*?)<\/a>/$1/gi;

        # IMG
        s!<img src="resource://skype.*?>!!gi;
        s/<IMG/<img/g;
        s/<img>//g;
        s/\?ver=\d+//g;
        s/(<img.*?>)/downloadimg($1)/ge;

        # param
        for my $attr (@junk_attrs) {
            s/ ${attr}=((['"]).*?\2|.*?)(?=\s|>)//gi;
        }
        s/ style=(['"])\1//gi;    # only an empty style=""
        s/ alt=(['"])\1//gi;      # only an empty alt=""
        s/ data-select-like-a-boss=((['"]).*?\2|.*?)(?=\s|>)//gi;
        s/ width="null"//gi;
        s/"=""//g;
        s/<\/?span>//gi;

        # prod
        s/ border=(['"])\d+\1//gi;
        s/ cellpadding=(['"])0\1//gi;
        s/ cellspacing=(['"])0\1//gi;
        # Normalise the class list of every <table ... class="...">: drop
        # existing "prod"/"prod--editor", let "prod--noborder" win over a
        # following "prod--default", squeeze spaces, then prefix "prod".
        # The captures are limited to [^>] / [^"] so the match cannot run
        # past the end of the <table> tag and grab a class attribute that
        # belongs to a later tag.
        s{<table ([^>]*?)class="([^"]*)"([^>]*)>}{
            my ($lead, $classes, $trail) = ($1, " $2 ", $3);
            $classes =~ s! (?:prod|prod--editor) ! !g;
            $classes =~ s! prod--noborder prod--default ! prod--noborder !g;
            $classes =~ s!^ +!!;
            $classes =~ s! +$!!;
            $classes =~ s! +! !g;
            "<table ${lead}class=\"prod $classes\"${trail}>";
        }egi;

        #nestor
        s/<blockquote>\s+<p>/<blockquote>/g;
        s/<\/blockquote>\s+<p>/<\/blockquote>/g;
        # Paragraphs and tables become blocks separated by double breaks;
        # optional closing tags are dropped.
        s/<p( .*?)?>/<br><br>/gi;
        s/<table/<br><br><table/g;
        s/<\/table>/<\/table><br><br>/g;
        s/<\/p>//gi;
        s/<\/tr.*?>//gi;
        s/<\/td.*?>//gi;
        s/<\/?tbody.*?>//gi;
        s/<\/li.*?>//gsi;
        s/<(\/)?strong( .*?)?>/<$1b$2>/gsi;
        s/<(\/)?em( .*?)?>/<$1i$2>/gsi;
        # NOTE(review): presumably undoes "&section" in URLs having been
        # rendered as "§ion" -- hence no trailing ';' on "&sect". Confirm.
        s/§/&sect/g;
        s/\%C2\%A7ion/&section/g;
        s/_{10,}/<hr>/g;
        # Demote <h1> to <h2> unless the (external) flag is set.
        if (!$h1cleanheythisishardcodewtfareyoudoingstopstop) {
            s/<(\/)?h1(.*?)>/<$1h2$2>/gi;
        }

        #h target
        s/<a class=['"]?hrefer['"]?.*?<\/a>//g;

        # Typographic quotes to plain quotes.
        s/”/"/g;
        s/“/"/g;

        # Whitespace normalisation: BOM, NBSP, &nbsp;, CR, TAB, space runs,
        # then trim both ends.
        s/\x{FEFF}//g;
        s/\xA0/ /g;
        s/&nbsp;/ /gs;
        s/\r//gs;
        s/\t/ /gs;
        s/ {3,}/ /g;
        s/^\s+//;
        s/\s+$//;

        s/<(\/)?ibed/<$1embed/g; # FIXME
    }

    $text = gtextcleaner_end($text);
    return $text;
}
# Full HTML clean-up: everything gtextcleanerlite() does, plus removal of
# presentational wrappers (font/span/div/big/tt/u/s), scripts, styles, deleted
# text, column definitions and a few table attributes.
#
# Takes the HTML as its first argument and returns the cleaned HTML.
#
# <z>...<z> regions are shielded here *before* calling gtextcleanerlite().
# The lite pass shields and restores the resulting placeholders itself, so
# the regions are still placeholders while the rules below run, and only the
# final gtextcleaner_end() puts the original text back.
sub gtextcleaner {
    my ($text) = @_;
    $text = gtextcleaner_start($text);
    $text = gtextcleanerlite($text);

    for ($text) {
        # Presentational wrappers and unwanted elements.
        s/<\/?font.*?>//gi;
        s/<\/?span.*?>//gi;
        s/<\/?div.*?>//gi;
        s/<script.*?>.*?<\/script>//gi;
        s/<del>.*?<\/del>//gi;
        # Also match <colgroup> carrying attributes; otherwise the <col.*?>
        # rule below eats only its opening tag and leaves "</colgroup>".
        s/<colgroup(?:\s[^>]*)?>.*?<\/colgroup>//gi;
        s/<col.*?>//gi;
        s/<\/?big.*?>//gi;
        # s/<\/?small.*?>//gi; #????
        # s/(<tr.*?>).*?(<t[dh].*?>)/$1$2/gi; #?????
        s/<\/?tt( .*?)?>//gi;
        s/<\/?u( .*?)?>//gi;
        s/<\/?s( .*?)?>//gi;
        s/<style.*?>.*?<\/style>//gi;

        # Spacing around rules and after lists: exactly two breaks before an
        # <hr>, one after it, none after </ul> or </ol>.
        s/( *<br> *)+<hr>/<hr>/gi;
        s/<hr>/<br><br><hr>/gi;
        s/<hr>(?: *<br> *)+/<hr><br>/gi;
        s/<\/ul>(?: *<br> *)+/<\/ul>/gi;
        s/<\/ol>(?: *<br> *)+/<\/ol>/gi;

        #param
        for my $attr (qw(style cols rules frame)) {
            s/ ${attr}=((['"]).*?\2|.*?)(?=\s|>)//gi;
        }

        #prod clean wide (>1000) width, gscript table clean clean all
        # Drop a quoted numeric width above 1000 from <table>/<td> tags.  The
        # attribute is removed together with its leading space, so a tag
        # whose only attribute was the width ends up as plain "<table>" /
        # "<td>", and a look-alike such as data-width= is never touched.
        s{(<(?:table|td)[^>]*?) width=(['"])(\d+)\2}{
            $3 > 1000 ? $1 : "$1 width=$2$3$2";
        }gie;

        s/ class=(['"])xl\d+\1(?=\s|>)//gi;
        s/(<(?:table|tr|td)[^>]*) height=(['"])\d+\2([^>]*>)/$1$3/gi; #height in all: table tr td
    }

    $text = gtextcleaner_end($text);
    return $text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment