Last active
January 25, 2019 13:54
-
-
Save bes-internal/4f9133f57e9f531c3f70616b02ffd1bd to your computer and use it in GitHub Desktop.
corpsite cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hides every <z>...<z> section of the text from the clean-up passes.
#
# The content of each section is appended to the package array
# @gtextcleanersave and replaced in the text by "<z>gtextcleanersaveN<z>",
# where N is the current value of the package counter $savei (0 while the
# counter is still unset). The counter is incremented after every section.
# gtextcleaner_end() swaps the placeholders back for the saved content.
#
# Takes the text as first argument; returns the text with placeholders.
sub gtextcleaner_start {
    my ($text) = @_;

    # /s lets a section span several lines, /i also accepts <Z>.
    $text =~ s{<z>(.*?)<z>}{
        push @gtextcleanersave, $1;
        my $slot = $savei || 0;
        $savei++;
        "<z>gtextcleanersave$slot<z>";
    }gsie;

    return $text;
}
# Final tidy-up pass used by gtextcleanerlite() and gtextcleaner().
#
# Normalises the spelling of <b>, <i> and <br>, removes breaks and spaces
# around table and heading tags, drops elements that hold nothing but spaces
# and breaks, limits runs of breaks to two, trims both ends of the text and
# finally replaces the "<z>gtextcleanersaveN<z>" placeholders written by
# gtextcleaner_start() with element N of the package array @gtextcleanersave.
#
# Takes the text as first argument; returns the cleaned text.
# The statements depend on their order - keep the sequence when editing.
sub gtextcleaner_end {
    my ($text) = @_;

    for ($text) {
        # Canonical spelling for <b>, <i> and every <br ...> variant.
        s/<b +>/<b>/gi;
        s/<i +>/<i>/gi;
        s/<br.*?>/<br>/gi;

        # A bold wrapper directly inside a heading is dropped.
        s/(<h\d>)<b>/$1/gi;
        s/<\/b>(<\/h\d>)/$1/gi;

        # Remove spaces between consecutive breaks. The look-ahead leaves the
        # second <br> unconsumed, so a whole chain is handled in one pass.
        s/<br> +(?=<br>)/<br>/g;

        # Move breaks that directly follow <b> in front of it. The whole run
        # is captured: a quantified capture group would keep only the last
        # <br> and silently drop the others.
        s/<b> *((?:<br>)+)/$1<b>/g;

        # No breaks or spaces around table rows and cells.
        s/(?:<br>)+<tr/<tr/gi;
        s/(?:<br>)+<td/<td/gi;
        s/ *(<t[rd].*?>)/$1/gi;
        s/(<td[^>]*>) *(?:<br>)+/$1/gi;
        s/<tr> *(?:<br>)+/<tr>/gi;

        # No spaces or breaks next to heading tags.
        s/(<h\d>) +/$1/gi;
        s/ +(<\/h\d>)/$1/gi;
        s/(?:<br>)+ *(<h\d>)/$1/gi;
        s/(<h\d>)(?:<br>)+/$1/gi;
        s/(?:<br>)+(<\/?h\d.*?>)/$1/gi;
        s/(<\/?h\d>) *(?:<br>)+/$1/gi;

        # Drop elements that contain nothing but spaces and breaks.
        s/<b> *(?:<br>)* *<\/b>//gi;
        s/<i> *(?:<br>)* *<\/i>//gi;
        # The heading rule must end on a closing tag like its neighbours;
        # matching a second opening tag would delete "<h2><h3>" instead of
        # an empty heading.
        s/<h\d> *(?:<br>)* *<\/h\d>//gi;
        s/<table> *(?:<br>)* *<\/table>//gi;
        s/<center> *(?:<br>)* *<\/center>//gi;
        s/<span> *(?:<br>)* *<\/span>//gi;
        s/<a> *(?:<br>)* *<\/a>//gi;

        # Never more than two breaks in a row.
        s/(<br>){2,}/<br><br>/gi;

        # Trim spaces, then breaks, from both ends of the text.
        s/^ +//;
        s/ +$//;
        s/^(?:<br>)+//;
        s/(?:<br>)+$//;
        # Spaces exposed by the break trim above are removed as well.
        # NOTE(review): the pattern may originally have targeted an HTML
        # entity such as &nbsp; that was decoded in transit - confirm.
        s/^(?: )+//;
        s/(?: )+$//;

        # Put the protected sections back. A scalar element is interpolated
        # instead of a one-element slice, as perl's warnings recommend; the
        # resulting text is the same.
        s/<z>gtextcleanersave(\d+)<z>/<z>$gtextcleanersave[$1]<z>/g;
    }

    return $text;
}
# Rebuilds an opening <table> tag that carries a class attribute; used by
# gtextcleanerlite(). $before and $after are the attribute text on either
# side of the class attribute, $classes is the attribute value. The classes
# "prod" and "prod--editor" are removed from the list, the pair
# "prod--noborder prod--default" is reduced to "prod--noborder", spaces are
# tidied, and "prod" is then put at the front of the list.
sub _gtextcleanerlite_table_tag {
    my ($before, $classes, $after) = @_;

    # Pad with spaces so that every class name is space-delimited.
    my $list = " $classes ";
    $list =~ s! (?:prod|prod--editor) ! !g;
    $list =~ s! prod--noborder prod--default ! prod--noborder !g;
    $list =~ s!^ +!!;
    $list =~ s! +$!!;
    $list =~ s! +! !g;

    return "<table ${before}class=\"prod $list\"${after}>";
}

# Light clean-up of pasted HTML.
#
# Sections wrapped in <z>...<z> are hidden first by gtextcleaner_start() and
# restored at the end by gtextcleaner_end(), which also runs the shared final
# tidy-up. In between, the text is flattened to one line, links to this site
# are made relative, document-level wrappers, namespaced tags and a list of
# unwanted attributes are removed, and paragraphs are turned into double
# breaks.
#
# Reads the package variables $shortdomain, $cookpath and
# $h1cleanheythisishardcodewtfareyoudoingstopstop, and calls slurp_dataurl()
# and downloadimg(), none of which are defined in this part of the file.
#
# Takes the text as first argument; returns the cleaned text.
# The statements depend on their order - keep the sequence when editing.
#
# NOTE(review): a few substitutions below do nothing as written here
# (s/\&/&/, s/§/§/, s/ / /). They may originally have contained HTML
# entities (e.g. &amp;, &sect;, &nbsp;) that were decoded in transit -
# confirm against the original source.
sub gtextcleanerlite {
    my ($text) = @_;

    $text = gtextcleaner_start($text);

    for ($text) {
        # HTML comments, except one that directly follows <style>.
        s/(?<!<style>)<!--.*?-->//gs;
        s/\&/&/gi;
        s/\n/ /gs;    #!!! from here on the text is a single line

        # Make links that point at this site relative.
        # NOTE(review): $shortdomain and $cookpath are interpolated as regex
        # source, so a "." in them matches any character; wrap them in
        # \Q...\E if they hold literal strings - confirm what callers store.
        s/https?:\/\/(www\.)?$shortdomain$cookpath(\/)?//gsi;
        # The dot is escaped so that only the literal name "index.pl" is
        # rewritten, not hosts such as "index-pl...".
        s/https?:\/\/index\.pl/index.pl/gsi;
        s/https?:\/\/files/files/gsi;
        s/<a href=""/<a href="$cookpath\/"/g;
        # Each src/href attribute is replaced by what slurp_dataurl() returns.
        s/((?:src|href)=".*?")/slurp_dataurl($1)/ge;

        # Document-level wrappers.
        s/<\/?html.*?>//gi;
        s/<title>.*?<\/title>//gi;
        s/<head.*?>.*?<\/head>//gi;
        s/<\/?body.*?>//gi;
        s/<meta.*?>//gi;
        s/<link.*?>//gi;

        # tags and params with ":"
        s/(<\/?\w+:.*?>)//gi;
        s/ \w+:\w+=(["']).*?\1//gi;
        s/mso\-.*?(?=;|"|')//gi;
        # A mailto link is reduced to its link text.
        s/<a href=['"]?mailto.*?>(.*?)<\/a>/$1/gi;

        # IMG
        s!<img src="resource://skype.*?>!!gi;
        s/<IMG/<img/g;
        s/<img>//g;
        s/\?ver=\d+//g;
        # Each <img ...> tag is replaced by what downloadimg() returns.
        s/(<img.*?>)/downloadimg($1)/ge;

        # param: attributes removed whatever their value (quoted or bare).
        # The names are regex fragments and are processed in this order.
        for my $attr ('on\w+', qw(
            lang v nowrap face valign sdval sdnum rollapp-href
            sizcache sizset classname scope internalinstanceid
        )) {
            s/ $attr=((['"]).*?\2|.*?)(?=\s|>)//gi;
        }
        # style and alt are removed only when empty.
        s/ style=(['"])\1//gi;
        s/ alt=(['"])\1//gi;
        s/ data-select-like-a-boss=((['"]).*?\2|.*?)(?=\s|>)//gi;
        s/ width="null"//gi;
        s/"=""//g;
        s/<\/?span>//gi;

        # prod
        s/ border=(['"])\d+\1//gi;
        s/ cellpadding=(['"])0\1//gi;
        s/ cellspacing=(['"])0\1//gi;
        s/<table (.*?)class="(.*?)"(.*?)>/_gtextcleanerlite_table_tag($1, $2, $3)/egi;

        #nestor
        s/<blockquote>\s+<p>/<blockquote>/g;
        s/<\/blockquote>\s+<p>/<\/blockquote>/g;
        # Paragraphs become double breaks; tables get one on each side.
        s/<p( .*?)?>/<br><br>/gi;
        s/<table/<br><br><table/g;
        s/<\/table>/<\/table><br><br>/g;
        s/<\/p>//gi;
        # Closing tags of rows, cells, tbody and list items are dropped.
        s/<\/tr.*?>//gi;
        s/<\/td.*?>//gi;
        s/<\/?tbody.*?>//gi;
        s/<\/li.*?>//gsi;
        # <strong> becomes <b> and <em> becomes <i>, attributes kept.
        s/<(\/)?strong( .*?)?>/<$1b$2>/gsi;
        s/<(\/)?em( .*?)?>/<$1i$2>/gsi;
        s/§/§/g;
        s/\%C2\%A7ion/§ion/g;
        # A run of ten or more underscores becomes a rule.
        s/_{10,}/<hr>/g;
        # Unless the flag is set, h1 headings are demoted to h2.
        if (!$h1cleanheythisishardcodewtfareyoudoingstopstop) {
            s/<(\/)?h1(.*?)>/<$1h2$2>/gi;
        }

        #h target
        s/<a class=['"]?hrefer['"]?.*?<\/a>//g;
        s/”/"/g;
        s/“/"/g;

        # Invisible characters and whitespace.
        s/\x{FEFF}//g;
        s/\xA0/ /g;
        s/ / /gs;
        s/\r//gs;
        s/\t/ /gs;
        s/ {3,}/ /g;
        s/^\s+//;
        s/\s+$//;

        s/<(\/)?ibed/<$1embed/g;    # FIXME
    }

    $text = gtextcleaner_end($text);
    return $text;
}
# Helper for gtextcleaner(): $tag is the matched part of an opening tag and
# $width the numeric value of a width attribute found in it. When the width
# is greater than 1000 the first "width=..." in $tag is removed; otherwise
# the tag text is returned unchanged.
sub _gtextcleaner_drop_wide_width {
    my ($tag, $width) = @_;

    if ($width > 1000) {
        $tag =~ s!width=['"]*\d+['"]*!!;
    }
    return $tag;
}

# Full clean-up of pasted HTML: everything gtextcleanerlite() does, plus
# removal of font/span/div/big/tt/u/s tags, of script, del, colgroup and
# style elements, of the style, cols, rules and frame attributes, of widths
# above 1000 on tables and cells, of "xlNNN" classes and of height
# attributes on table, tr and td.
#
# Takes the text as first argument; returns the cleaned text.
# The statements depend on their order - keep the sequence when editing.
sub gtextcleaner {
    my ($text) = @_;

    # gtextcleanerlite() hides and restores <z> sections on its own, so the
    # sections are hidden here first: what gtextcleanerlite() restores is
    # then this outer placeholder, and the real content stays hidden from
    # the extra passes below until the final gtextcleaner_end().
    $text = gtextcleaner_start($text);
    $text = gtextcleanerlite($text);

    for ($text) {
        # Presentational wrappers and elements removed with their content.
        s/<\/?font.*?>//gi;
        s/<\/?span.*?>//gi;
        s/<\/?div.*?>//gi;
        s/<script.*?>.*?<\/script>//gi;
        s/<del>.*?<\/del>//gi;
        s/<colgroup>.*?<\/colgroup>//gi;
        # Closing tags are matched too: "<col" also matches an opening
        # <colgroup ...> with attributes, and removing only that would leave
        # its </colgroup> behind.
        s/<\/?col.*?>//gi;
        s/<\/?big.*?>//gi;
        # s/<\/?small.*?>//gi; #????
        # s/(<tr.*?>).*?(<t[dh].*?>)/$1$2/gi; #?????
        s/<\/?tt( .*?)?>//gi;
        s/<\/?u( .*?)?>//gi;
        s/<\/?s( .*?)?>//gi;
        s/<style.*?>.*?<\/style>//gi;

        # Exactly two breaks before a rule and one after it; none after a
        # closed list.
        s/( *<br> *)+<hr>/<hr>/gi;
        s/<hr>/<br><br><hr>/gi;
        s/<hr>(?: *<br> *)+/<hr><br>/gi;
        s/<\/ul>(?: *<br> *)+/<\/ul>/gi;
        s/<\/ol>(?: *<br> *)+/<\/ol>/gi;

        #param: attributes removed whatever their value (quoted or bare).
        for my $attr (qw(style cols rules frame)) {
            s/ $attr=((['"]).*?\2|.*?)(?=\s|>)//gi;
        }

        #prod clean wide (>1000) width, gscript table clean clean all
        s/(<table[^>]* width=(['"])(\d+)\2[^>]*)/_gtextcleaner_drop_wide_width($1, $3)/gie;
        s/(<td[^>]* width=(['"])(\d+)\2[^>]*)/_gtextcleaner_drop_wide_width($1, $3)/gie;
        s/ class=(['"])xl\d+\1(?=\s|>)//gi;
        s/(<(?:table|tr|td)[^>]*) height=(['"])\d+\2([^>]*>)/$1$3/gi;    #height in all: table tr td
    }

    $text = gtextcleaner_end($text);
    return $text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment