Created
June 3, 2012 06:43
-
-
Save purwandi/2862265 to your computer and use it in GitHub Desktop.
HTML Cleanup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Convert an HTML fragment or document to plain text.
 *
 * Works in three steps:
 *  1. Elements whose content is never shown to the reader (head, style,
 *     script, object, embed, applet, noframes, noscript, noembed) are
 *     replaced, content included, by a single space.
 *  2. A newline is inserted in front of every opening or closing block-level
 *     tag so that adjacent blocks do not run together once tags are gone.
 *  3. All remaining tags and comments are removed with strip_tags().
 *
 * The patterns are first applied in UTF-8 mode. If PCRE rejects the subject
 * (preg_replace() returns null, e.g. because the input is not valid UTF-8),
 * the same patterns are retried byte-wise; they only contain ASCII literals,
 * so this is safe for any ASCII-compatible encoding and avoids silently
 * returning an empty string for such input.
 *
 * @access public
 * @param  string $text HTML to clean
 * @return string       Plain text with tags removed
 */
function strip_html_tags($text)
{
    // Patterns are kept without the "u" modifier so that both the UTF-8 and
    // the byte-wise variant can be derived from a single list.
    $patterns = array(
        // Remove invisible content
        '@<head[^>]*?>.*?</head>@si',
        '@<style[^>]*?>.*?</style>@si',
        '@<script[^>]*?.*?</script>@si',
        '@<object[^>]*?.*?</object>@si',
        '@<embed[^>]*?.*?</embed>@si',
        '@<applet[^>]*?.*?</applet>@si',
        '@<noframes[^>]*?.*?</noframes>@si',
        '@<noscript[^>]*?.*?</noscript>@si',
        '@<noembed[^>]*?.*?</noembed>@si',
        // Add a line break before block-level tags (opening and closing)
        '@<((br)|(hr))@i',
        '@</?((address)|(blockquote)|(center)|(del))@i',
        '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
        '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
        '@</?((table)|(th)|(td)|(caption))@i',
        '@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
        '@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
        '@</?((frameset)|(frame)|(iframe))@i',
    );

    // One replacement per pattern, in the same order: nine "invisible"
    // elements become a space, eight block groups get a leading newline.
    $replacements = array(
        ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
        "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",
        "\n\$0", "\n\$0",
    );

    $utf8_patterns = array();
    foreach ($patterns as $pattern) {
        $utf8_patterns[] = $pattern . 'u';
    }

    $cleaned = preg_replace($utf8_patterns, $replacements, $text);

    if ($cleaned === null) {
        // PCRE refused the subject in UTF-8 mode; retry on raw bytes.
        $cleaned = preg_replace($patterns, $replacements, $text);
    }

    // Remove all remaining tags and comments and return. The cast keeps
    // strip_tags() from receiving null if both attempts failed.
    return strip_tags((string) $cleaned);
}
/**
 * Clean up HTML that was copied and pasted from Microsoft Word.
 *
 * Processing order:
 *  1. Decode HTML entities, then turn typographic quotes and em dashes into
 *     their plain ASCII equivalents. Decoding first means entity-encoded
 *     characters (e.g. &rsquo;) are normalised as well as literal ones.
 *  2. Remove C-style comments, which Word embeds inside HTML comments.
 *  3. Put a space between "<" and a following digit (note: somewhat
 *     application specific; intended to keep arithmetic such as "<1" from
 *     being treated as a tag).
 *  4. Strip every tag that is not listed in $allowed_tags.
 *  5. Collapse runs of two or more whitespace characters.
 *  6. Drop attributes from bold / italic / underline tags and normalise
 *     strong -> b and em -> i.
 *  7. Remove any HTML comments that are still present.
 *
 * @author Tom <[tom@cowin.us]>
 * @link   http://php.net/manual/en/function.strip-tags.php
 * @param  string $text         HTML to clean
 * @param  string $allowed_tags Tags to keep, in strip_tags() notation
 * @return string               Cleaned HTML
 */
function strip_word_html($text, $allowed_tags = '<b><i><sup><sub><em><strong><u><br><ul><li><ol><em><strong><p>')
{
    // The multibyte regex functions are no longer used here; the call is
    // kept (when mbstring is available) so the global mb regex encoding is
    // still set for callers that may have relied on that side effect.
    if (function_exists('mb_regex_encoding')) {
        mb_regex_encoding('UTF-8');
    }

    // Make sure _all_ html entities are converted to plain characters: in
    // some MS headers some entities are encoded and some are not.
    $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

    // Replace MS special characters: left/right single quote, left/right
    // double quote and em dash, given as their UTF-8 byte sequences.
    $text = str_replace(
        array("\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", "\xE2\x80\x9D", "\xE2\x80\x94"),
        array('\'', '\'', '"', '"', '-'),
        $text
    );

    // Strip C-style comments first since, embedded in html comments, they
    // seem to prevent strip_tags from removing those html comments. The
    // pattern is pure ASCII, so it is applied byte-wise (no "u" modifier).
    $text = preg_replace('#/\*.*?\*/#s', '', $text);

    // '<1' becomes '< 1'
    $text = preg_replace('/<([0-9]+)/', '< $1', $text);

    $text = strip_tags($text, $allowed_tags);

    // Remove runs of two or more whitespace characters at the very start
    // and very end of the text, and squeeze any other such run to one space.
    $text = preg_replace(array('/^\s\s+/', '/\s\s+$/', '/\s\s+/u'), array('', '', ' '), $text);

    // Strip inline css and simplify style tags. The \b after the tag name
    // stops "<br>" from being read as "<b ...>" and "<ul>" as "<u ...>".
    $text = preg_replace(
        array(
            '#<(strong|b)\b[^>]*>(.*?)</(strong|b)>#isu',
            '#<(em|i)\b[^>]*>(.*?)</(em|i)>#isu',
            '#<u\b[^>]*>(.*?)</u>#isu',
        ),
        array('<b>$2</b>', '<i>$2</i>', '<u>$1</u>'),
        $text
    );

    // On some newer MS Word exports (conditionals of the form
    // 'if gte mso 9', etc.) html comments can survive strip_tags. Remove
    // each leftover comment individually; the match is lazy so that text
    // between two separate comments is preserved.
    $text = preg_replace('/<!--.*?-->/su', '', $text);

    return $text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment