-
-
Save kevinquillen/a9229f2f1b42048e512888fda7f2f9be to your computer and use it in GitHub Desktop.
Example method that cleaned text of HTML and bad characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Utility function - clean incoming text and strip it of HTML and bad characters.
 *
 * The text is parsed as an HTML fragment, top-level nodes without visible text
 * are dropped, typographic punctuation is folded to ASCII, markup is removed
 * and the result is reduced to ASCII letters, digits, spaces and the
 * punctuation between 0x21 and 0x2E (! " # $ % & ' ( ) * + , - .). Any other
 * character is removed or turned into a space, and runs of spaces are
 * collapsed into one. The result is not trimmed.
 *
 * NOTE(review): the serialisations of adjacent top-level nodes are joined
 * without a separator, so "<p>a</p><p>b</p>" yields "ab". This is kept as-is
 * for backward compatibility - confirm whether callers rely on it.
 *
 * @param string $text
 *   Raw text or HTML fragment. Expected to be UTF-8; input that is not valid
 *   UTF-8 is assumed to be Windows-1252.
 *
 * @return string
 *   The cleaned plain-ASCII text, or an empty string when there is nothing
 *   to clean.
 */
public function cleanText($text) {
  $text = (string) $text;
  if ($text === '') {
    return '';
  }

  // DOMDocument::loadHTML() does not reliably detect UTF-8 on its own, so
  // normalise the input to UTF-8 and hand every non-ASCII character to the
  // parser as a numeric character reference.
  if (!mb_check_encoding($text, 'UTF-8')) {
    $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
  }
  $markup = mb_encode_numericentity($text, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

  // Collect parser complaints silently, then discard them and restore the
  // caller's libxml error mode so this call has no process-wide side effect.
  $previous_mode = libxml_use_internal_errors(TRUE);
  $document = new DOMDocument();
  $loaded = $document->loadHTML($markup);
  libxml_clear_errors();
  libxml_use_internal_errors($previous_mode);

  $body = $loaded ? $document->getElementsByTagName('body')->item(0) : NULL;
  if ($body === NULL) {
    // Nothing parseable (for example whitespace-only input).
    return '';
  }

  // Bytes regarded as blank when deciding whether a node carries any text:
  // ASCII whitespace plus the two bytes of a UTF-8 non-breaking space.
  $blank = " \t\n\r\xC2\xA0";
  $output = '';
  foreach ($body->childNodes as $node) {
    $node->normalize();
    if (trim($node->textContent, $blank) !== '') {
      $output .= trim($node->C14N());
    }
  }

  // Drop empty wrappers and the closing tag C14N emits for void <img>
  // elements. Both the regular and the non-breaking space variants are listed
  // (the original repeated these literals, presumably with differing
  // invisible characters). Repeat until stable so that a wrapper emptied by
  // one removal is dropped as well.
  $empty_markup = array(
    '<p> </p>',
    "<p>\xC2\xA0</p>",
    '<span> </span>',
    "<span>\xC2\xA0</span>",
    '<span>_</span>',
    '</img>',
  );
  do {
    $before = $output;
    $output = str_replace($empty_markup, '', $output);
  } while ($output !== $before);

  // Fold typographic punctuation to ASCII. C14N output is always UTF-8, so
  // complete multi-byte sequences are matched rather than single bytes.
  $punctuation = array(
    // U+201A baseline single quote.
    "\xE2\x80\x9A" => "'",
    // U+201E baseline double quote.
    "\xE2\x80\x9E" => '"',
    // U+2026 ellipsis.
    "\xE2\x80\xA6" => '...',
    // U+2018 left single quote.
    "\xE2\x80\x98" => "'",
    // U+2019 right single quote.
    "\xE2\x80\x99" => "'",
    // U+201C left double quote.
    "\xE2\x80\x9C" => '"',
    // U+201D right double quote.
    "\xE2\x80\x9D" => '"',
    // C1 controls U+0082, U+0084 and U+0085: what numeric references to the
    // Windows-1252 code points (such as &#130;) become if the parser does not
    // remap them.
    "\xC2\x82" => "'",
    "\xC2\x84" => '"',
    "\xC2\x85" => '...',
  );
  $output = strtr($output, $punctuation);

  // Remove the markup first, then decode the entities C14N escaped (&amp;,
  // &lt;, &gt;, &#xD;) so they do not survive as residue such as "&amp".
  $output = strip_tags($output);
  $output = html_entity_decode($output, ENT_QUOTES, 'UTF-8');

  // Control characters and every non-ASCII byte become a space.
  $output = preg_replace('/[\x00-\x1F\x7F-\xFF]/', ' ', $output);
  // Keep letters, digits and the ASCII range 0x20-0x2E (space through "."),
  // which includes the quotes produced above; drop everything else.
  $output = preg_replace('/[^\da-z\x20-\x2E]/i', '', $output);
  // Only spaces are left as whitespace at this point; collapse runs of them.
  $output = preg_replace('/ {2,}/', ' ', $output);

  return $output;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment