Skip to content

Instantly share code, notes, and snippets.

@kevinquillen
Created August 8, 2017 19:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kevinquillen/a9229f2f1b42048e512888fda7f2f9be to your computer and use it in GitHub Desktop.
Save kevinquillen/a9229f2f1b42048e512888fda7f2f9be to your computer and use it in GitHub Desktop.
Example method that cleaned text of HTML and bad characters
/**
 * Reduces an HTML fragment or plain text to a restricted ASCII string.
 *
 * Processing steps:
 * - The input is parsed as HTML; every top-level child of <body> that
 *   contains something other than whitespace / non-breaking spaces is
 *   serialised, and the pieces are concatenated WITHOUT a separator.
 * - Placeholder wrappers (empty <p>/<span>, "_" spans) are dropped, all
 *   remaining tags are stripped and the entities introduced by the
 *   serialiser are decoded back to characters.
 * - Typographic quotes and the ellipsis are mapped to ASCII equivalents.
 * - Every byte outside printable ASCII becomes a space, every character
 *   that is not a letter, a digit or in the range 0x20-0x2E
 *   (space ! " # $ % & ' ( ) * + , - .) is removed, and runs of whitespace
 *   are collapsed to a single space.
 *
 * The result is not trimmed, so it may start or end with a single space.
 *
 * @param string $text
 *   Raw text or HTML. Valid UTF-8 is used as is; anything else is treated
 *   as Windows-1252.
 *
 * @return string
 *   The cleaned ASCII text, or an empty string when the input is empty or
 *   yields no body content.
 */
public function cleanText($text) {
  $text = (string) $text;
  if ($text === '') {
    return '';
  }

  // Without a charset declaration libxml decodes HTML as ISO-8859-1, which
  // turns raw UTF-8 punctuation into mojibake. Normalise to UTF-8 and hand
  // every non-ASCII character to the parser as a numeric entity instead.
  if (!mb_check_encoding($text, 'UTF-8')) {
    $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
  }
  $markup = mb_encode_numericentity($text, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

  // Collect parser warnings silently, then put the global libxml error
  // handling back the way the caller had it and free the collected errors.
  $previous_setting = libxml_use_internal_errors(TRUE);
  $document = new DOMDocument();
  $loaded = $document->loadHTML($markup);
  libxml_clear_errors();
  libxml_use_internal_errors($previous_setting);

  // Input such as a lone comment produces no <body> element at all.
  $body = $loaded ? $document->getElementsByTagName('body')->item(0) : NULL;
  if ($body === NULL) {
    return '';
  }

  $output = '';
  foreach ($body->childNodes as $node) {
    $node->normalize();
    // Skip nodes holding nothing but spaces, tabs, newlines or NBSP.
    if (preg_match('/[^ \t\n\x{A0}]/u', $node->textContent) === 1) {
      $output .= trim($node->C14N());
    }
  }

  // Canonical serialisation emits NBSP as raw UTF-8 bytes (never as
  // "&nbsp;") and closes void elements explicitly, hence these literals.
  $placeholders = array(
    "<p>\xC2\xA0</p>",
    "<span>\xC2\xA0</span>",
    '<span>_</span>',
    '<span> </span>',
    '</img>',
    '<p> </p>',
  );
  $output = str_replace($placeholders, '', $output);

  $output = strip_tags($output);
  // C14N() escaped &, < and > in text; turn them back into characters so
  // that fragments like "&amp" do not leak into the result.
  $output = html_entity_decode($output, ENT_QUOTES, 'UTF-8');

  // Replace whole UTF-8 characters. Single-byte replacements would hit
  // continuation bytes that belong to unrelated characters.
  $typography = array(
    "\xE2\x80\x9A" => "'",   // U+201A baseline single quote.
    "\xE2\x80\x9E" => '"',   // U+201E baseline double quote.
    "\xE2\x80\xA6" => '...', // U+2026 ellipsis.
    "\xE2\x80\x98" => "'",   // U+2018 left single quote.
    "\xE2\x80\x99" => "'",   // U+2019 right single quote.
    "\xE2\x80\x9C" => '"',   // U+201C left double quote.
    "\xE2\x80\x9D" => '"',   // U+201D right double quote.
  );
  $output = strtr($output, $typography);

  // Control characters and every non-ASCII byte become a space.
  $output = preg_replace('/[\x00-\x1F\x7F-\xFF]/', ' ', $output);
  // Keep letters, digits and the range 0x20-0x2E; this is the set the
  // historical pattern "[^\da-z -.,]" selected through its " -." range.
  $output = preg_replace('/[^\da-z\x20-\x2E]/i', '', $output);
  // Only plain spaces can be left at this point; collapse runs of them.
  $output = preg_replace('/\s{2,}/', ' ', $output);

  return $output;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment