Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
[PHP] html2text.php - takes HTML and removes/purges the tags so that the result is sensible and readable as plain text.
<?php
// Runtime configuration: timezone, multibyte defaults and locale.
date_default_timezone_set('Asia/Jerusalem');
mb_language('uni');
mb_internal_encoding('UTF-8');
setlocale(LC_ALL, 'en_US.UTF-8');

// Response headers. The character set is declared through Content-Type only.
// "Content-Encoding: UTF-8" is intentionally not sent: Content-Encoding names a
// compression coding (gzip, br, ...), and an unknown coding can make clients
// refuse to decode the body. "Charset" and "Viewport" are not HTTP response
// headers (viewport is an HTML <meta> setting), so they are not sent either.
header('Content-Type: text/plain; charset=UTF-8');
header('Content-Language: en');
header('Access-Control-Allow-Origin: *');
header('X-UA-Compatible: IE=edge,chrome=1');

// Turn off output buffering so text is streamed as it is echoed. This runs
// after the header() calls so that flushing already-buffered output cannot
// send the headers prematurely. Stop if a buffer refuses to be removed,
// otherwise the loop would never terminate.
while (ob_get_level() > 0) {
    if (!ob_end_flush()) {
        break;
    }
}
/**
 * Reduce an HTML document to one line of bare ASCII words.
 *
 * Steps, in order:
 *  1. drop comments and elements whose content is not readable text
 *     (head, script, iframe, noscript, style, meta, link);
 *  2. strip the remaining tags;
 *  3. decode character references, so that e.g. "&nbsp;" becomes a space
 *     instead of leaving the word "nbsp" behind once "&" and ";" are removed;
 *  4. drop every byte above 127 (non-ASCII text is discarded);
 *  5. remove digits and the punctuation ( ) [ ] , . - _ ! & ; : #;
 *  6. collapse every whitespace run (CR and LF included) to a single space;
 *  7. drop remaining control characters below 32.
 *
 * The result is plain text, so it is deliberately NOT HTML-escaped; quotes and
 * angle brackets that survive are returned as-is.
 *
 * @param string $html Raw HTML markup.
 * @return string The purged text.
 * @throws RuntimeException When PCRE fails (for example on a backtrack limit).
 */
function html2text($html)
{
    // strip_tags() would keep the *content* of these elements, so they are
    // removed together with their content first.
    $dropWithContent = [
        "#<!--(.*?)-->#is",
        "#<head(.*?)>(.*?)</head>#is",
        "#<script(.*?)>(.*?)</script>#is",
        "#<iframe(.*?)>(.*?)</iframe>#is",
        "#<noscript(.*?)>(.*?)</noscript>#is",
        "#<style(.*?)>(.*?)</style>#is",
        "#<meta(.*?)>#is",
        "#<link(.*?)>#is",
    ];
    $text = preg_replace($dropWithContent, '', (string) $html);
    if ($text === null) {
        throw new RuntimeException('PCRE error ' . preg_last_error() . ' while removing elements');
    }

    $text = strip_tags($text);

    // Decode after strip_tags() so that escaped markup such as "&lt;b&gt;" is
    // kept as literal text rather than being treated as a tag.
    $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    // A non-breaking space separates words; turn it into a plain space before
    // the high-byte strip below would delete it and glue the words together.
    $text = str_replace("\xC2\xA0", ' ', $text);

    // Strip bytes > 127 before collapsing whitespace so that a removed
    // character cannot leave a double space behind.
    $text = filter_var($text, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_HIGH);

    $purge = [
        '/\d+/'                   => '',
        '/[()\[\],.\-_!&;:#]+/'   => '',
        '/\s+/'                   => ' ',
    ];
    $text = preg_replace(array_keys($purge), array_values($purge), $text);
    if ($text === null) {
        throw new RuntimeException('PCRE error ' . preg_last_error() . ' while purging text');
    }

    // FILTER_UNSAFE_RAW strips without encoding anything and is not deprecated
    // (FILTER_SANITIZE_STRING is, as of PHP 8.1).
    return filter_var($text, FILTER_UNSAFE_RAW, FILTER_FLAG_STRIP_LOW);
}

$sourcePath = './demo.html.txt';
try {
    $raw = is_readable($sourcePath) ? file_get_contents($sourcePath) : false;
    if ($raw === false) {
        throw new RuntimeException('cannot read ' . $sourcePath);
    }
    $html = html2text($raw);
    echo $html;
} catch (RuntimeException $e) {
    if (!headers_sent()) {
        http_response_code(500);
    }
    echo 'html2text: ', $e->getMessage(), "\n";
    // Keep $html a string for any code that runs after this point.
    $html = '';
}
/*
 * Formatting-aware HTML-to-text table: PCRE pattern => replacement, applied
 * from top to bottom.
 *
 * Nothing in this script applies the table at present; it is kept as a
 * ready-made alternative that preserves some layout (paragraphs, lists,
 * tables, emphasis). It targets raw HTML and is meant to be run as
 *     preg_replace(array_keys($patterns1), array_values($patterns1), $rawHtml)
 * with strip_tags() applied to the result.
 *
 * Notes for maintainers:
 *  - Patterns are array keys, so each must be spelled uniquely. A repeated key
 *    keeps the position of its first occurrence, which is why the final
 *    "collapse spaces" rule is written differently from the first one.
 *  - Tag names end in \b so that e.g. "<i" does not match "<img>", "<p" does
 *    not match "<pre>" and "<li" does not match "<link>".
 *  - The rules that were commented out here (upper-casing of h1-h6, <b>,
 *    <strong>, <th>, and the link list) relied on the /e modifier, which PHP 7
 *    removed; they would have to be rewritten with preg_replace_callback().
 */
$patterns1 = [
    "/\r/"                                          => '',        // Carriage returns
    "/[\n\t]+/"                                     => ' ',       // Newlines and tabs
    '/[ ]{2,}/'                                     => ' ',       // Runs of spaces, pre-handling
    '/<script\b[^>]*>.*?<\/script>/i'               => '',        // <script> with its content
    '/<style\b[^>]*>.*?<\/style>/i'                 => '',        // <style> with its content
    '/<!--.*?-->/s'                                 => '',        // Comments, one at a time (non-greedy)
    '/<p\b[^>]*>/i'                                 => "\n\n\t",  // <p>
    '/<br\b[^>]*>/i'                                => "\n",      // <br>
    '/<i\b[^>]*>(.*?)<\/i>/i'                       => '_\\1_',   // <i>
    '/<em\b[^>]*>(.*?)<\/em>/i'                     => '_\\1_',   // <em>
    '/(<ul\b[^>]*>|<\/ul>)/i'                       => "\n\n",    // <ul> and </ul>
    '/(<ol\b[^>]*>|<\/ol>)/i'                       => "\n\n",    // <ol> and </ol>
    '/<li\b[^>]*>(.*?)<\/li>/i'                     => "\t* \\1\n", // <li>...</li>
    '/<li\b[^>]*>/i'                                => "\n\t* ",  // unclosed <li>
    '/<hr\b[^>]*>/i'                                => "\n-------------------------\n", // <hr>
    '/(<table\b[^>]*>|<\/table>)/i'                 => "\n\n",    // <table> and </table>
    '/(<tr\b[^>]*>|<\/tr>)/i'                       => "\n",      // <tr> and </tr>
    '/<td\b[^>]*>(.*?)<\/td>/i'                     => "\t\t\\1\n", // <td>...</td>
    '/&(nbsp|#160);/i'                              => ' ',       // Non-breaking space
    '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i' => '"',       // Double quotes
    '/&(apos|rsquo|lsquo|#8216|#8217);/i'           => "'",       // Single quotes
    '/&gt;/i'                                       => '>',       // Greater-than
    '/&lt;/i'                                       => '<',       // Less-than
    '/&(amp|#38);/i'                                => '&',       // Ampersand
    '/&(copy|#169);/i'                              => '(c)',     // Copyright
    '/&(trade|#8482|#153);/i'                       => '(tm)',    // Trademark
    '/&(reg|#174);/i'                               => '(R)',     // Registered
    '/&(mdash|#151|#8212);/i'                       => '--',      // mdash
    '/&(ndash|minus|#8211|#8722);/i'                => '-',       // ndash
    '/&(bull|#149|#8226);/i'                        => '*',       // Bullet
    '/&(pound|#163);/i'                             => '',        // Pound sign (dropped)
    '/&(euro|#8364);/i'                             => 'EUR',     // Euro sign
    '/&[^&;]+;/i'                                   => '',        // Unknown/unhandled entities
    '/ {2,}/'                                       => ' ',       // Runs of spaces, post-handling (distinct key on purpose)
];
// The purged text has already been echoed once by the code that produced it;
// it is not echoed a second time here.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.