DrDub/detag.php

## detag.php
<?php

// This code Copyright (C) 2023 Textualization Software Ltd. is dual
// licensed PHP and LGPLv2.1 and it comes with NO WARRANTIES.

// v1.0

/**
  How to use it to obtain text for fine-tuning transformer models
  ---------------------------------------------------------------

  Assuming a crawl/ folder with a full crawl of websites of interest, with rendered DOM documents with extension .html

  php detag.php > html_php.txt 2> html_php.log

  Assuming a Linux machine with the following programs installed:

  * pandoc
  * pdftotext
  * antiword

  find crawl -follow -name \*.doc -exec antiword \{} \; > doc_antiword.txt  2> doc_antiword.log
  find crawl -follow -name \*.docx -exec pandoc --to plain \{} \; > docx_pandoc.txt  2> docx_pandoc.log
  find crawl -follow -name \*.pdf -exec pdftotext -nopgbrk \{} - \; > pdf_pdftotext.txt  2> pdf_pdftotext.log

  Then merge all the files together:

  cat html_php.txt doc_antiword.txt docx_pandoc.txt pdf_pdftotext.txt > all.txt
*/

/**
 * Recursively collects the path names of all files whose extension is
 * "html" below the given folder.
 *
 * The "." and ".." entries are skipped and sub-directories are descended
 * into. Paths appear in the order the directory iterator yields them.
 *
 * @param string $folder directory to scan
 * @return array list of path names of the .html files found
 */
function getHtmls(string $folder): array {
    $found = [];
    $entries = new DirectoryIterator($folder);
    foreach ($entries as $entry) {
        if ($entry->isDot()) {
            continue;
        }
        if ($entry->isDir()) {
            // Splice in everything found below the sub-directory.
            $found = array_merge($found, getHtmls($entry->getPathname()));
        } elseif ($entry->getExtension() == 'html') {
            $found[] = $entry->getPathname();
        }
    }
    return $found;
}

// Every .html document found below the crawl/ folder.
$htmls = getHtmls("crawl/");

// Keep libxml parse errors in its internal buffer instead of reporting them.
libxml_use_internal_errors(true);

// The lookup tables below are sets: array_flip() turns each list into an
// array keyed by its values, so membership is tested with array_key_exists().

// Inline elements, from
// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
$HTML5_INLINE = array_flip([
    "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br",
    "button", "canvas", "cite", "code", "data", "del", "dfn", "em", "embed",
    "i", "iframe", "img", "input", "ins", "kbd", "label", "map", "mark",
    "meter", "noscript", "object", "output", "picture", "progress", "q", "ruby", "s",
    "samp", "script", "select", "slot", "small", "span", "strong", "sub", "sup",
    "svg", "template", "textarea", "time", "u", "tt", "var", "video", "wbr",
]);

// Elements whose whole subtree contributes no text.
$PRUNE_TAGS = array_flip([
    'head',
    'script',
    'style',
]);

// Characters that already end a sentence, so no period is appended after them.
$END_CHARS = array_flip([
    '.',
    ':',
    '?',
    '!',
]);

// add a period at the end of "visible elements" like a paragraph, a div or a title.
/**
 * Extracts the text below a DOM node, one "visible element" per line.
 *
 * Text nodes and the elements listed in $HTML5_INLINE are concatenated in
 * place. Every other child is treated as a visible element: its text is
 * followed by a newline and, unless it already ends in one of $END_CHARS,
 * by a period. A node whose name is in $PRUNE_TAGS produces no text.
 *
 * Reads the globals $PRUNE_TAGS, $HTML5_INLINE and $END_CHARS, which are
 * arrays keyed by tag name / character.
 *
 * @param DOMNode $node root of the subtree to convert
 * @return string the extracted text
 */
function recurse(DOMNode $node) : string {
    global $PRUNE_TAGS;
    global $HTML5_INLINE;
    global $END_CHARS;

    if ($node->nodeType == XML_TEXT_NODE)
        return $node->nodeValue;
    if ($node->nodeName && array_key_exists($node->nodeName, $PRUNE_TAGS))
        return "";

    $result = "";
    foreach ($node->childNodes as $childNode) {
        $rec = recurse($childNode);

        if ($childNode->nodeType == XML_TEXT_NODE) {
            // Text is inline content. Its node name is "#text", which is not
            // in $HTML5_INLINE, so without this branch every text fragment
            // would be terminated with its own period and newline.
            if (trim($rec) !== "") {
                $result .= $rec;
            } elseif ($result !== "" && rtrim($result) === $result) {
                // Whitespace-only text separates two inline neighbours: keep
                // a single space, but never after existing whitespace (for
                // example the newline that follows a block element).
                $result .= " ";
            }
            continue;
        }

        if (array_key_exists($childNode->nodeName, $HTML5_INLINE)) {
            $result .= $rec;
            continue;
        }

        // Block-level child: one line of output, terminated by a period when
        // it does not already end a sentence. Empty blocks are dropped.
        $trimmed = trim($rec);
        $len = strlen($trimmed);
        if ($len) {
            if (! array_key_exists($trimmed[$len-1], $END_CHARS)) {
                // Put the period right after the last visible character
                // rather than after any trailing whitespace.
                $rec = rtrim($rec) . ".";
            }
            $result .= "$rec\n";
        }
    }
    return $result;
}

// Convert every crawled document to plain text on standard output.
foreach ($htmls as $html) {
    $content = file_get_contents($html);

    // Skip files that could not be read (false) or that are empty.
    if (empty($content))
        continue;

    // Turn every non-ASCII character into a numeric entity so that loadHTML()
    // cannot misread UTF-8 bytes when the document does not declare its
    // charset. This replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
    // which is deprecated since PHP 8.2.
    $content = mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    $doc = new DOMDocument('1.0', 'utf-8');
    $doc->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    // Parse errors are buffered by libxml; drop them after each document so
    // the buffer does not keep growing over the whole crawl.
    libxml_clear_errors();

    print(recurse($doc));
}
	<?php

	// This code Copyright (C) 2023 Textualization Software Ltd. is dual
	// licensed PHP and LGPLv2.1 and it comes with NO WARRANTIES.

	// v1.0

	/**
	How to use it to obtain text for fine-tuning transformer models
	---------------------------------------------------------------

	Assuming a crawl/ folder with a full crawl of websites of interest, with rendered DOM documents with extension .html

	php detag.php > html_php.txt 2> html_php.log

	Assuming a Linux machine with the following programs installed:

	* pandoc
	* pdftotext
	* antiword

	find crawl -follow -name \*.doc -exec antiword \{} \; > doc_antiword.txt 2> doc_antiword.log
	find crawl -follow -name \*.docx -exec pandoc --to plain \{} \; > docx_pandoc.txt 2> docx_pandoc.log
	find crawl -follow -name \*.pdf -exec pdftotext -nopgbrk \{} - \; > pdf_pdftotext.txt 2> pdf_pdftotext.log

	Then merge all the files together:

	cat html_php.txt doc_antiword.txt docx_pandoc.txt pdf_pdftotext.txt > all.txt
	*/

	/**
	 * Recursively collects the path names of all files whose extension is
	 * "html" below the given folder.
	 *
	 * The "." and ".." entries are skipped and sub-directories are descended
	 * into. Paths appear in the order the directory iterator yields them.
	 *
	 * @param string $folder directory to scan
	 * @return array list of path names of the .html files found
	 */
	function getHtmls(string $folder): array {
	    $found = [];
	    $entries = new DirectoryIterator($folder);
	    foreach ($entries as $entry) {
	        if ($entry->isDot()) {
	            continue;
	        }
	        if ($entry->isDir()) {
	            // Splice in everything found below the sub-directory.
	            $found = array_merge($found, getHtmls($entry->getPathname()));
	        } elseif ($entry->getExtension() == 'html') {
	            $found[] = $entry->getPathname();
	        }
	    }
	    return $found;
	}

	// Every .html document found below the crawl/ folder.
	$htmls = getHtmls("crawl/");

	// Keep libxml parse errors in its internal buffer instead of reporting them.
	libxml_use_internal_errors(true);

	// The lookup tables below are sets: array_flip() turns each list into an
	// array keyed by its values, so membership is tested with array_key_exists().

	// Inline elements, from
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
	$HTML5_INLINE = array_flip([
	    "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br",
	    "button", "canvas", "cite", "code", "data", "del", "dfn", "em", "embed",
	    "i", "iframe", "img", "input", "ins", "kbd", "label", "map", "mark",
	    "meter", "noscript", "object", "output", "picture", "progress", "q", "ruby", "s",
	    "samp", "script", "select", "slot", "small", "span", "strong", "sub", "sup",
	    "svg", "template", "textarea", "time", "u", "tt", "var", "video", "wbr",
	]);

	// Elements whose whole subtree contributes no text.
	$PRUNE_TAGS = array_flip([
	    'head',
	    'script',
	    'style',
	]);

	// Characters that already end a sentence, so no period is appended after them.
	$END_CHARS = array_flip([
	    '.',
	    ':',
	    '?',
	    '!',
	]);

	// add a period at the end of "visible elements" like a paragraph, a div or a title.
	/**
	 * Extracts the text below a DOM node, one "visible element" per line.
	 *
	 * Text nodes and the elements listed in $HTML5_INLINE are concatenated in
	 * place. Every other child is treated as a visible element: its text is
	 * followed by a newline and, unless it already ends in one of $END_CHARS,
	 * by a period. A node whose name is in $PRUNE_TAGS produces no text.
	 *
	 * Reads the globals $PRUNE_TAGS, $HTML5_INLINE and $END_CHARS, which are
	 * arrays keyed by tag name / character.
	 *
	 * @param DOMNode $node root of the subtree to convert
	 * @return string the extracted text
	 */
	function recurse(DOMNode $node) : string {
	    global $PRUNE_TAGS;
	    global $HTML5_INLINE;
	    global $END_CHARS;

	    if ($node->nodeType == XML_TEXT_NODE)
	        return $node->nodeValue;
	    if ($node->nodeName && array_key_exists($node->nodeName, $PRUNE_TAGS))
	        return "";

	    $result = "";
	    foreach ($node->childNodes as $childNode) {
	        $rec = recurse($childNode);

	        if ($childNode->nodeType == XML_TEXT_NODE) {
	            // Text is inline content. Its node name is "#text", which is not
	            // in $HTML5_INLINE, so without this branch every text fragment
	            // would be terminated with its own period and newline.
	            if (trim($rec) !== "") {
	                $result .= $rec;
	            } elseif ($result !== "" && rtrim($result) === $result) {
	                // Whitespace-only text separates two inline neighbours: keep
	                // a single space, but never after existing whitespace (for
	                // example the newline that follows a block element).
	                $result .= " ";
	            }
	            continue;
	        }

	        if (array_key_exists($childNode->nodeName, $HTML5_INLINE)) {
	            $result .= $rec;
	            continue;
	        }

	        // Block-level child: one line of output, terminated by a period when
	        // it does not already end a sentence. Empty blocks are dropped.
	        $trimmed = trim($rec);
	        $len = strlen($trimmed);
	        if ($len) {
	            if (! array_key_exists($trimmed[$len-1], $END_CHARS)) {
	                // Put the period right after the last visible character
	                // rather than after any trailing whitespace.
	                $rec = rtrim($rec) . ".";
	            }
	            $result .= "$rec\n";
	        }
	    }
	    return $result;
	}

	// Convert every crawled document to plain text on standard output.
	foreach ($htmls as $html) {
	    $content = file_get_contents($html);

	    // Skip files that could not be read (false) or that are empty.
	    if (empty($content))
	        continue;

	    // Turn every non-ASCII character into a numeric entity so that loadHTML()
	    // cannot misread UTF-8 bytes when the document does not declare its
	    // charset. This replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
	    // which is deprecated since PHP 8.2.
	    $content = mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
	    $doc = new DOMDocument('1.0', 'utf-8');
	    // The flags are combined with a plain bitwise OR; a backslash before
	    // the operator is a parse error.
	    $doc->loadHTML($content, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
	    // Parse errors are buffered by libxml; drop them after each document so
	    // the buffer does not keep growing over the whole crawl.
	    libxml_clear_errors();

	    print(recurse($doc));
	}