Skip to content

Instantly share code, notes, and snippets.

@HelgeSverre
Created December 3, 2023 20:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HelgeSverre/a0b294991d69120a9411f9b5ddaa76b9 to your computer and use it in GitHub Desktop.
Save HelgeSverre/a0b294991d69120a9411f9b5ddaa76b9 to your computer and use it in GitHub Desktop.
Experimental code to compress HTML without losing its "structure", intended for an AI-assisted web-scraping use case where it reduces the number of tokens required.
<?php
namespace App;
use Illuminate\Support\Str;
use Symfony\Component\DomCrawler\Crawler;
/**
 * Shrinks an HTML document while keeping its element structure.
 *
 * The compressor parses the markup with Symfony's DomCrawler, drops configured
 * elements and attributes (plus every `aria-*` / `data-*` attribute), optionally
 * drops comments and empty `div` / `p` elements, and finally collapses whitespace
 * in the serialised `<body>` contents.
 */
class HtmlCompressor
{
    /** Whether comment nodes are stripped from the document. */
    protected bool $removeComments = true;

    /** Whether `div` / `p` elements without any text are stripped. */
    protected bool $removeEmptyElements = true;

    /**
     * CSS selectors of elements that are removed together with their children.
     *
     * @var array<int, string>
     */
    protected array $elementsToRemove = [
        // Elements
        'script', 'style', 'link', 'head', 'noscript', 'template', 'footer', 'svg', 'br', 'hr',
        // Fontawesome icons
        'i.fa', 'i.fas', 'i.far', 'i.fal',
    ];

    /**
     * Attribute names that are removed from every element.
     *
     * @var array<int, string>
     */
    protected array $attributesToRemove = [
        'alt',
        'for',
        'href',
        'onclick',
        'onerror',
        'onsubmit',
        'placeholder',
        'role',
        'src',
        'style',
        'tabindex',
    ];

    /**
     * Each argument overrides the matching default when it is not null.
     *
     * @param array<int, string>|null $elementsToRemove    CSS selectors of elements to drop.
     * @param array<int, string>|null $attributesToRemove  Attribute names to drop.
     * @param bool|null               $removeComments      Toggle comment removal.
     * @param bool|null               $removeEmptyElements Toggle removal of text-less `div` / `p` elements.
     */
    public function __construct(
        ?array $elementsToRemove = null,
        ?array $attributesToRemove = null,
        ?bool $removeComments = null,
        ?bool $removeEmptyElements = null,
    ) {
        $this->elementsToRemove = $elementsToRemove ?? $this->elementsToRemove;
        $this->attributesToRemove = $attributesToRemove ?? $this->attributesToRemove;
        $this->removeComments = $removeComments ?? $this->removeComments;
        $this->removeEmptyElements = $removeEmptyElements ?? $this->removeEmptyElements;
    }

    /**
     * Compresses the given HTML using the default configuration.
     */
    public static function compress(string $html): string
    {
        return (new self())->compressHtml($html);
    }

    /**
     * Compresses the HTML and returns only its text content.
     *
     * Returns an empty string when nothing is left after compression.
     */
    public static function textOnly(string $html): string
    {
        $raw = self::compress($html);

        if ($raw === '') {
            return '';
        }

        // Re-insert a space between adjacent tags so that the text of
        // neighbouring elements does not get glued together.
        $dom = new Crawler(Str::of($raw)->replace('><', '> <')->toString());

        // text() throws on an empty node list, so bail out first.
        if ($dom->count() === 0) {
            return '';
        }

        return Str::of($dom->text())->squish()->trim()->toString();
    }

    /**
     * Compresses the given HTML using this instance's configuration.
     *
     * Only the inner HTML of `<body>` is returned. An empty string is returned
     * when the parsed document has no `<body>` (for example for empty input).
     */
    public function compressHtml(string $html): string
    {
        $crawler = new Crawler($html);

        // Remove elements we don't need
        foreach ($this->elementsToRemove as $selector) {
            $crawler->filter($selector)->each(function (Crawler $node): void {
                $this->removeNode($node);
            });
        }

        if ($this->removeComments) {
            $crawler->filterXPath('//comment()')->each(function (Crawler $node): void {
                $this->removeNode($node);
            });
        }

        if ($this->removeEmptyElements) {
            // Note: "empty" means "has no text", so a div/p that only wraps
            // text-less children (e.g. an <img>) is removed as well.
            $crawler->filter('*')->each(function (Crawler $node): void {
                if (in_array($node->nodeName(), ['div', 'p'], true) && $node->text() === '') {
                    $this->removeNode($node);
                }
            });
        }

        // Remove attributes
        $crawler->filter('*')->each(function (Crawler $node): void {
            $element = $node->getNode(0);

            if (! $element instanceof \DOMElement) {
                return;
            }

            // Need to convert the iterator to an array,
            // because we are mutating the attributes inside the loop.
            foreach (iterator_to_array($element->attributes) as $attribute) {
                $name = $attribute->nodeName;

                // Configured attributes plus all aria-* and data-* attributes
                if (
                    in_array($name, $this->attributesToRemove, true)
                    || Str::startsWith($name, ['aria-', 'data-'])
                ) {
                    $element->removeAttribute($name);
                }
            }
        });

        $body = $crawler->filter('body');

        // html() throws on an empty node list; treat "no body" as "no content".
        if ($body->count() === 0) {
            return '';
        }

        return Str::of($body->html())
            ->squish()
            ->replace('> <', '><') // Remove whitespace between closing and opening tags "</div> <div>"
            ->replace(' </', '</') // Remove whitespace before next 'whatever </tag>'
            ->replace(' ">', '">')
            ->replace(' " ', '"')
            ->replace('> ', '>')
            ->replace(' <', '<')
            ->remove('<span>×</span>')
            ->trim()
            ->toString(); // The declared return type is string, not Stringable
    }

    /**
     * Collects the distinct, non-empty `class` attribute values found in the HTML.
     *
     * Each entry is a complete attribute value as written in the markup; values
     * are not split into individual class names.
     *
     * @return \Illuminate\Support\Collection<int, string>
     */
    public static function classes(bool|string $html)
    {
        $crawler = new Crawler($html);

        $classes = $crawler->filter('*')->each(
            static fn (Crawler $node) => $node->attr('class')
        );

        return collect($classes)->filter()->unique()->values();
    }

    /**
     * Detaches the first DOM node held by the given crawler from its parent.
     *
     * Does nothing when the crawler is empty or the node has no parent.
     */
    private function removeNode(Crawler $node): void
    {
        $domNode = $node->getNode(0);

        $domNode?->parentNode?->removeChild($domNode);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment