Skip to content

Instantly share code, notes, and snippets.

Last active May 4, 2023 17:18
Show Gist options
  • Save vielhuber/51e7b319fea9ceac1bd0f5638bae6ced to your computer and use it in GitHub Desktop.
Save vielhuber/51e7b319fea9ceac1bd0f5638bae6ced to your computer and use it in GitHub Desktop.
domdocument domxpath xpath #php


$DOMDocument = new \DOMDocument();
$DOMXPath = new \DOMXPath($DOMDocument);

load html file


load html file (with or without header)

// if the html source doesn't contain a valid utf8 header, domdocument interprets it as iso-8859-1
// we circumvent this by converting all non-ascii characters to html entities before loading
// warning: if you don't add a doctype/html tag, domdocument adds that information for you
// also, if only a text node is provided, it is wrapped in a p-tag
// we also add <meta http-equiv="content-type" content="text/html;charset=utf-8" /> to get proper encoding (see below)
// read the template; file_get_contents() returns false on failure, so stop early instead of parsing garbage
$html = file_get_contents('tpl.html');
if ($html === false) { throw new \RuntimeException('could not read tpl.html'); }
// turn every non-ascii character into a numeric entity so libxml cannot misinterpret the encoding
// (mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8') is deprecated since php 8.2)
$html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
// if there is no <html> tag we add our own wrapper and mark it, so it can be stripped again after saving
$has_wrapper = strpos($html, '<html') !== false;
if ($has_wrapper === false) { $html = '<!DOCTYPE html><html data-please-remove-wrapper><body>' . $html . '</body></html>'; }
// inject the charset meta tag between marker comments, so it can be removed again after saving
if (mb_strpos($html, '</head>') !== false) { $html = str_replace('</head>', '<!--remove--><meta http-equiv="Content-type" content="text/html; charset=utf-8" /><!--/remove--></head>', $html); }
elseif (mb_strpos($html, '<body') !== false) { $html = str_replace('<body', '<!--remove--><head><meta http-equiv="content-type" content="text/html;charset=utf-8" /></head><!--/remove--><body', $html); }
else { $html = '<!--remove--><head><meta http-equiv="content-type" content="text/html;charset=utf-8" /></head><!--/remove-->' . $html; }
// actually load the prepared string; collect libxml warnings (e.g. for html5 tags) instead of emitting them
$previous_libxml_setting = libxml_use_internal_errors(true);
$DOMDocument->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);
// NOTE(review): the xpath object is re-created here on the assumption that one created before loadHTML() may not see the loaded content; confirm
$DOMXPath = new \DOMXPath($DOMDocument);

get back html from domdocument

// domdocument does not close empty li tags (because they're valid html)
// to circumvent that, assign an empty node value to every element without children:
$nodes = $DOMXPath->query('/html/body//*[not(node())]');
foreach ($nodes as $nodes__value) { $nodes__value->nodeValue = ''; }
$html = $DOMDocument->saveHTML();
// domdocument converts all umlauts to html entities; one way to revert that would be
// $html = html_entity_decode($html);
// this method is bad when we intentionally use encoded code, e.g. in <pre> tags; another option to prevent html entities (and leave everything intact)
// is to add <meta http-equiv="content-type" content="text/html;charset=utf-8" /> before loading
// warning: this still encodes < to &lt; because a bare < is invalid html!
// undo above changes: cut out everything from the opening marker comment up to and including the closing one
$remove_start = mb_strpos($html, '<!--remove-->');
$remove_end = mb_strpos($html, '<!--/remove-->');
if ($remove_start !== false && $remove_end !== false) {
    $html = mb_substr($html, 0, $remove_start) . mb_substr($html, $remove_end + mb_strlen('<!--/remove-->'));
}
// if we previously added a default wrapper, keep only what is inside <body>
if (mb_stripos($html, 'data-please-remove-wrapper') !== false) {
    $pos1 = mb_strpos($html, '<body>') + mb_strlen('<body>');
    $pos2 = mb_strpos($html, '</body>');
    $html = mb_substr($html, $pos1, $pos2 - $pos1);
}

query nodes

// query() returns a list of all matching nodes that can be iterated directly
$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
foreach ($nodes as $nodes__value) {
    /* ... */
}

check length of query

$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
if( $nodes->length > 0 ) {}
if( count($nodes) > 0 ) {}

get first item

$nodes = $DOMXPath->query('/html/body//*[@id="foo"]');
if( $nodes->length > 0 ) { $node = $nodes[0]; }

types of selectors

  • node(): any node (including text nodes)
  • text(): text nodes
  • comment(): comment nodes
  • *: dom nodes
  • node()[normalize-space()]: any node (including text nodes) excluding whitespace text nodes (and also including element and comment nodes, as long as their string value is not blank)
  • text()[normalize-space()]: any text node excluding whitespace
  • /html/body//*|/html/body//text()[normalize-space()]: dom nodes and text nodes (without whitespace)

get all nodes (including text nodes)


get all nodes (without text nodes)


get text nodes only


class selector

$DOMXPath->query('/html/body//*[contains(concat(" ", normalize-space(@class), " "), " foo ")]');

id selector


tag selector


multiple tag selector


tag selector


attribute selector


attribute value selector


attribute value selector

// matches <a> elements whose href starts with "tel:"; the inner quotes must be escaped inside the single-quoted php string
$DOMXPath->query('/html/body//a[starts-with(@href, \'tel:\')]');

attribute selector (key wildcard)

$DOMXPath->query('/html/body//@*[starts-with(name(), \'data-\')]/parent::*');

next sibling selector ".foo + .bar"

// css "+" only matches the element directly after .foo under the same parent, hence following-sibling::*[1]
// (the "following" axis would match every later element in the whole document)
$DOMXPath->query('//*[contains(concat(" ", normalize-space(@class), " "), " foo ")]/following-sibling::*[1][contains(concat(" ", normalize-space(@class), " "), " bar ")]');

check if is text node

// variant 1: compare against the node name "#text"
if ('#text' === $node->nodeName) {
}
// variant 2: compare the node type against the predefined constant XML_TEXT_NODE (= 3)
if (XML_TEXT_NODE === $node->nodeType) {
}

check if is dom/element node

// compare the node type against the predefined constant XML_ELEMENT_NODE (= 1)
if (XML_ELEMENT_NODE === $node->nodeType) {
}

get tag name of node


get/set content of text node

// difference:
// reading (this is important): if you fetch the content of a text node with nodeValue (or even textContent), or an attribute with getAttribute,
// the content is automatically decoded, i.e. html entities are turned into plain characters (which we usually don't want)
// we use htmlentities (or the even weaker htmlspecialchars) to revert that
// writing (this is important): domdocument treats strings with encoded html chars that are set on text nodes as plain text (and not as html)
// we therefore use the parent node and set the node value there (so that the encoded strings are properly set)
$node->parentNode->nodeValue = 'That&#39;s cool';
// if you really want to use the text node, you can do:
$node->nodeValue = html_entity_decode('That&#39;s cool', ENT_QUOTES | ENT_XML1, 'UTF-8');

get children of node (recursively)

$DOMXPath->query('.//node()', $node);

get children count of node (recursively)

// ".//node()" walks all descendants; "./node()" would only count the direct children
// evaluate() returns the result of count() as a float
$DOMXPath->evaluate('count(.//node())', $node);

get text siblings (including oneself if text node) node

$DOMXPath->query('./../text()[normalize-space()]', $node);

get text siblings that are longer than 3 chars

$DOMXPath->query('./../text()[normalize-space()][string-length() > 3]', $node);

get text siblings that are longer than 1 char (excluding whitespace)

$DOMXPath->query('./../text()[normalize-space()][string-length(normalize-space(.)) > 1]', $node);

get dom elements without content inside (empty tags)


get direct sibling of node

$DOMXPath->query('(./following-sibling::*|./following-sibling::text()[normalize-space()])[1]', $node);

get attributes of node beginning with "data-"

// query() returns a node list object (or false for a malformed expression);
// empty() is never true for an object, so the number of matches is checked via ->length
$attrs = $DOMXPath->query('./@*[starts-with(name(),"data-")]', $node);
if ($attrs !== false && $attrs->length > 0) {
    foreach ($attrs as $attrs__value) {
        echo $attrs__value->nodeName;
        echo $attrs__value->nodeValue;
    }
}

get dom attribute


set dom attribute


check if dom attribute exists


get unique id of node (this is very neat for comparing nodes etc)

$nodes = $DOMXPath->query('/html/body//node()');
foreach ($nodes as $nodes__value) {
    // id = number of nodes matched by the union of the two paths (evaluated relative to this node), plus one
    $id = intval($DOMXPath->evaluate('count(.//following::node()|.//child::node())', $nodes__value)) + 1;
}

get unique id of node (way faster)

$nodes = $DOMXPath->query('/html/body//node()');
foreach ($nodes as $nodes__value) {
    // getNodePath() returns an xpath location path for the node, which serves as the id here
    $id = $nodes__value->getNodePath();
}

add text node


add / append child

$child = $DOMDocument->createElement('a', '');
$child->setAttribute('href', '');

prepend child

if ($parent->hasChildNodes()) {
} else {

insert before

$node->parentNode->insertBefore($newNode, $node);

insert after

// insertBefore() appends when the reference node is null, so the "no next sibling" case needs no separate branch
$reference = $node->nextSibling;
$node->parentNode->insertBefore($newNode, $reference);

copy clone node


remove node


get outer html of node

$doc = new \DOMDocument();
$doc->appendChild($doc->importNode($node, true));
echo $doc->saveHTML();

get inner html of node

// serialize every direct child with the owning document and concatenate the results
$inner = '';
foreach ($node->childNodes as $child) {
    $inner .= $node->ownerDocument->saveHTML($child);
}
return $inner;

set inner html of node

for ($x = $node->childNodes->length - 1; $x >= 0; $x--) {
if ($value != '') {
    $f = $node->ownerDocument->createDocumentFragment();
    $result = @$f->appendXML($value);
    if ($result) {
        if ($f->hasChildNodes()) {
    } else {
        $f = new \DOMDocument();
        $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
        $result = @$f->loadHTML('<htmlfragment>' . $value . '</htmlfragment>');
        if ($result) {
            $import = $f->getElementsByTagName('htmlfragment')->item(0);
            foreach ($import->childNodes as $child) {
                $importedNode = $node->ownerDocument->importNode($child, true);
        } else {

string to single node

$DOMDocument = new \DOMDocument(); // master dom document (needed for reference)
// turn non-ascii characters into numeric entities ('HTML-ENTITIES' in mb_convert_encoding is deprecated since php 8.2)
$str = mb_encode_numericentity($str, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
$tmp = new \DOMDocument();
// parse the string without the implied html/body wrapper and doctype, so that documentElement is the root element of $str
// (a freshly created document has no documentElement, so importing without loading would fail)
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
$node = $DOMDocument->importNode($tmp->documentElement, true);

replace node with string

$str = '<strong>String that replaces the node</strong>';
$tmp = new \DOMDocument();
// turn non-ascii characters into numeric entities ('HTML-ENTITIES' in mb_convert_encoding is deprecated since php 8.2)
$str = mb_encode_numericentity($str, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
// parse the string without the implied html/body wrapper and doctype, so that documentElement is the <strong> element
// (a freshly created document has no documentElement, so importing without loading would fail)
$tmp->loadHTML($str, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
// nodes belong to one document; import a deep copy into the target document before swapping it in
$repl = $DOMDocument->importNode($tmp->documentElement, true);
$node->parentNode->replaceChild($repl, $node);

load xml

$DOMDocument = new \DOMDocument();
$DOMXPath = new \DOMXPath($DOMDocument);

write xml

$html = $DOMDocument->saveXML();

if domdocument is from xml

if($dom->xmlVersion != '') {}

search in all namespaces

$DOMXPath->query('//loc'); // this does not work if the <loc> nodes are inside a so-called namespace (<urlset xmlns="">)
// local-name() ignores both the namespace and any prefix; name() would return e.g. "x:loc" for prefixed elements and miss them
$DOMXPath->query('//*[local-name()=\'loc\']'); // this works in all namespaces
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment