Skip to content

Instantly share code, notes, and snippets.

@hussainweb
Created May 23, 2016 14:01
Show Gist options
  • Save hussainweb/fec4da41589ded834a3085e6d17a4a9b to your computer and use it in GitHub Desktop.
Save hussainweb/fec4da41589ded834a3085e6d17a4a9b to your computer and use it in GitHub Desktop.
HTML parsing support (primarily for Drupal migrate framework, but could be used generally)
<?php
/**
* @file
* HTML parsing support.
*/
/**
 * Parses an HTML string and exposes the resulting tree to XPath queries.
 *
 * The markup is loaded leniently through DOMDocument (so "tag soup" is
 * accepted) and then imported into SimpleXML, which is what xpath() queries.
 */
class MigrateHtmlParser {

  /**
   * The raw HTML passed to the constructor.
   *
   * @var string
   */
  protected $html;

  /**
   * The DOM document built from the HTML.
   *
   * @var DOMDocument
   */
  protected $dom;

  /**
   * SimpleXML view of the root element of the parsed document.
   *
   * @var SimpleXMLElement
   */
  protected $elements;

  /**
   * Parses the given HTML.
   *
   * @param string $html
   *   The HTML markup to parse. It does not need to be well formed.
   *
   * @throws UnexpectedValueException
   *   If the markup is empty, cannot be loaded, or yields a document without
   *   a root element.
   */
  public function __construct($html) {
    $this->html = $html;

    // DOMDocument::loadHTML() rejects empty input outside of libxml's error
    // handling, so report it through the documented exception instead.
    if ($html === NULL || $html === '') {
      throw new UnexpectedValueException("The html could not be parsed");
    }

    // DOM can load HTML soup. But, HTML soup can throw warnings, suppress them.
    $old = libxml_use_internal_errors(TRUE);
    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);
    if (!$old) {
      // Buffering was only switched on for this parse, so nobody is going to
      // read the collected errors; drop them rather than let them pile up.
      // When the caller had buffering enabled already, its buffer is left
      // untouched.
      libxml_clear_errors();
    }
    libxml_use_internal_errors($old);

    // The DOMDocument object itself is always truthy; what signals a usable
    // parse is loadHTML()'s return value and the presence of a root element.
    if (!$loaded || $dom->documentElement === NULL) {
      throw new UnexpectedValueException("The html could not be parsed");
    }

    // It's much easier to work with simplexml than DOM, luckily enough
    // we can just simply import our DOM tree.
    $elements = simplexml_import_dom($dom);
    if (!($elements instanceof SimpleXMLElement)) {
      throw new UnexpectedValueException("The html could not be parsed");
    }

    $this->dom = $dom;
    $this->elements = $elements;
  }

  /**
   * Runs an XPath query against the parsed document.
   *
   * @param string $xpath
   *   The XPath expression to evaluate.
   *
   * @return SimpleXMLElement[]
   *   The matching elements, or an empty array when nothing matches or the
   *   query fails.
   */
  public function xpath($xpath) {
    $result = $this->elements->xpath($xpath);
    // Some combinations of PHP / libxml versions return an empty array
    // instead of the documented FALSE. Forcefully convert any falsish values
    // to an empty array to allow foreach(...) constructions.
    return $result ? $result : array();
  }

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment