Skip to content

Instantly share code, notes, and snippets.

@hakre
Created February 23, 2012 18:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hakre/1894360 to your computer and use it in GitHub Desktop.
Save hakre/1894360 to your computer and use it in GitHub Desktop.
TextRange and TextRangeTrimmer HTML/DOMDocument text processing classes
<?php
/**
* TextRange.php - DOMDocument-based text range and text manipulation
*
* @author hakre <http://hakre.wordpress.com/credits>
* @version 1.1.1
*/
/**
* TextRange - Collection of DOMText nodes
*
* @since 1.0.0
*/
class TextRange implements Countable
{
    /**
     * Text nodes that make up this range, always indexed 0..n-1.
     *
     * @var DOMText[]
     */
    private $nodes;

    /**
     * @param array|DOMNodeList|DOMNode $nodes DOMText nodes, or a node whose
     *                                         descendant text nodes are collected
     * @throws InvalidArgumentException
     */
    public function __construct($nodes)
    {
        $this->setNodes($nodes);
    }

    /**
     * Replace the nodes of this range.
     *
     * A DOMNode (including a DOMDocument) is expanded to its descendant text
     * nodes, a DOMNodeList is converted to an array, an array is used as given.
     *
     * @param array|DOMNodeList|DOMNode $nodes DOMText nodes
     * @throws InvalidArgumentException if $nodes is of another type or contains non-DOMText entries
     * @since 1.1.0
     */
    public function setNodes($nodes)
    {
        if ($nodes instanceof DOMNode) {
            $nodes = $this->getChildTextNodes($nodes);
        }
        if ($nodes instanceof DOMNodeList) {
            $nodes = iterator_to_array($nodes);
        }
        if (!is_array($nodes)) {
            throw new InvalidArgumentException('Not an array/DOMNodeList/DOMElement.');
        }
        $this->nodes = $this->validateDOMTextArray($nodes);
    }

    /**
     * @return DOMText[] nodes of this range, indexed 0..n-1
     */
    public function getNodes()
    {
        return $this->nodes;
    }

    /**
     * Collect all descendant text nodes of a node.
     *
     * @param DOMNode $DOMNode
     * @return DOMNodeList list of DOMText nodes
     * @throws InvalidArgumentException if the node does not belong to a document
     * @throws RuntimeException if the XPath query fails
     */
    private function getChildTextNodes(DOMNode $DOMNode)
    {
        // A DOMDocument is its own document; its ownerDocument property is NULL.
        $document = $DOMNode instanceof DOMDocument ? $DOMNode : $DOMNode->ownerDocument;
        if (!$document instanceof DOMDocument) {
            throw new InvalidArgumentException('Node does not belong to a DOMDocument.');
        }
        $xp = new DOMXPath($document);
        $textNodes = $xp->query('.//child::text()', $DOMNode);
        if (!$textNodes) {
            throw new RuntimeException('XPath query to obtain DOMText childnodes failed.');
        }
        return $textNodes;
    }

    /**
     * @return string[] node values, one entry per text node
     */
    public function getStrings()
    {
        $strings = array();
        foreach ($this->nodes as $node) {
            $strings[] = $node->nodeValue;
        }
        return $strings;
    }

    /**
     * Ensure every entry is a DOMText and return the list re-indexed.
     *
     * @param array $list
     * @return DOMText[] same nodes, keys normalised to 0..n-1
     * @throws InvalidArgumentException
     */
    private function validateDOMTextArray(array $list)
    {
        foreach ($list as $node) {
            if (!$node instanceof DOMText) {
                throw new InvalidArgumentException('Not a DOMText');
            }
        }
        // splitText()/split() combine foreach keys with positional
        // array_splice() offsets, so keys must be sequential.
        return array_values($list);
    }

    /**
     * @param int $offset UTF-8 character offset into the range
     * @return int the offset cast to integer
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset is negative or beyond the string length
     */
    private function validateOffsetArgument($offset)
    {
        if (!$this->nodes) {
            throw new BadMethodCallException('Range has no elements.');
        }
        $offset = (int)$offset;
        if ($offset < 0) {
            throw new OutOfBoundsException('Invalid offset (<0).');
        }
        $length = $this->stringLength();
        if ($offset > $length) {
            throw new OutOfBoundsException(sprintf('Invalid offset (>%d).', $length));
        }
        return $offset;
    }

    /**
     * Split (cutoff) at offset.
     *
     * This range keeps everything before the offset, the returned range
     * receives everything from the offset on. A negative offset counts from
     * the end of the range.
     *
     * @param int|string $offset UTF-8 character offset or string (of which the length is taken as $offset)
     * @return TextRange new remainder range
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset lies outside of the range
     */
    public function split($offset)
    {
        if (is_string($offset)) {
            $offset = $this->utf8StrLen($offset);
        }
        if ($offset < 0) {
            $offset = $this->stringLength() + $offset;
        }
        $index = $this->splitText($offset);
        // When $index is one past the last node (the "virtual" next node)
        // array_splice() removes nothing and returns an empty array.
        $new = array_splice($this->nodes, $index);
        return new TextRange($new);
    }

    /**
     * @return int string length (UTF-8)
     */
    public function stringLength()
    {
        return $this->utf8StrLen((string)$this);
    }

    /**
     * Count the characters of a UTF-8 string by counting matches of ".".
     *
     * @param string $string
     * @return int number of matched characters
     */
    private function utf8StrLen($string)
    {
        preg_filter('(.)su', '', (string)$string, -1, $count);
        return $count;
    }

    /**
     * Make $offset a node boundary, splitting a text node when necessary.
     *
     * @param int $offset UTF-8 character offset
     * @return int index of new node / offset start node
     * @throws BadMethodCallException if the range has no nodes
     * @throws OutOfBoundsException if the offset lies outside of the range
     * @throws LogicException if no node matches a validated offset
     */
    public function splitText($offset)
    {
        $offset = $this->validateOffsetArgument($offset);
        $runOffset = 0;
        foreach ($this->nodes as $index => $node) {
            $len = $this->utf8StrLen($node->nodeValue);
            // at the start of a text node
            if ($offset === $runOffset) {
                return $index;
            }
            // at the end of a text node, it's the next node (can be virtual)
            if ($offset === $runOffset + $len) {
                return $index + 1;
            }
            // match, split this node
            if ($offset > $runOffset && $offset < $runOffset + $len) {
                $splitAt = $offset - $runOffset;
                $newNode = $node->splitText($splitAt);
                array_splice($this->nodes, $index + 1, 0, array($newNode));
                return $index + 1;
            }
            $runOffset += $len;
        }
        throw new LogicException('Implementation Error - should never come here, check input validation or function code.');
    }

    /**
     * @return string concatenated values of all text nodes
     */
    public function __toString()
    {
        return implode('', $this->getStrings());
    }

    /**
     * @param int $offset UTF-8 character offset, passed to mb_substr() as start
     * @return string the character at $offset, empty string if there is none
     */
    public function getCharacter($offset)
    {
        return mb_substr((string)$this, $offset, 1, 'UTF-8');
    }

    /**
     * @return int number of text nodes in this range
     */
    // The attribute line below is a plain comment before PHP 8; on PHP 8.1+ it
    // silences the tentative return type deprecation of Countable::count().
    #[\ReturnTypeWillChange]
    public function count()
    {
        return count($this->nodes);
    }
}
/**
* TextRangeTrimmer - trim, ltrim and rtrim for TextRange
*
* @since 1.0.0
*/
class TextRangeTrimmer
{
    /**
     * Range that is trimmed in place.
     *
     * @var TextRange
     */
    private $range;

    /**
     * Characters to trim, stored as array keys for isset() lookups.
     *
     * @var array
     */
    private $charlist;

    /**
     * @param TextRange $range range to trim
     * @param array|null $charlist list of UTF-8 encoded characters, NULL for the default list
     * @throws InvalidArgumentException
     */
    public function __construct(TextRange $range, array $charlist = NULL)
    {
        $this->range = $range;
        $this->setCharlist($charlist);
    }

    /**
     * @param array $charlist list of UTF-8 encoded characters; NULL selects
     *                        space, tab, newline, carriage return, NUL and vertical tab
     * @throws InvalidArgumentException if an entry is not a string
     */
    public function setCharlist(array $charlist = NULL)
    {
        if (NULL === $charlist) {
            $charlist = str_split(" \t\n\r\0\x0B");
        }
        $list = array();
        foreach ($charlist as $char) {
            if (!is_string($char)) {
                throw new InvalidArgumentException('Not an array of strings.');
            }
            // empty strings can never match a character, drop them
            if (strlen($char)) {
                $list[] = $char;
            }
        }
        $this->charlist = array_flip($list);
    }

    /**
     * @return string[] characters
     */
    public function getCharlist()
    {
        // Numeric characters like '1' become integer array keys; hand them
        // back as the strings that were passed in.
        return array_map('strval', array_keys($this->charlist));
    }

    /**
     * Trim both ends of the range. Does nothing when the charlist is empty.
     */
    public function trim()
    {
        if (!$this->charlist) {
            return;
        }
        $this->ltrim();
        $this->rtrim();
    }

    /**
     * Remove leading charlist characters from the range and from the document.
     */
    public function ltrim()
    {
        $count = $this->lengthOfCharacterSequence($this->charlist, 0);
        if (!$count) {
            return;
        }
        // After the split the range holds only the leading characters to drop.
        $remainder = $this->range->split($count);
        $this->removeFromDocument($this->range->getNodes());
        $this->range->setNodes($remainder->getNodes());
    }

    /**
     * Remove trailing charlist characters from the range and from the document.
     */
    public function rtrim()
    {
        $count = $this->lengthOfCharacterSequence($this->charlist, -1, -1);
        if (!$count) {
            return;
        }
        // split() cuts the trailing nodes off the range and returns them.
        $chop = $this->range->split(-$count);
        $this->removeFromDocument($chop->getNodes());
    }

    /**
     * Detach text nodes from their parents; nodes without a parent are skipped.
     *
     * @param DOMText[] $textNodes
     */
    private function removeFromDocument(array $textNodes)
    {
        foreach ($textNodes as $textNode) {
            $parent = $textNode->parentNode;
            if ($parent) {
                $parent->removeChild($textNode);
            }
        }
    }

    /**
     * Number of consecutive characters of $charlist from $start to $direction.
     *
     * The scan stops at the first character not in $charlist or at either end
     * of the range string, so a range made up solely of $charlist characters
     * terminates as well.
     *
     * @param array $charlist characters as array keys
     * @param int $start offset, negative values count from the end
     * @param int $direction 1: forward, -1: backward
     * @return int
     * @throws InvalidArgumentException
     */
    private function lengthOfCharacterSequence(array $charlist, $start, $direction = 1)
    {
        $start = (int)$start;
        $direction = max(-1, min(1, $direction));
        if (!$direction) {
            throw new InvalidArgumentException('Direction must be 1 or -1.');
        }
        // Build the character list once instead of re-concatenating the whole
        // range for every single character that is inspected.
        $chars = preg_split('//u', (string)$this->range, -1, PREG_SPLIT_NO_EMPTY);
        if (!$chars) {
            return 0;
        }
        $length = count($chars);
        if ($start < 0) {
            // same start handling as mb_substr(): count from the end, clamp to 0
            $start = max(0, $start + $length);
        }
        $count = 0;
        for ($i = $start; $i >= 0 && $i < $length; $i += $direction) {
            if (!isset($charlist[$chars[$i]])) {
                break;
            }
            $count++;
        }
        return $count;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment