Skip to content

Instantly share code, notes, and snippets.

@swirtSJW
Created April 3, 2019 02:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swirtSJW/2c584d9c098a4160bd809f4959295d01 to your computer and use it in GitHub Desktop.
Save swirtSJW/2c584d9c098a4160bd809f4959295d01 to your computer and use it in GitHub Desktop.
A Drupal migration_tools obtainer for finding long plain text.
<?php
namespace Drupal\migration_tools\Obtainer;
use Drupal\migration_tools\Obtainer\ObtainHtml;
use Drupal\migration_tools\StringTools;
/**
 * Obtainers for Long Plain Text.
 *
 * Converts the HTML found under a selector into plain text: <p> elements are
 * separated by blank lines, the HTML is split through splitOnBr() and the
 * pieces re-joined with line breaks, and remaining tags are stripped.
 *
 * @package Drupal\migration_tools\Obtainer
 */
class ObtainLongPlainText extends ObtainHtml {

  /**
   * Plucker to turn html into long plain text for nth selector on the page.
   *
   * @param string $selector
   *   The selector to find.
   * @param int $n
   *   (optional) The depth to find. Default: first item n=1.
   * @param bool $pluck
   *   (optional) When TRUE the matched element is passed to
   *   setElementToRemove(); when FALSE the element is only read.
   *   Default: TRUE.
   *
   * @return string
   *   The text found, or an empty string when the selector is empty or no
   *   nth match exists.
   */
  protected function pluckPlain($selector, $n = 1, $pluck = TRUE) {
    // Initialised up front so that a miss returns '' rather than an undefined
    // variable.
    $plain_text = '';
    if (empty($selector)) {
      return $plain_text;
    }

    // Callers count from 1; the iteration keys below count from 0.
    $n = ($n > 0) ? $n - 1 : 0;

    $elements = $this->queryPath->find($selector);
    if (!is_object($elements)) {
      return $plain_text;
    }

    // Blank line placed between paragraphs. Its length is derived from the
    // separator itself so the "already ends with a blank line" check also
    // works where PHP_EOL is longer than one byte.
    $paragraph_break = PHP_EOL . PHP_EOL;
    $break_length = strlen($paragraph_break);

    /** @var \QueryPath\DOMQuery $element */
    foreach ($elements as $i => $element) {
      if ($i != $n) {
        continue;
      }
      if ($pluck) {
        $this->setElementToRemove($element);
      }

      /** @var \QueryPath\DOMQuery $item */
      foreach ($element->contents() as $item) {
        if ($item->tag() == 'p') {
          // Make sure a paragraph starts after a blank line, without
          // doubling up when the previous paragraph already added one.
          if (strlen($plain_text) > 1 && substr($plain_text, -$break_length) != $paragraph_break) {
            $plain_text .= $paragraph_break;
          }
          $html = $item->innerHTML() . $paragraph_break;
        }
        else {
          $html = $item->html();
        }
        // Each piece returned by splitOnBr() becomes its own line before the
        // remaining markup is stripped.
        $sections = self::splitOnBr($html);
        $plain_text .= strip_tags(implode(PHP_EOL, $sections));
      }

      // Record the 1-based position that produced the result.
      $this->setCurrentFindMethod("pluckPlain($selector, " . ++$n . ')');
      break;
    }

    return $plain_text;
  }

  /**
   * Finder to turn html into long plain text for nth selector on the page.
   *
   * Same as pluckPlain() but called with $pluck = FALSE, so the matched
   * element is not passed to setElementToRemove().
   *
   * @param string $selector
   *   The selector to find.
   * @param int $n
   *   (optional) The depth to find. Default: first item n=1.
   *
   * @return string
   *   The text found.
   */
  protected function findPlain($selector, $n = 1) {
    return $this->pluckPlain($selector, $n, FALSE);
  }

  /**
   * {@inheritdoc}
   */
  public static function cleanString($string) {
    // There are also numeric html special chars, let's change those.
    $string = StringTools::decodeHtmlEntityNumeric($string);
    // Checking again in case another process rendered it non UTF-8.
    $is_utf8 = mb_check_encoding($string, 'UTF-8');
    if (!$is_utf8) {
      $string = StringTools::fixEncoding($string);
    }
    // Remove white space-like things from the ends and decode html entities.
    $string = StringTools::superTrim($string);
    return $string;
  }

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment