Created
April 3, 2019 02:15
-
-
Save swirtSJW/2c584d9c098a4160bd809f4959295d01 to your computer and use it in GitHub Desktop.
A Drupal migration_tools obtainer for finding long plain text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Drupal\migration_tools\Obtainer; | |
use Drupal\migration_tools\Obtainer\ObtainHtml; | |
use Drupal\migration_tools\StringTools; | |
/**
 * Obtainers for Long Plain Text.
 *
 * Converts the HTML found under a selector into plain text, keeping
 * paragraph and line-break structure as end-of-line characters.
 *
 * @package Drupal\migration_tools\Obtainer
 */
class ObtainLongPlainText extends ObtainHtml {

  /**
   * Plucker to turn html into long plain text for nth selector on the page.
   *
   * @param string $selector
   *   The selector to find.
   * @param int $n
   *   (optional) The depth to find. Default: first item n=1. Values below 1
   *   are treated as 1.
   * @param bool $pluck
   *   (optional) When TRUE the matched element is passed to
   *   setElementToRemove(). Default: TRUE.
   *
   * @return string
   *   The text found, or an empty string when the selector is empty or has
   *   no nth match.
   */
  protected function pluckPlain($selector, $n = 1, $pluck = TRUE) {
    // Initialised up front so every exit path returns a string.
    $plain_text = '';
    if (empty($selector)) {
      return $plain_text;
    }

    $elements = $this->queryPath->find($selector);
    if (!is_object($elements)) {
      return $plain_text;
    }

    // Callers count from 1; the iteration below is keyed from 0.
    $index = ($n > 0) ? $n - 1 : 0;
    // Measure the separator rather than assuming PHP_EOL is a single byte.
    $paragraph_break = PHP_EOL . PHP_EOL;
    $break_length = strlen($paragraph_break);

    /** @var \QueryPath\DOMQuery $element */
    foreach ($elements as $i => $element) {
      if ($i != $index) {
        continue;
      }
      if ($pluck) {
        $this->setElementToRemove($element);
      }
      /** @var \QueryPath\DOMQuery $item */
      foreach ($element->contents() as $item) {
        if ($item->tag() == 'p') {
          // Separate this paragraph from preceding text unless that text
          // already ends with a paragraph break.
          if (strlen($plain_text) > 1 && substr($plain_text, -$break_length) !== $paragraph_break) {
            $plain_text .= $paragraph_break;
          }
          $markup = $item->innerHTML() . $paragraph_break;
        }
        else {
          $markup = $item->html();
        }
        // Each <br>-delimited section becomes its own line before the
        // remaining tags are stripped.
        $sections = self::splitOnBr($markup);
        $plain_text .= strip_tags(implode(PHP_EOL, $sections));
      }
      // Report the 1-based position that was matched.
      $this->setCurrentFindMethod("pluckPlain($selector, " . ($index + 1) . ')');
      break;
    }

    return $plain_text;
  }

  /**
   * Finder to turn html into long plain text for nth selector on the page.
   *
   * Same as pluckPlain() but calls it with $pluck = FALSE, so the matched
   * element is not passed to setElementToRemove().
   *
   * @param string $selector
   *   The selector to find.
   * @param int $n
   *   (optional) The depth to find. Default: first item n=1.
   *
   * @return string
   *   The text found.
   */
  protected function findPlain($selector, $n = 1) {
    return $this->pluckPlain($selector, $n, FALSE);
  }

  /**
   * {@inheritdoc}
   */
  public static function cleanString($string) {
    // There are also numeric html special chars, let's change those.
    $string = StringTools::decodeHtmlEntityNumeric($string);
    // Checking again in case another process rendered it non UTF-8.
    if (!mb_check_encoding($string, 'UTF-8')) {
      $string = StringTools::fixEncoding($string);
    }
    // Remove white space-like things from the ends and decodes html entities.
    return StringTools::superTrim($string);
  }

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment