Created
August 28, 2015 15:31
-
-
Save amacgregor/e61c63f904436b0cd742 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Goose\Modules\Formatters; | |
use Goose\Article; | |
use Goose\Traits\NodeCommonTrait; | |
use Goose\Traits\NodeGravityTrait; | |
use Goose\Traits\ArticleMutatorTrait; | |
use Goose\Modules\AbstractModule; | |
use Goose\Modules\ModuleInterface; | |
use DOMWrap\Text; | |
use DOMWrap\Element; | |
/**
 * Output Formatter
 *
 * Post-processes the top node chosen for an article: pulls in promising
 * preceding siblings, prunes children that look like non-content, and then
 * produces both a plain-text and an HTML rendition of what is left.
 *
 * @package Goose\Modules\Formatters
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class OutputFormatter extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;

    /**
     * @var double Multiplier applied to the sibling baseline score; a sibling
     *             paragraph is kept only when its stop-word count exceeds
     *             baseline * this value (see getSiblingContent()).
     */
    protected static $SIBLING_BASE_LINE_SCORE = 0.30;

    /**
     * Formats the article's top node and stores the cleaned text and HTML on
     * the article. Does nothing when the article has no Element top node.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        if ($this->article()->getTopNode() instanceof Element) {
            $this->postExtractionCleanup();

            // Order matters: getFormattedText() mutates the top node (links and
            // inline tags are flattened to text) before the HTML is serialised.
            $article->setCleanedArticleText($this->getFormattedText());
            $article->setHtmlArticle($this->cleanupHtml());
        }
    }

    /**
     * Removes all unnecessary elements and formats the selected text nodes
     *
     * @return string Formatted string with all HTML removed
     */
    private function getFormattedText() {
        $topNode = $this->article()->getTopNode();

        $this->removeNodesWithNegativeScores($topNode);
        $this->convertLinksToText($topNode);
        $this->replaceTagsWithText($topNode);
        $this->removeParagraphsWithFewWords($topNode);

        return $this->convertToText($topNode);
    }

    /**
     * Joins the trimmed text of each direct child of the node with blank lines
     * (i.e. paragraphs become text separated by "\n\n").
     *
     * @param Element $topNode The top most node to format
     *
     * @return string
     *
     * @throws \Exception When a child does not expose an is() method
     */
    private function convertToText(Element $topNode) {
        $list = [];

        foreach ($topNode->children() as $child) {
            if (!method_exists($child, 'is')) {
                // Fully qualified: an unqualified "Exception" would resolve to
                // a non-existent class inside this namespace.
                throw new \Exception("Method is() called on invalid object");
            }

            $list[] = $child->text(DOM_NODE_TEXT_TRIM);
        }

        return implode("\n\n", $list);
    }

    /**
     * Scrape the node content and return the html
     *
     * Paragraphs with few words are removed first, and empty paragraph tags
     * are stripped from the serialised markup.
     *
     * @return string Formatted string with all HTML
     */
    private function cleanupHtml() {
        $topNode = $this->article()->getTopNode();

        // getTopNode() is untyped here, so guard before passing it to methods
        // that require an Element.
        if (empty($topNode)) {
            return '';
        }

        $this->removeParagraphsWithFewWords($topNode);

        $html = $this->convertToHtml($topNode);

        return str_replace(['<p></p>', '<p> </p>'], '', $html);
    }

    /**
     * Serialises the node (including its own tag) to an HTML string via its
     * owner document.
     *
     * @param Element $topNode
     *
     * @return string
     */
    private function convertToHtml(Element $topNode) {
        return $topNode->ownerDocument->saveHTML($topNode);
    }

    /**
     * cleans up and converts any nodes that should be considered text into text
     *
     * Every anchor that contains no image is replaced by a text node holding
     * its normalised text; anchors wrapping images are left untouched.
     *
     * @param Element $topNode
     */
    private function convertLinksToText(Element $topNode) {
        $links = $topNode->filter('a');

        foreach ($links as $item) {
            $images = $item->filter('img');

            if ($images->count() === 0) {
                $item->replace(new Text($item->text(DOM_NODE_TEXT_NORMALISED)));
            }
        }
    }

    /**
     * if there are elements inside our top node that have a low gravity score, let's
     * give em the boot
     *
     * Note: the attribute is cast to int, and anything below 1 is removed, so
     * zero and fractional scores under 1 are dropped as well as negative ones.
     *
     * @param Element $topNode
     */
    private function removeNodesWithNegativeScores(Element $topNode) {
        $gravityItems = $topNode->filter('*[gravityScore]');

        foreach ($gravityItems as $item) {
            $score = (int)$item->getAttribute('gravityScore');

            if ($score < 1) {
                $item->remove();
            }
        }
    }

    /**
     * replace common tags with just text so we don't have any crazy formatting issues
     * so replace <b>, <strong> and <i> with whatever text is inside them
     *
     * @param Element $topNode
     */
    private function replaceTagsWithText(Element $topNode) {
        $items = $topNode->filter('b, strong, i');

        foreach ($items as $item) {
            $item->replace(new Text($this->getTagCleanedText($item)));
        }
    }

    /**
     * Returns the normalised text content of an element.
     *
     * @todo Implement
     *
     * @param Element $item
     *
     * @return string
     */
    private function getTagCleanedText(Element $item) {
        return $item->text(DOM_NODE_TEXT_NORMALISED);
    }

    /**
     * remove paragraphs that have less than x number of words, would indicate that it's some sort of link
     *
     * A paragraph is removed only when all of the following hold: its
     * normalised text is shorter than 8 characters, it has fewer than 3 stop
     * words, and it contains no <object> or <embed> element.
     *
     * @param Element $topNode
     */
    private function removeParagraphsWithFewWords(Element $topNode) {
        $nodes = $topNode->filter('p');

        foreach ($nodes as $node) {
            $stopWords = $this->config()->getStopWords()->getStopwordCount($node->text());

            if (mb_strlen($node->text(DOM_NODE_TEXT_NORMALISED)) < 8
                && $stopWords->getStopWordCount() < 3
                && $node->filter('object')->count() === 0
                && $node->filter('embed')->count() === 0
            ) {
                $node->remove();
            }
        }

        /** @todo Implement */
    }

    /**
     * Remove any divs that looks like non-content, clusters of links, or paras with no gusto
     *
     * Qualifying preceding siblings are first pulled into the top node; then
     * every direct child that is neither <p> nor <strong> is removed when it
     * has high link density, has no paragraphs left after pruning (see
     * isTableTagAndNoParagraphsExist()), or scores below the threshold.
     *
     * @throws \Exception When a child does not expose an is() method
     */
    private function postExtractionCleanup() {
        $topNode = $this->article()->getTopNode();

        $this->addSiblings($topNode);

        foreach ($topNode->children() as $node) {
            if (!method_exists($node, 'is')) {
                // Fully qualified: an unqualified "Exception" would resolve to
                // a non-existent class inside this namespace.
                throw new \Exception("Method is() called on invalid object");
            }

            if ($node->is(':not(p):not(strong)')) {
                if ($this->isHighLinkDensity($node)
                    || $this->isTableTagAndNoParagraphsExist($node)
                    || !$this->isNodeScoreThreshholdMet($topNode, $node)
                ) {
                    $node->remove();
                }
            }
        }
    }

    /**
     * Removes every <p> and <strong> descendant whose normalised text is
     * shorter than 25 characters.
     *
     * @param Element $topNode
     */
    private function removeSmallParagraphs(Element $topNode) {
        $nodes = $topNode->filter('p, strong');

        foreach ($nodes as $node) {
            if (mb_strlen($node->text(DOM_NODE_TEXT_NORMALISED)) < 25) {
                $node->remove();
            }
        }
    }

    /**
     * Reports whether the node is a paragraph-less, non-<td> element that
     * should be discarded.
     *
     * Side effect: small paragraphs inside the node are removed before the
     * check. A <ul>/<ol> is spared (returns false) when it has text and less
     * than half of that text sits inside links.
     *
     * @param Element $topNode
     *
     * @return bool
     */
    private function isTableTagAndNoParagraphsExist(Element $topNode) {
        $this->removeSmallParagraphs($topNode);

        $nodes = $topNode->filter('p');

        if ($nodes->count() === 0 && $topNode->is(':not(td)')) {
            if ($topNode->is('ul, ol')) {
                $linkTextLength = array_sum(array_map(function($value) {
                    return mb_strlen($value->text(DOM_NODE_TEXT_NORMALISED));
                }, $topNode->filter('a')->toArray()));

                $elementTextLength = mb_strlen($topNode->text(DOM_NODE_TEXT_NORMALISED));

                // Guarding on > 0 also prevents a division by zero.
                if ($elementTextLength > 0 && ($linkTextLength / $elementTextLength) < 0.5) {
                    return false;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Checks a child's score against 8% of the top node's score. A <td> is
     * always treated as meeting the threshold.
     *
     * @param Element $topNode
     * @param Element $node
     *
     * @return bool False when the node scores below the threshold and is not a <td>
     */
    private function isNodeScoreThreshholdMet(Element $topNode, Element $node) {
        $topNodeScore = $this->getScore($topNode);
        $currentNodeScore = $this->getScore($node);
        $thresholdScore = ($topNodeScore * 0.08);

        if ($currentNodeScore < $thresholdScore && $node->is(':not(td)')) {
            return false;
        }

        return true;
    }

    /**
     * Adds any siblings that may have a decent score to this node
     *
     * A non-empty <p>/<strong> sibling is returned as-is. Otherwise each
     * non-empty <p>/<strong> descendant whose stop-word count exceeds
     * baseline * SIBLING_BASE_LINE_SCORE is copied into a new <p> element.
     *
     * @param Element $currentSibling
     * @param float|int $baselineScoreForSiblingParagraphs
     *
     * @return Element[]
     */
    private function getSiblingContent(Element $currentSibling, $baselineScoreForSiblingParagraphs) {
        $text = $currentSibling->text(DOM_NODE_TEXT_TRIM);

        if ($currentSibling->is('p, strong') && !empty($text)) {
            return [$currentSibling];
        }

        $results = [];

        $nodes = $currentSibling->filter('p, strong');

        foreach ($nodes as $node) {
            $text = $node->text(DOM_NODE_TEXT_TRIM);

            if (!empty($text)) {
                $wordStats = $this->config()->getStopWords()->getStopwordCount($text);

                if (($baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE) < $wordStats->getStopWordCount()) {
                    // Attach the text as a text node: createElement()'s value
                    // argument is not escaped for "&", so raw text such as
                    // "AT&T" would trigger a warning and be truncated.
                    $document = $node->document();
                    $paragraph = $document->createElement('p');
                    $paragraph->appendChild($document->createTextNode($text));

                    $results[] = $paragraph;
                }
            }
        }

        return $results;
    }

    /**
     * Inserts qualifying content from the top node's preceding element
     * siblings at the start of the top node.
     *
     * @param Element $topNode
     */
    private function addSiblings(Element $topNode) {
        $baselineScoreForSiblingParagraphs = $this->getBaselineScoreForSiblings($topNode);

        foreach ($topNode->previousAll(XML_ELEMENT_NODE) as $currentNode) {
            $results = $this->getSiblingContent($currentNode, $baselineScoreForSiblingParagraphs);

            foreach ($results as $result) {
                $topNode->insertBefore($result, $topNode->firstChild);
            }
        }
    }

    /**
     * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
     * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
     * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
     * 100 then 100 should be our base.
     *
     * Only <p>/<strong> nodes with more than 2 stop words and without high
     * link density count towards the average. When none qualify, the default
     * of 100000 is returned.
     *
     * @param Element $topNode
     *
     * @return float|int Average stop-word count of the qualifying paragraphs, or 100000
     */
    private function getBaselineScoreForSiblings(Element $topNode) {
        $base = 100000;
        $numberOfParagraphs = 0;
        $scoreOfParagraphs = 0;

        $nodesToCheck = $topNode->filter('p, strong');

        foreach ($nodesToCheck as $node) {
            $nodeText = $node->text();
            $wordStats = $this->config()->getStopWords()->getStopwordCount($nodeText);
            $highLinkDensity = $this->isHighLinkDensity($node);

            if ($wordStats->getStopWordCount() > 2 && !$highLinkDensity) {
                $numberOfParagraphs += 1;
                $scoreOfParagraphs += $wordStats->getStopWordCount();
            }
        }

        if ($numberOfParagraphs > 0) {
            $base = $scoreOfParagraphs / $numberOfParagraphs;
        }

        return $base;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment