Skip to content

Instantly share code, notes, and snippets.

@amacgregor
Created August 28, 2015 15:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amacgregor/e61c63f904436b0cd742 to your computer and use it in GitHub Desktop.
Save amacgregor/e61c63f904436b0cd742 to your computer and use it in GitHub Desktop.
<?php
namespace Goose\Modules\Formatters;
use Goose\Article;
use Goose\Traits\NodeCommonTrait;
use Goose\Traits\NodeGravityTrait;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Text;
use DOMWrap\Element;
/**
 * Output Formatter
 *
 * Post-processes the article's top node once extraction has picked it:
 * pulls in qualifying preceding siblings, prunes children that look like
 * non-content, and then produces both a plain-text and an HTML rendering
 * which are stored back on the Article.
 *
 * @package Goose\Modules\Formatters
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class OutputFormatter extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;

    /**
     * Fraction of the baseline stop-word score that a sibling paragraph's
     * stop-word count must exceed for it to be pulled into the top node.
     *
     * @var double
     */
    protected static $SIBLING_BASE_LINE_SCORE = 0.30;

    /**
     * Formats the article's top node and stores the text and HTML results.
     *
     * Nothing happens when the article has no top node (or it is not an Element).
     *
     * @param Article $article
     *
     * @throws \Exception If a child of the top node does not expose is()
     */
    public function run(Article $article) {
        $this->article($article);

        if (!($this->article()->getTopNode() instanceof Element)) {
            return;
        }

        $this->postExtractionCleanup();

        // Order matters: getFormattedText() mutates the top node (links and
        // inline tags are flattened to text) before cleanupHtml() serialises it.
        $article->setCleanedArticleText($this->getFormattedText());
        $article->setHtmlArticle($this->cleanupHtml());
    }

    /**
     * Removes all unnecessary elements and formats the selected text nodes
     *
     * The top node is modified in place by every step.
     *
     * @return string Formatted string with all HTML removed
     *
     * @throws \Exception If a child of the top node does not expose is()
     */
    private function getFormattedText() {
        $topNode = $this->article()->getTopNode();

        $this->removeNodesWithNegativeScores($topNode);
        $this->convertLinksToText($topNode);
        $this->replaceTagsWithText($topNode);
        $this->removeParagraphsWithFewWords($topNode);

        return $this->convertToText($topNode);
    }

    /**
     * Joins the trimmed text of each direct child element with a blank line ("\n\n")
     *
     * @param Element $topNode The top most node to format
     *
     * @return string
     *
     * @throws \Exception If a child does not expose is()
     */
    private function convertToText(Element $topNode) {
        $list = [];

        foreach ($topNode->children() as $child) {
            if (!method_exists($child, 'is')) {
                // Fully qualified: an unqualified "Exception" would resolve to
                // Goose\Modules\Formatters\Exception, which does not exist.
                throw new \Exception("Method is() called on invalid object");
            }

            $list[] = $child->text(DOM_NODE_TEXT_TRIM);
        }

        return implode("\n\n", $list);
    }

    /**
     * Scrape the node content and return the html
     *
     * Short paragraphs are removed first, and empty paragraphs
     * (<p></p>, <p>&nbsp;</p>) are stripped from the serialised markup.
     *
     * @return string Formatted string with all HTML
     */
    private function cleanupHtml() {
        $topNode = $this->article()->getTopNode();

        // getTopNode() is not type-hinted here, so this guard is meaningful.
        if (empty($topNode)) {
            return '';
        }

        $this->removeParagraphsWithFewWords($topNode);

        $html = $this->convertToHtml($topNode);

        return str_replace(['<p></p>', '<p>&nbsp;</p>'], '', $html);
    }

    /**
     * Serialises the node (including its own tag) through its owner document
     *
     * @param Element $topNode
     *
     * @return string
     */
    private function convertToHtml(Element $topNode) {
        return $topNode->ownerDocument->saveHTML($topNode);
    }

    /**
     * Replaces every link that contains no image with a text node holding
     * the link's normalised text. Links wrapping an <img> are left untouched.
     *
     * @param Element $topNode
     */
    private function convertLinksToText(Element $topNode) {
        foreach ($topNode->filter('a') as $link) {
            if ($link->filter('img')->count() != 0) {
                continue;
            }

            $link->replace(new Text($link->text(DOM_NODE_TEXT_NORMALISED)));
        }
    }

    /**
     * if there are elements inside our top node that have a negative gravity score, let's
     * give em the boot
     *
     * The gravityScore attribute is cast to int, and anything below 1
     * (so zero as well as negatives) is removed.
     *
     * @param Element $topNode
     */
    private function removeNodesWithNegativeScores(Element $topNode) {
        foreach ($topNode->filter('*[gravityScore]') as $item) {
            $score = (int)$item->getAttribute('gravityScore');

            if ($score < 1) {
                $item->remove();
            }
        }
    }

    /**
     * replace common inline formatting tags with just text so we don't have any crazy
     * formatting issues. Only <b>, <strong> and <i> are handled here.
     *
     * @param Element $topNode
     */
    private function replaceTagsWithText(Element $topNode) {
        foreach ($topNode->filter('b, strong, i') as $item) {
            $item->replace(new Text($this->getTagCleanedText($item)));
        }
    }

    /**
     * Returns the normalised text of an element.
     *
     * @todo Implement
     *
     * @param Element $item
     *
     * @return string
     */
    private function getTagCleanedText(Element $item) {
        return $item->text(DOM_NODE_TEXT_NORMALISED);
    }

    /**
     * remove paragraphs that are too short to be content, which would indicate that
     * it's some sort of link
     *
     * A <p> is removed when all of the following hold: its normalised text is
     * shorter than 8 characters, it has fewer than 3 stop words, and it
     * contains no <object> or <embed>.
     *
     * @param Element $topNode
     */
    private function removeParagraphsWithFewWords(Element $topNode) {
        foreach ($topNode->filter('p') as $node) {
            // Cheap checks first; stop-word counting only runs for candidates.
            // NOTE(review): assumes getStopwordCount() has no side effects — confirm.
            if (mb_strlen($node->text(DOM_NODE_TEXT_NORMALISED)) >= 8) {
                continue;
            }

            if ($node->filter('object')->count() != 0 || $node->filter('embed')->count() != 0) {
                continue;
            }

            $stopWords = $this->config()->getStopWords()->getStopwordCount($node->text());

            if ($stopWords->getStopWordCount() < 3) {
                $node->remove();
            }
        }

        /** @todo Implement */
    }

    /**
     * Remove any divs that looks like non-content, clusters of links, or paras with no gusto
     *
     * Preceding siblings of the top node are merged in first. Then each direct
     * child that is neither <p> nor <strong> is removed when it has a high link
     * density, is a paragraph-less container, or scores below the threshold.
     *
     * @throws \Exception If a child of the top node does not expose is()
     */
    private function postExtractionCleanup() {
        $topNode = $this->article()->getTopNode();

        $this->addSiblings($topNode);

        foreach ($topNode->children() as $node) {
            if (!method_exists($node, 'is')) {
                // Fully qualified: an unqualified "Exception" would resolve to
                // Goose\Modules\Formatters\Exception, which does not exist.
                throw new \Exception("Method is() called on invalid object");
            }

            if (!$node->is(':not(p):not(strong)')) {
                continue;
            }

            // Evaluation order is kept: isTableTagAndNoParagraphsExist() mutates
            // $node and must only run when the link-density check did not match.
            if ($this->isHighLinkDensity($node)
                || $this->isTableTagAndNoParagraphsExist($node)
                || !$this->isNodeScoreThreshholdMet($topNode, $node)
            ) {
                $node->remove();
            }
        }
    }

    /**
     * Removes every <p> and <strong> whose normalised text is shorter than 25 characters
     *
     * @param Element $topNode
     */
    private function removeSmallParagraphs(Element $topNode) {
        foreach ($topNode->filter('p, strong') as $node) {
            if (mb_strlen($node->text(DOM_NODE_TEXT_NORMALISED)) < 25) {
                $node->remove();
            }
        }
    }

    /**
     * Tells whether a node is a paragraph-less container that should be dropped.
     *
     * Side effect: small paragraphs inside the node are removed before the check.
     * A <td> is never reported. A <ul>/<ol> is spared when less than half of its
     * text is link text.
     *
     * @param Element $topNode
     *
     * @return bool
     */
    private function isTableTagAndNoParagraphsExist(Element $topNode) {
        $this->removeSmallParagraphs($topNode);

        $hasParagraphs = $topNode->filter('p')->count() != 0;

        if ($hasParagraphs || !$topNode->is(':not(td)')) {
            return false;
        }

        if ($topNode->is('ul, ol')) {
            $linkTextLength = array_sum(array_map(function($link) {
                return mb_strlen($link->text(DOM_NODE_TEXT_NORMALISED));
            }, $topNode->filter('a')->toArray()));

            $elementTextLength = mb_strlen($topNode->text(DOM_NODE_TEXT_NORMALISED));

            // Guard on > 0 also protects the division below.
            if ($elementTextLength > 0 && ($linkTextLength / $elementTextLength) < 0.5) {
                return false;
            }
        }

        return true;
    }

    /**
     * Checks that a node scores at least 8% of the top node's score.
     * A <td> always passes.
     *
     * @param Element $topNode
     * @param Element $node
     *
     * @return bool
     */
    private function isNodeScoreThreshholdMet(Element $topNode, Element $node) {
        $topNodeScore = $this->getScore($topNode);
        $currentNodeScore = $this->getScore($node);
        $thresholdScore = ($topNodeScore * 0.08);

        $belowThreshold = $currentNodeScore < $thresholdScore;

        return !($belowThreshold && $node->is(':not(td)'));
    }

    /**
     * Collects the content of a sibling that is worth adding to the top node
     *
     * A non-empty <p>/<strong> sibling is returned as-is. Otherwise each
     * non-empty <p>/<strong> descendant whose stop-word count exceeds
     * baseline * SIBLING_BASE_LINE_SCORE is copied into a new <p> element.
     *
     * @param Element $currentSibling
     * @param int|float $baselineScoreForSiblingParagraphs
     *
     * @return Element[]
     */
    private function getSiblingContent(Element $currentSibling, $baselineScoreForSiblingParagraphs) {
        $text = $currentSibling->text(DOM_NODE_TEXT_TRIM);

        if ($currentSibling->is('p, strong') && !empty($text)) {
            return [$currentSibling];
        }

        $results = [];

        foreach ($currentSibling->filter('p, strong') as $node) {
            $text = $node->text(DOM_NODE_TEXT_TRIM);

            if (empty($text)) {
                continue;
            }

            $wordStats = $this->config()->getStopWords()->getStopwordCount($text);

            if (($baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE) < $wordStats->getStopWordCount()) {
                $results[] = $node->document()->createElement('p', $text);
            }
        }

        return $results;
    }

    /**
     * Inserts qualifying content from the top node's preceding element siblings
     * at the start of the top node
     *
     * @param Element $topNode
     */
    private function addSiblings(Element $topNode) {
        $baselineScoreForSiblingParagraphs = $this->getBaselineScoreForSiblings($topNode);

        foreach ($topNode->previousAll(XML_ELEMENT_NODE) as $currentNode) {
            $results = $this->getSiblingContent($currentNode, $baselineScoreForSiblingParagraphs);

            foreach ($results as $result) {
                $topNode->insertBefore($result, $topNode->firstChild);
            }
        }
    }

    /**
     * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against
     * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring
     * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of
     * 100 then 100 should be our base.
     *
     * Only <p>/<strong> nodes with more than 2 stop words and without high link
     * density count towards the average. When none qualify, 100000 is returned.
     *
     * @param Element $topNode
     *
     * @return int|float
     */
    private function getBaselineScoreForSiblings(Element $topNode) {
        $numberOfParagraphs = 0;
        $scoreOfParagraphs = 0;

        foreach ($topNode->filter('p, strong') as $node) {
            $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
            $highLinkDensity = $this->isHighLinkDensity($node);

            if ($wordStats->getStopWordCount() > 2 && !$highLinkDensity) {
                $numberOfParagraphs += 1;
                $scoreOfParagraphs += $wordStats->getStopWordCount();
            }
        }

        if ($numberOfParagraphs > 0) {
            return $scoreOfParagraphs / $numberOfParagraphs;
        }

        // NOTE(review): presumably a sentinel high enough that no sibling can
        // clear baseline * SIBLING_BASE_LINE_SCORE — confirm intent.
        return 100000;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment