Skip to content

Instantly share code, notes, and snippets.

@cloudsben
Created January 24, 2013 03:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cloudsben/4617345 to your computer and use it in GitHub Desktop.
Save cloudsben/4617345 to your computer and use it in GitHub Desktop.
grab article content
<?php
// Fetches one article page, scores every element that directly contains a <p>
// (class/ID hints plus the amount of paragraph text), and prints the HTML of
// the highest-scoring container.
error_reporting(-1);

// Page whose main article body should be extracted.
$url = "http://www.csdn.net/article/2013-01-09/2813524-CSDN-morning-paper";

// Class/ID hints: the first pattern marks boilerplate containers, the other two
// mark containers that look like the article body.
$negative_pattern = "/(comment|meta|footer|footnote)/i";
$positive_class_pattern = "/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i";
$positive_id_pattern = "/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i";

$html = file_get_contents($url);

$doc = new DOMDocument();
// NOTE(review): loadHTML() appears to take the encoding from the markup itself,
// so this assignment may have no effect on parsing -- confirm before relying on it.
$doc->encoding = "utf-8";

// Paragraphs to score; stays empty when the page could not be fetched or parsed,
// in which case the script still prints the empty fallback <div>.
$ptext = array();
if ($html === false || $html === '')
{
    // loadHTML() must not be called with an empty string (ValueError on PHP 8).
    error_log("Unable to fetch any HTML from " . $url);
}
else
{
    // Collect libxml parse warnings internally instead of silencing them with "@".
    $previous_setting = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    if ($loaded)
    {
        $ptext = $doc->getElementsByTagName("p");
    }
    else
    {
        error_log("Unable to parse the HTML fetched from " . $url);
    }
}

// Scores are kept in PHP arrays keyed by the container's XPath location rather
// than in a DOM attribute, so the bookkeeping never shows up in the printed HTML.
$scores = array();      // node path => accumulated score
$candidates = array();  // node path => DOMElement, in first-seen order

foreach ($ptext as $item)
{
    $parent_node = $item->parentNode;
    if (!($parent_node instanceof DOMElement))
    {
        // Only elements carry class/id attributes and can be a content container.
        continue;
    }

    $key = $parent_node->getNodePath();
    if (!isset($scores[$key]))
    {
        $scores[$key] = 0;
        $candidates[$key] = $parent_node;
    }

    $content_score = $scores[$key];
    $class_name = $parent_node->getAttribute("class");
    $id = $parent_node->getAttribute("id");

    // Look for a special classname
    if (preg_match($negative_pattern, $class_name))
    {
        $content_score -= 50;
    }
    else if (preg_match($positive_class_pattern, $class_name))
    {
        $content_score += 25;
    }

    // Look for a special ID
    if (preg_match($negative_pattern, $id))
    {
        $content_score -= 50;
    }
    else if (preg_match($positive_id_pattern, $id))
    {
        $content_score += 25;
    }

    // Add one point per byte of paragraph text, ignoring very short paragraphs.
    // strlen() counts bytes, so multi-byte characters weigh more than ASCII ones.
    $text_length = strlen($item->nodeValue);
    if ($text_length > 10)
    {
        $content_score += $text_length;
    }

    $scores[$key] = $content_score;
}

// Pick the first container with the highest positive score; fall back to an
// empty <div> when nothing scored above zero.
$box = $doc->createElement('div', '');
$best_score = 0;
foreach ($candidates as $key => $candidate)
{
    if ($scores[$key] > $best_score)
    {
        $best_score = $scores[$key];
        $box = $candidate;
    }
}

$content = $doc->saveHTML($box);
echo $content;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment