Created
October 27, 2016 12:13
-
-
Save NeilMasters/215b54a7aa97840e42551ac5a0773b60 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Build the list of URLs to audit in $xml->url.
// - With a command-line argument: audit only that URL, using small stand-in
//   classes that expose the same ->url / ->loc shape the crawl loop reads.
// - Without one: read every <url> entry from sitemap.xml in the working directory.
if (isset($argv[1])) {
    class Xml {
        /** @var Url[] URLs to audit. */
        public $url = [];
    }
    class Url {
        /** @var string Address of the page to audit. */
        public $loc;
    }
    $xml = new Xml();
    $url = new Url();
    $url->loc = $argv[1];
    $xml->url[] = $url;
} else {
    $sitemap = "sitemap.xml";
    // Fail with a clear message instead of handing `false` to SimpleXMLElement,
    // which would report a misleading "could not be parsed as XML" error.
    $sitemapContents = is_readable($sitemap) ? file_get_contents($sitemap) : false;
    if ($sitemapContents === false) {
        fwrite(STDERR, "Unable to read sitemap file: " . $sitemap . "\n");
        exit(1);
    }
    // Throws an Exception if the file content is not well-formed XML.
    $xml = new SimpleXMLElement($sitemapContents);
}
/**
 * Collect the text content of every element with the given tag name in an HTML string.
 *
 * @param string|false $string  HTML markup to parse. An empty string or a
 *                              non-string (e.g. the `false` returned by a failed
 *                              file_get_contents) yields an empty result.
 * @param string       $tagname Tag name to look for, e.g. 'h1'.
 *
 * @return string[] The textContent of each matching element; empty if none.
 */
function getTextBetweenTags($string, $tagname){
    // DOMDocument::loadHTML() rejects an empty source on PHP 8.0-8.3 with a
    // ValueError, which `@` cannot suppress, so bail out before parsing.
    if (!is_string($string) || $string === '') {
        return [];
    }

    $d = new DOMDocument();

    // Real-world HTML is rarely valid; collect parser warnings internally
    // rather than muting everything with `@`, then restore the caller's setting.
    $previous = libxml_use_internal_errors(true);
    $d->loadHTML($string);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $return = [];
    foreach ($d->getElementsByTagName($tagname) as $item) {
        $return[] = $item->textContent;
    }
    return $return;
}
// For every URL, fetch the page and print one CSV-style row:
//   uri,"<h1 lines>","<h2 lines>","<h3 lines>","<h4 lines>"
// where each heading becomes a line of the form "H1: text" (or "H1: Empty").
foreach ($xml->url as $url) {
    // ->loc is a plain string in single-URL mode and a SimpleXMLElement in
    // sitemap mode; a string cast handles both. (Calling reset() on an object
    // is deprecated as of PHP 8.1.)
    $uri = trim((string) $url->loc);
    if ($uri === '') {
        continue;
    }

    $pageContents = file_get_contents($uri);
    // A failed or empty fetch still produces a row, just with empty heading
    // columns, instead of passing unusable input to the HTML parser.
    $fetched = is_string($pageContents) && $pageContents !== '';

    $columns = [];
    foreach (['h1', 'h2', 'h3', 'h4'] as $tagName) {
        $label = strtoupper($tagName);
        $headings = $fetched ? getTextBetweenTags($pageContents, $tagName) : [];

        $column = '';
        foreach ($headings as $heading) {
            // Decide emptiness after trimming, with a strict comparison, so
            // whitespace-only headings count as empty and a literal "0" does not.
            $text = trim($heading);
            $column .= $label . ': ' . ($text !== '' ? $text : 'Empty') . "\n";
        }

        // Double embedded quotes so the surrounding quoted field stays intact.
        $columns[] = str_replace('"', '""', $column);
    }

    printf("%s,\"%s\",\"%s\",\"%s\",\"%s\"\n", $uri, $columns[0], $columns[1], $columns[2], $columns[3]);

    // Brief pause between requests; note the argument is in microseconds.
    usleep(500);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment