Created
June 4, 2015 13:40
-
-
Save hans2103/5300f42bef6dc4893327 to your computer and use it in GitHub Desktop.
get Meta Description from websites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* get Meta Tags from weblinks | |
* | |
* input for this script is data.txt | |
* data.txt contains weblinks. Every line a new weblink. | |
* | |
* use this php script from command line | |
 * $ cat data.txt | while read LINE; do php ./getMetaTags.php "$LINE" >> output.csv; done
* | |
* script was needed to move Byte.nl/wiki to Byte.nl/kennisbank | |
*/ | |
/**
 * Download the contents of a URL with cURL, following HTTP redirects.
 *
 * Response headers are not included in the returned data.
 *
 * @param string $url Address to fetch.
 *
 * @return string|bool Result of curl_exec(): the response body on success, false on failure.
 */
function file_get_contents_curl($url)
{
    $handle = curl_init();

    // Same options, in the same order, as setting them one at a time.
    curl_setopt_array($handle, [
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => 1,
    ]);

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
// Command-line entry point: fetch the URL given as the first argument and print
// one semicolon-separated CSV row of the form "slug";"meta description".

if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: php getMetaTags.php <url>\n");
    exit(1);
}

$url = $argv[1];
$pattern = "https://www.byte.nl/wiki/";
// The slug is whatever follows the wiki prefix, lower-cased.
// NOTE(review): the prefix is stripped by length only, so every input URL is
// assumed to start with $pattern - confirm against data.txt.
$slug = strtolower(substr($url, strlen($pattern)));

// Defaults so that pages without these tags still produce a well-formed row.
$title = '';
$description = '';
$keywords = '';

$html = file_get_contents_curl($url);

if (is_string($html) && $html !== '') {
    // Collect libxml parse errors internally instead of silencing them with "@";
    // real-world HTML is rarely well-formed.
    $previousErrorMode = libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    $doc->loadHTML($html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // A document may have no <title>; item(0) is null in that case.
    $titleNode = $doc->getElementsByTagName('title')->item(0);
    if ($titleNode !== null) {
        $title = $titleNode->nodeValue;
    }

    // If a tag occurs more than once, the last occurrence wins.
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        // Match the name case-insensitively (e.g. name="Description").
        $name = strtolower($meta->getAttribute('name'));

        if ($name === 'description') {
            $description = $meta->getAttribute('content');
        } elseif ($name === 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }
} else {
    // Keep one output row per input line, but report the failure on stderr
    // so it does not end up in the redirected CSV.
    fwrite(STDERR, "Could not fetch $url\n");
}

/*echo "URL: $url". '<br/><br/>';
echo "Title: $title". '<br/><br/>';
echo "Description: $description". '<br/><br/>';
echo "Keywords: $keywords";*/

// Double any embedded quotes so the always-quoted CSV fields stay parseable.
$csvSlug = str_replace('"', '""', $slug);
$csvDescription = str_replace('"', '""', $description);

echo "\"$csvSlug\";\"$csvDescription\"\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment