Skip to content

Instantly share code, notes, and snippets.

@Netcelal
Created May 26, 2012 15:03
Show Gist options
  • Save Netcelal/2794255 to your computer and use it in GitHub Desktop.
Save Netcelal/2794255 to your computer and use it in GitHub Desktop.
PHP class for parsing HTML and extracting links
<?php
/**
 * Small crawler helper: extracts the links of an HTML document that point to
 * the same site as the page they were found on.
 *
 * "Same site" is decided by comparing the last two dot-separated labels of
 * the host names (see GetDomain()). This is a heuristic: it treats
 * "www.example.com" and "example.com" as the same site, but it does not know
 * about multi-label public suffixes such as "co.uk".
 */
class helper {
    /**
     * File extensions (lower case) of resources that are never returned by
     * getLinks(): images, scripts, stylesheets and PDF documents.
     */
    private $skippedExtensions = array('gif', 'png', 'jpg', 'jpeg', 'js', 'css', 'pdf');

    //---------------------------------------------------------------------------
    public function __construct() {
    }

    //---------------------------------------------------------------------------
    /**
     * Collect the internal, crawlable links found in an HTML document.
     *
     * Every <a href="..."> is resolved with checkIfIntern(); links that are
     * external, fail isValidURL(), or point to a skipped resource type are
     * dropped. Order of appearance is preserved and duplicates are kept.
     *
     * @param string $html   HTML markup to scan.
     * @param string $parent Absolute URL of the page the markup came from.
     * @return array List of absolute URL strings (possibly empty).
     */
    public function getLinks($html, $parent) {
        $ret = array();

        // DOMDocument::loadHTML() rejects empty input, so bail out early.
        if (!is_string($html) || trim($html) === '') {
            return $ret;
        }

        $dom = new DOMDocument();
        // Must be set before parsing to have any effect.
        $dom->preserveWhiteSpace = false;

        // Real-world HTML is rarely valid: collect parser diagnostics
        // internally instead of silencing everything with "@".
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        foreach ($dom->getElementsByTagName('a') as $tag) {
            $href = trim($tag->getAttribute('href'));
            if ($href === '') {
                continue;
            }

            $u = $this->checkIfIntern($href, $parent);
            if ($u === false || $u === '') {
                continue; // external or unresolvable link
            }
            if (!$this->isValidURL($u)) {
                continue;
            }
            if (in_array($this->checkExtension($u), $this->skippedExtensions, true)) {
                continue;
            }

            $ret[] = $u;
        }

        return $ret;
    }

    //---------------------------------------------------------------------------
    /**
     * Tell whether a string contains something that looks like an http(s)/ftp
     * URL or a "www." address. The pattern is not anchored, so surrounding
     * text does not prevent a match.
     *
     * @param string $url Candidate URL.
     * @return bool True when the pattern matches, false otherwise.
     */
    public function isValidURL($url) {
        if (!is_string($url)) {
            return false;
        }
        $pattern = "/\b(?:(?:https?|ftp):\/\/|www\.)[-a-z0-9+&@#\/%?=~_|!:,.;]*[-a-z0-9+&@#\/%=~_|]/i";
        return preg_match($pattern, $url) === 1;
    }

    //---------------------------------------------------------------------------
    /**
     * Return the lower-cased file extension of the path part of a URL.
     *
     * Query string and fragment are ignored, and the host name is never
     * mistaken for a file name ("http://example.com" has no extension).
     *
     * @param string $url URL or path.
     * @return string Extension without the dot, or '' when there is none.
     */
    public function checkExtension($url) {
        if (!is_string($url) || $url === '') {
            return '';
        }

        $path = parse_url($url, PHP_URL_PATH);
        if ($path === false) {
            // Seriously malformed URL: fall back to cutting off query/fragment.
            $path = preg_replace('/[?#].*$/s', '', $url);
        }
        if (!is_string($path) || $path === '') {
            return '';
        }

        return strtolower(pathinfo($path, PATHINFO_EXTENSION));
    }

    //---------------------------------------------------------------------------
    /**
     * Decide whether a link is internal to the site of $parent and return it
     * as an absolute URL if so.
     *
     * - Root-relative links ("/path") are prefixed with the scheme, host and
     *   port of $parent.
     * - Protocol-relative links ("//host/path") inherit the scheme of $parent
     *   and are then checked like any absolute link.
     * - Any other link is internal when GetDomain() yields the same
     *   non-empty value for the link and for $parent.
     *
     * @param string $url    Link as found in the document.
     * @param string $parent Absolute URL of the containing page.
     * @return string|false The (absolute) internal URL, or false when the link
     *                      is external or cannot be resolved.
     */
    public function checkIfIntern($url, $parent) {
        if (!is_string($url) || !is_string($parent)) {
            return false;
        }

        if (substr($url, 0, 1) === '/') {
            $parts = parse_url($parent);
            if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
                // Without an absolute parent there is nothing to resolve against.
                return false;
            }

            if (substr($url, 0, 2) === '//') {
                // Protocol-relative: may point to another host, so fall
                // through to the domain comparison below.
                $url = $parts['scheme'] . ':' . $url;
            } else {
                $base = $parts['scheme'] . '://' . $parts['host'];
                if (isset($parts['port'])) {
                    $base .= ':' . $parts['port'];
                }
                return $base . $url;
            }
        }

        $url_domain = $this->GetDomain($url);
        $parent_domain = $this->GetDomain($parent);

        // An empty domain means "could not be determined"; never treat two
        // failures as a match.
        if ($url_domain !== '' && $url_domain === $parent_domain) {
            return $url;
        }
        return false;
    }

    //---------------------------------------------------------------------------
    /**
     * Extract the registrable-looking domain (last two labels of the host)
     * from a URL, lower-cased.
     *
     * A leading "scheme://" and a trailing ":port" are ignored. Hosts without
     * a dot (e.g. "localhost") are returned as-is.
     *
     * @param string $domainb URL or host name.
     * @return string The domain, or '' when no host could be found.
     */
    public function GetDomain($domainb) {
        if (!is_string($domainb) || $domainb === '') {
            return '';
        }

        // Optional scheme, then everything up to the first "/", "?" or "#".
        if (!preg_match('@^(?:[a-z][a-z0-9+.\-]*://)?([^/?#]+)@i', $domainb, $host)) {
            return '';
        }

        $name = strtolower(preg_replace('/:\d+$/', '', $host[1]));
        if ($name === '') {
            return '';
        }

        if (preg_match('/[^.]+\.[^.]+$/', $name, $matches)) {
            return $matches[0];
        }
        return $name;
    }
    //---------------------------------------------------------------------------
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment