Skip to content

Instantly share code, notes, and snippets.

@rcarvs
Created January 20, 2019 11:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcarvs/c2d8d1aa024d8c1f74b50fc3481e6487 to your computer and use it in GitHub Desktop.
Save rcarvs/c2d8d1aa024d8c1f74b50fc3481e6487 to your computer and use it in GitHub Desktop.
A very small crawler in PHP
<?php
/*
------------------------------------
HOW TO USE
------------------------------------
IN YOUR PHP FILE, INCLUDE THE CRAWLER FILE AND FOLLOW THE EXAMPLE BELOW:
$crawler = new crawler();
$crawler->setDomainInitial('http://www.ufsj.edu.br');
//if set to true, the crawler only queues and visits links that belong to the initial domain
$crawler->setJustOlyLinksInInitialDomail(true);
$crawler->percorreWeb();
*/
/**
 * Plain data holder for one fetched page: the URL it was requested from
 * and the raw body that came back.
 *
 * Both fields start out as null and are filled in through the setters.
 */
class pagina
{
    /** @var mixed Address the page was requested from (null until set). */
    private $url;

    /** @var mixed Raw body of the page (null until set). */
    private $conteudo;

    /**
     * Stores the address of the page.
     *
     * @param mixed $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * @return mixed The address previously given to setUrl(), or null.
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Stores the body of the page.
     *
     * @param mixed $conteudo
     */
    public function setConteudo($conteudo)
    {
        $this->conteudo = $conteudo;
    }

    /**
     * @return mixed The body previously given to setConteudo(), or null.
     */
    public function getConteudo()
    {
        return $this->conteudo;
    }
}
/**
 * Very small web crawler.
 *
 * Starting from the URL given to setDomainInitial(), percorreWeb() downloads
 * each queued page, extracts the absolute http/https links found in its
 * <a href="..."> attributes and appends the new ones to the queue. Every link
 * that gets queued is reported with echo.
 */
class crawler
{
    /** @var mixed When truthy, only links under the initial domain are queued. */
    private $somenteDominioInicial = false;

    /** @var mixed Start URL given to setDomainInitial() (null until set). */
    private $dominioInicial;

    /** @var array Pending URLs in discovery order; an entry is removed once its page was fetched. */
    private $links = array();

    /**
     * @var array Every URL ever queued, stored as array keys.
     *
     * Kept separately from $links because $links loses its entries as pages
     * are visited; de-duplicating against $links alone would let two pages
     * that link to each other be re-queued forever.
     */
    private $conhecidos = array();

    /**
     * @return mixed The flag set by setJustOlyLinksInInitialDomail() (false by default).
     */
    public function getSomenteDominioInicial()
    {
        return $this->somenteDominioInicial;
    }

    /**
     * Restricts (or not) the crawl to links under the initial domain.
     *
     * @param mixed $somenteDominioInicial Truthy to queue only links that start with the initial domain.
     */
    public function setJustOlyLinksInInitialDomail($somenteDominioInicial)
    {
        $this->somenteDominioInicial = $somenteDominioInicial;
    }

    /**
     * Sets the start URL and puts it on the queue.
     *
     * @param string $dominioInicial e.g. 'http://www.ufsj.edu.br'
     */
    public function setDomainInitial($dominioInicial)
    {
        $this->dominioInicial = $dominioInicial;
        $this->addLink($dominioInicial);
    }

    /**
     * @return mixed The start URL given to setDomainInitial(), or null.
     */
    public function getDominioInicial()
    {
        return $this->dominioInicial;
    }

    /**
     * Appends a URL to the queue unconditionally and remembers it as known,
     * so that link extraction will not queue it a second time.
     *
     * @param mixed $link
     */
    public function addLink($link)
    {
        $this->links[] = $link;
        if (is_scalar($link)) {
            $this->conhecidos[(string) $link] = true;
        }
    }

    /**
     * Downloads one URL with a GET request (20 second timeout).
     *
     * @param mixed $url
     * @return pagina|false A pagina carrying the URL and body, or false when
     *                      the URL is empty or the download failed.
     */
    private function buscaConteudoPagina($url)
    {
        if (empty($url)) {
            return false;
        }
        try {
            $opts = array('http' =>
                array(
                    'method' => 'GET',
                    'timeout' => 20
                )
            );
            $context = stream_context_create($opts);
            $retorno = file_get_contents($url, false, $context);
        } catch (Exception $ex) {
            // file_get_contents() reports failures through its return value;
            // this only triggers if an installed error handler throws.
            return false;
        }
        // Strict check: an empty body or the body "0" is a successful download.
        if ($retorno === false) {
            return false;
        }
        $pagina = new pagina();
        $pagina->setUrl($url);
        $pagina->setConteudo($retorno);
        return $pagina;
    }

    /**
     * Runs the crawl: visits the queued URLs in index order, including the
     * ones discovered along the way, until the end of the queue is reached.
     *
     * A URL whose page was fetched is removed from the queue. A URL that
     * could not be fetched stays queued, so a later call tries it again.
     */
    public function percorreWeb()
    {
        $contador = 0;
        while (!empty($this->links)) {
            // Indexes only grow, so the last key marks the end of the queue.
            end($this->links);
            if ($contador > key($this->links)) {
                break;
            }
            if (isset($this->links[$contador])) {
                $pagina = $this->buscaConteudoPagina($this->links[$contador]);
                if (is_object($pagina)) {
                    $this->buscaLinksConteudo($pagina);
                    // PUT THE CONTENT-PROCESSING FUNCTION HERE
                    unset($this->links[$contador]);
                }
            }
            $contador++;
        }
    }

    /**
     * @return string Line terminator for the echoed progress messages:
     *                "\n" under the CLI SAPI, "</br>" otherwise.
     */
    public function getLb()
    {
        if (PHP_SAPI == "cli") {
            return "\n";
        }
        // NOTE(review): "</br>" is not a valid tag ("<br>" would be); left
        // unchanged because callers may depend on the exact string.
        return "</br>";
    }

    /**
     * Extracts the links of a page and queues the ones not seen before.
     *
     * Only double-quoted, absolute http:// or https:// values of an href
     * attribute inside an <a ...> tag are considered. Each queued link is
     * echoed followed by " foi adicionado na fila" and getLb().
     *
     * @param object $pagina Any object exposing getConteudo(), normally a pagina.
     */
    public function buscaLinksConteudo($pagina)
    {
        $conteudo = (string) $pagina->getConteudo();
        // "<a" must be followed by whitespace (so <abbr>, <article> ... do not
        // match) and [^>] keeps the href inside the same tag.
        $padrao = '~<a\s(?:[^>]*?\s)?href="(https?://[^"]+)"~i';
        $total = preg_match_all($padrao, $conteudo, $achados);
        if ($total === false || $total === 0) {
            return;
        }
        $restrito = $this->getSomenteDominioInicial();
        foreach ($achados[1] as $link) {
            if (isset($this->conhecidos[$link])) {
                continue;
            }
            if ($restrito && !$this->pertenceAoDominioInicial($link)) {
                continue;
            }
            $this->addLink($link);
            echo $this->formataParaSaida($link) . " foi adicionado na fila" . $this->getLb();
        }
    }

    /**
     * Tells whether a link lies under the initial domain.
     *
     * The link must be longer than the initial domain and start with it, and
     * the prefix must end at a URL boundary ('/', '?' or '#'), so that
     * 'http://site.evil.com' is not accepted for 'http://site'. With no
     * initial domain set, every link is accepted.
     *
     * @param string $link
     * @return bool
     */
    private function pertenceAoDominioInicial($link)
    {
        $dominio = (string) $this->getDominioInicial();
        $tamanho = strlen($dominio);
        if ($tamanho === 0) {
            return true;
        }
        if (strlen($link) <= $tamanho || strncmp($link, $dominio, $tamanho) !== 0) {
            return false;
        }
        if (substr($dominio, -1) === '/') {
            return true;
        }
        return strpos('/?#', $link[$tamanho]) !== false;
    }

    /**
     * Prepares a crawled (untrusted) URL for echo: unchanged under the CLI
     * SAPI, HTML-escaped otherwise because the output is then an HTML page.
     *
     * @param string $texto
     * @return string
     */
    private function formataParaSaida($texto)
    {
        if (PHP_SAPI == "cli") {
            return $texto;
        }
        return htmlspecialchars($texto, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment