Skip to content

Instantly share code, notes, and snippets.

@neerolyte
Created March 3, 2014 04:40
Show Gist options
  • Save neerolyte/9318476 to your computer and use it in GitHub Desktop.
Save neerolyte/9318476 to your computer and use it in GitHub Desktop.
Quick and dirty PHP code that tries to extract high-level subpaths from Google's index
<?php
/**
 * Perform a GET request with cURL and return the response broken into parts.
 *
 * The response headers are requested along with the body and then separated
 * using the header size that cURL reports for the transfer.
 *
 * @param string $url URL to fetch.
 * @return array Keys: 'info' (curl_getinfo() result), 'header', 'body',
 *               'error' (curl_error() text) and 'errorno' (curl_errno() code).
 */
function docurl($url) {
    $handle = curl_init();
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_URL, $url);
    // Headers are included in the returned string so they can be split off below.
    curl_setopt($handle, CURLOPT_HEADER, true);

    $response = curl_exec($handle);
    $info = curl_getinfo($handle);
    $headerSize = $info['header_size'];

    $result = array(
        'info' => $info,
        'header' => substr($response, 0, $headerSize),
        'body' => substr($response, $headerSize),
        'error' => curl_error($handle),
        'errorno' => curl_errno($handle),
    );

    curl_close($handle);
    return $result;
}
/**
 * Run one Google "site:" search and collect the distinct sub-paths found in
 * the <cite> elements of the result page.
 *
 * Each cite is cut down to everything before its second slash; cites with
 * fewer than two slashes are kept unchanged. The query URL is echoed before
 * the request is made.
 *
 * @param string $main    Value given to the "site:" operator.
 * @param array  $ignores Paths to exclude; each one is added as a "-site:" term.
 * @param string $filter  Extra query text appended after the "site:" term.
 * @param int    $start   Result offset sent as the "start" query parameter.
 * @return array Unique truncated cites (array keys are not reindexed); empty
 *               when the request produced no body.
 */
function extractSubPaths($main, $ignores, $filter = '', $start = 0) {
    $ignores = array_map(function($v) { return "+-site:$v"; }, $ignores);
    $url = "http://www.google.com/search?q="
        ."site:$main+$filter"
        .implode('', $ignores)
        ."&start=$start";
    echo "Testing URL: $url\n";
    $res = docurl($url);

    // DOMDocument::loadHTML() rejects an empty source (warning on PHP 7,
    // ValueError on PHP 8), so a failed or empty response simply has no paths.
    if (!is_string($res['body']) || $res['body'] === '') {
        return array();
    }

    $doc = new DOMDocument();
    $doc->loadHTML($res['body']);
    $xpath = new DOMXPath($doc);
    $nodes = $xpath->query("//cite");
    $cites = array();
    foreach ($nodes as $node) {
        // Drop a leading scheme first; otherwise "https://host/a/b" would be
        // truncated to "https:/" because "//" already supplies two slashes.
        $cite = preg_replace('%^[a-z][a-z0-9+.-]*://%i', '', $node->nodeValue);
        // remove everything after the second slash
        $cite = preg_replace('%^([^/]*/[^/]*)/.*$%', '\1', $cite);
        $cites[] = $cite;
    }
    return array_unique($cites);
}
/**
 * Query Google for "site:$path" and return the result count shown in the
 * page's resultStats element.
 *
 * @param string $path Value given to the "site:" operator.
 * @return int Parsed result count; 0 when the response has no body, the
 *             resultStats element is missing, or no number can be read.
 */
function extractPathCount($path) {
    $url = "http://www.google.com/search?q="
        ."site:$path";
    $res = docurl($url);

    // DOMDocument::loadHTML() rejects an empty source (warning on PHP 7,
    // ValueError on PHP 8); treat a failed or empty response as "no results".
    if (!is_string($res['body']) || $res['body'] === '') {
        return 0;
    }

    $doc = new DOMDocument();
    $doc->loadHTML($res['body']);
    $xpath = new DOMXPath($doc);
    $nodes = $xpath->query("//div[@id='resultStats']/text()");

    // item(0) is null when the page carries no resultStats text node.
    $first = $nodes->item(0);
    if ($first === null) {
        return 0;
    }
    $countText = $first->wholeText;

    // Accept "About 1,234 results", "12 results", "1 result" and variants with
    // trailing text such as a timing note, instead of one exact phrasing.
    if (preg_match('%([0-9][0-9,]*)\s+results?%', $countText, $match)) {
        return intval(str_replace(',', '', $match[1]));
    }

    // Fallback: strip thousands separators and take any leading integer.
    return intval(str_replace(',', '', $countText));
}
// start up
// Collect libxml parse errors internally instead of emitting a warning for
// every piece of malformed markup in the fetched pages.
libxml_use_internal_errors(true);

// The site to inspect is mandatory; without it every query would be "site:".
if (!isset($argv[1]) || $argv[1] === '') {
    file_put_contents('php://stderr', "Usage: php <script> <site> [filter]\n");
    exit(1);
}
$start = $argv[1];
$filter = isset($argv[2])?$argv[2]:'';
$paths = array();
do {
    $knownBefore = count($paths);
    // try a couple of offsets every time to pick up a couple more results
    for ($i = 0; $i < 50; $i+=10) {
        // Paths found so far are passed as exclusions so later queries
        // surface paths that have not been seen yet.
        $paths = array_merge($paths, extractSubPaths($start, $paths, $filter, $i));
    }
    $paths = array_unique($paths);
    sleep(1);
    // Keep going only while a full pass discovered at least one new path.
    // Checking just the last offset would stop early when that page is empty
    // and could loop forever when it keeps returning already-known paths.
} while (count($paths) > $knownBefore);
foreach ($paths as $path) {
    echo "$path: ".extractPathCount($path)."\n";
}
// echo implode("\n", $paths);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment