Skip to content

Instantly share code, notes, and snippets.

@matiasfrndz
Last active June 14, 2016 06:25
Show Gist options
  • Save matiasfrndz/e7589fc166ea4d788dcbce5928930e54 to your computer and use it in GitHub Desktop.
<?php
// Scraper configuration, keyed by site hostname. Each entry supplies:
//   'start_url' — first listing page to fetch
//   'process'   — XPath selecting links to individual detail pages
//   'context'   — XPath narrowing the detail page before field extraction
//   'fields'    — per-field XPath selectors, evaluated inside 'context'
$data_source = array();

// kmuratgeber.ch — note this source defines no 'follow' selector,
// so the crawler only processes the single start page.
$data_source['kmuratgeber.ch'] = array(
    'start_url' => 'http://kmuratgeber.ch/Handel/Angebote.asp',
    'process'   => "//div[@class='HLCol1']/a",
    'context'   => '//div[@id="BLOCKI"]',
    'fields'    => array(
        'title'       => '//div[@class="HTitel"]',
        'description' => '//div//tr[@id="TextGanz"]/td',
        'region'      => './/div[@class="HDCol2"]//td[@class="HDLabel" and text()="Region:"]/following-sibling::td[@class="HDText"]',
        'price'       => './/div[@class="HDCol2"]//td[@class="HDLabel" and starts-with(text(),"Transaktionswert")]/following-sibling::td[@class="HDText"]',
    ),
);
// www.businessmarket.ch — paginated listing; 'follow' selects the
// "Weiter" (next page) link so the crawler walks every result page.
$data_source['www.businessmarket.ch'] = array(
    'start_url' => 'http://www.businessmarket.ch/Firmen/Suchen.9.html',
    'follow'    => '//div[@class="inserate_list_counter"]/div/a[contains(text(),"Weiter")]',
    'process'   => '//div[contains(@class,"inserat_brick")]/div[@class="details"]/a[@class="button"]',
    // Field selectors are absolute, so the context is the whole document.
    'context'   => '//html',
    'fields'    => array(
        'title'       => '//article/h1',
        'description' => '//div[@class="inserat_detail_left"]',
        'region'      => '//div[@class="inserat_detail_right"]//table[@class="inserat_infos"]//td[text()="Region"]/following-sibling::td',
        'price'       => '//div[@class="inserat_detail_right"]//table[@class="inserat_infos"]//td[text()="Verkaufspreis"]/following-sibling::td',
    ),
);
<?php
// Generic crawler: for every configured source, fetch each listing page,
// follow every detail link, extract the configured fields and print one
// JSON object per item to stdout.
require('./goutte.phar');

use Goutte\Client;
use Symfony\Component\DomCrawler\Link;

// import $data_source config
require_once('./config.php');

foreach ($data_source as $source) {
    $next_url = $source['start_url'];
    // we have to use the same client for crawling the entire site
    // so that we keep the same session during the crawl process
    $client = new Client();
    while (isset($next_url) && $next_url) {
        $crawler = $client->request('GET', $next_url);
        $link_nodes = $crawler->filterXPath($source['process']);
        foreach ($link_nodes as $node) {
            // Resolve the (possibly relative) detail link against the start URL.
            $link = new Link($node, $source['start_url'], 'GET');
            $content_crawler = $client->request('GET', $link->getUri());
            $context = $content_crawler->filterXPath($source['context']);
            $item = array();
            // Record the final URI (after any redirects) alongside the fields.
            $item['uri'] = $client->getRequest()->getUri();
            foreach ($source['fields'] as $field => $xpath) {
                $item[$field] = getValue($context, $xpath);
            }
            echo json_encode($item) . "\n";
        }
        // Assume we are done; a matching 'follow' link re-arms the loop below.
        unset($next_url);
        // BUGFIX: 'follow' is optional (the kmuratgeber.ch source does not
        // define it); the original unconditional read raised an
        // undefined-index notice on every page of such sources.
        if (!empty($source['follow'])) {
            $follow_link = $crawler->filterXPath($source['follow']);
            if (count($follow_link)) {
                $next_url = new Link($follow_link->getNode(0), $source['start_url'], 'GET');
                $next_url = $next_url->getUri();
            }
        }
    }
}
/**
 * Extract the text content of the first node matched by an XPath selector.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $crawler context to search in
 * @param string $xpath XPath expression evaluated against $crawler
 * @return string matched node text, or '' when nothing matches
 */
function getValue($crawler, $xpath) {
    $matches = $crawler->filterXPath($xpath);
    return count($matches) ? $matches->text() : '';
}
<?php
// Remicom pagination works via POSTed form parameters rather than a plain
// "next" link, so this site gets its own driver script instead of the
// generic config.php crawler.
require('./goutte.phar');

use Goutte\Client;
use Symfony\Component\DomCrawler\Link;

date_default_timezone_set('Europe/Zurich');

// Crawl configuration for www.remicom.com (business transfer listings).
$source = array(
    'start_url' => 'http://www.remicom.com/de/assets/list#top_window',
    // Presence of this "show more" link signals that another page exists.
    'follow'    => '//div[contains(@class,"show_more")]/a[contains(@class,"stndrd-btn-highlight")]//following-sibling::a',
    'process'   => '//div[@id="vignettes"]/div/a[position()=1]',
    'context'   => '//div[@id="detail_objet"]',
    'fields'    => array(
        'title'       => '//div[contains(@class,"main-content-wrap")]/div[contains(@class,"title")]',
        // BUGFIX: the original pattern was missing the closing quote after
        // full_desc ('contains(@class,"full_desc)]'), which is an invalid
        // XPath expression and would make every description lookup fail.
        'description' => '//div[@id="fiche_block_info"]/div[contains(@class,"full_desc")]',
        // NOTE(review): the two selectors below appear copied verbatim from
        // the businessmarket.ch config ("inserat_detail_right") — verify
        // they actually match remicom's markup.
        'region'      => '//div[@class="inserat_detail_right"]//table[@class="inserat_infos"]//td[text()="Region"]/following-sibling::td',
        'price'       => '//div[@class="inserat_detail_right"]//table[@class="inserat_infos"]//td[text()="Verkaufspreis"]/following-sibling::td',
    ),
);
// Form fields POSTed to the remicom search endpoint on every request;
// 'page2' drives pagination and is advanced by the crawl loop below.
$params = array(
    'country'                                         => 'CH',
    'transmission_d_entreprises'                      => '{"35":"Unternehmen", "36":"KMU", "37":"Firma" }',
    'moteur2'                                         => 'ok',
    'list_posted_id_type_transmission_d_entreprises'  => null,
    'canton'                                          => '-',
    'agency'                                          => '-',
    'reference'                                       => null,
    'offset2'                                         => 0,
    'lastoffer2'                                      => null,
    'page2'                                           => 1,
);
// (removed unused local $offset — it was assigned once and never read)
$next_url = $source['start_url'];
// we have to use the same client for crawling the entire site
// so that we keep the same session during the crawl process
$client = new Client();

// $params['page2'] doubles as the loop condition: it is incremented while
// the "show more" link is present and unset after the last page.
while (isset($params['page2']) && $params['page2']) {
    $crawler = $client->request('POST', $next_url, $params);
    $link_nodes = $crawler->filterXPath($source['process']);
    foreach ($link_nodes as $node) {
        // Detail links are relative, so resolve them against the site root.
        $link = new Link($node, 'http://www.remicom.com/', 'GET');
        // getUri() — normalized casing; the original mixed getURI()/getUri().
        echo $link->getUri() . "\n";
        $content_crawler = $client->request('GET', $link->getUri());
        $context = $content_crawler->filterXPath($source['context']);
        // echo $content_crawler->html() ."\n";
        $item = array();
        $item['uri'] = $client->getRequest()->getUri();
        foreach ($source['fields'] as $field => $xpath) {
            $item[$field] = getValue($context, $xpath);
        }
        echo json_encode($item) . "\n";
    }
    // Guard with !empty() so a config without 'follow' stops after one page
    // instead of raising an undefined-index notice.
    if (!empty($source['follow'])) {
        $follow_link = $crawler->filterXPath($source['follow']);
        if (count($follow_link)) {
            $params['page2'] += 1;
        } else {
            // No "show more" link: this was the last page, end the loop.
            unset($params['page2']);
        }
    }
}
/**
 * Return the text of the first node selected by $xpath within $crawler,
 * or an empty string when the selector matches nothing.
 *
 * @param \Symfony\Component\DomCrawler\Crawler $crawler
 * @param string $xpath
 * @return string
 */
function getValue($crawler, $xpath) {
    $nodes = $crawler->filterXPath($xpath);
    if (count($nodes) === 0) {
        return '';
    }
    return $nodes->text();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment