Skip to content

Instantly share code, notes, and snippets.

@diman3210
Created May 2, 2017 17:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save diman3210/e9f0a6766fbcd73db295b83380dd2bf1 to your computer and use it in GitHub Desktop.
Save diman3210/e9f0a6766fbcd73db295b83380dd2bf1 to your computer and use it in GitHub Desktop.
<?php
// Bootstrap for the scraper: raises runtime limits, loads the helper libraries,
// opens the MySQL connection ($link) and the append-only log file ($fd) that
// the rest of the script relies on.
set_time_limit(6000);
ini_set('memory_limit', '2048M');
error_reporting(E_ALL);
// Keep running even if the client that triggered the script disconnects.
ignore_user_abort(true);
// Script directory with forward slashes, used to build file paths.
define('__MYDIR__' , str_replace('\\', '/', __DIR__ ));
require_once 'phpQuery/phpQuery-onefile.php';
require_once 'recognize/recognize.php';
require_once 'functions/proxy/getproxy.php';

// Fail fast: every later step needs a live connection, and a failed connect
// would otherwise only surface as an unrelated error on the first query.
$link = mysqli_connect('localhost', 'root', '', 'metalloprokat');
if (!$link) {
    die ("Не могу подключиться к базе ".mysqli_connect_error());
}

// Binary append mode so entries from earlier runs are preserved.
$fd = fopen('parser_log.txt', 'ab');
if (!$fd) {
    die ("Не могу открыть parser_log.txt");
}
// Pick the next page to parse: take one row with parse = 0, trying the deepest
// sub-category level first (subcat4_name filled in) and falling back level by
// level down to subcat1_name.
$strs = [];
for ($a = 4; $a >= 1 && !$strs; $a--) {
    $query = "SELECT * FROM `strs` WHERE `parse` = 0 AND `subcat{$a}_name` != '' LIMIT 1";
    $result = mysqli_query($link, $query) or die ("1 Не могу взять файл из базы ".mysqli_error($link));
    $strs = mysqli_fetch_all($result, MYSQLI_ASSOC);
}
if (!$strs) {
    // No unparsed row at any level. Stop here instead of running on into a
    // query against a non-existent `subcat0_name` column.
    exit ("Конец");
}
// End of page selection
// Unpack the selected `strs` row into the plain variables used further down.
$row = $strs[0];

$id          = $row['id'];
$url         = $row['url'];
$next_parsed = $row['next_parsed'];
$num         = $row['count_str'];

// Category trail (name + URL) for each of the four sub-category levels.
$subcat1_name = $row['subcat1_name'];
$subcat2_name = $row['subcat2_name'];
$subcat3_name = $row['subcat3_name'];
$subcat4_name = $row['subcat4_name'];
$subcat1_url  = $row['subcat1_url'];
$subcat2_url  = $row['subcat2_url'];
$subcat3_url  = $row['subcat3_url'];
$subcat4_url  = $row['subcat4_url'];

// Flag the row as taken (parse = 1) before any network work starts.
$query = "UPDATE `strs` SET `parse` = 1 WHERE `id` = '$id'";
if (!mysqli_query($link, $query)) {
    die ("Не могу обновить parse ".mysqli_error($link));
}
// Fetch a proxy record through getproxy() and unpack its fields.
$proxy = getproxy();

// Identity of the record.
$proxy_id   = $proxy['id'];
$proxy_type = $proxy['type'];

// Endpoint and credentials.
$proxy_host     = $proxy['host'];
$proxy_port     = $proxy['port'];
$proxy_username = $proxy['username'];
$proxy_pass     = $proxy['pass'];

// User-Agent string stored alongside the proxy.
$user_agent = $proxy['user-agent'];
// End of proxy lookup
// Download the listing page for the current pagination position.
fwrite ($fd, "$url пытаемся скачать ".date("Y-m-d H:i:s")."\r\n");

// Cookies are both read from and written back to this file.
$cookie = __MYDIR__ .'/cookie/cookie1.txt';
$headers = [
    'Referer: http://www.google.com/',
    "User-Agent: $user_agent"
];
// Request the page number stored in next_parsed.
$url = $url."?page=$next_parsed";

// CURLOPT_PROXYTYPE expects one of the integer CURLPROXY_* constants, not the
// constant's name as a string. Resolve the name to its value, falling back to
// CURLPROXY_HTTP when the stored type does not map to a known constant.
// NOTE(review): assumes $proxy['type'] holds a suffix such as HTTP or SOCKS5 — confirm.
$proxy_type_name = 'CURLPROXY_'.strtoupper((string) $proxy_type);
$proxy_type_code = defined($proxy_type_name) ? constant($proxy_type_name) : CURLPROXY_HTTP;

$curl = curl_init();
curl_setopt_array($curl, [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_FOLLOWLOCATION => 1,
    // Proxy routing is currently switched off; uncomment both lines to enable it.
    // CURLOPT_PROXY        => "$proxy_host:$proxy_port",
    // CURLOPT_PROXYUSERPWD => "$proxy_username:$proxy_pass",
    CURLOPT_PROXYTYPE      => $proxy_type_code,
    CURLOPT_HTTPHEADER     => $headers,
    CURLOPT_TIMEOUT        => 20,
    // NOTE(review): TLS peer and host verification are disabled here.
    CURLOPT_SSL_VERIFYPEER => 0,
    CURLOPT_SSL_VERIFYHOST => 0,
    CURLOPT_COOKIEFILE     => $cookie,
    CURLOPT_COOKIEJAR      => $cookie,
]);
$str = curl_exec($curl);
// Turns line breaks into spaces, collapses whitespace runs into one space and
// strips a single leading/trailing whitespace character from scraped text.
$clean_text = function ($text) {
    $text = preg_replace('#\n|\r#', ' ', $text);
    $text = preg_replace('#\s{2,}#', ' ', $text);
    return preg_replace('#^\s|\s$#', '', $text);
};

if (curl_getinfo($curl, CURLINFO_HTTP_CODE) == 200) {
    fwrite ($fd, "$url скачали ".date("Y-m-d H:i:s")."\r\n");
    $pq = phpQuery::newDocument($str);

    // Pagination: collect the page numbers from the pagination links and store
    // the highest one as the page count for this row.
    $num1 = [];
    foreach ($pq->find(".pagination")->find("a") as $page) {
        $href = pq($page)->attr("href");
        // Digits only: the value ends up in an SQL statement below.
        if (preg_match('#^.+\?page=(\d+)$#Uus', (string) $href, $nums) && (int) $nums[1] > 0) {
            $num1[] = (int) $nums[1];
        }
    }
    if (!empty($num1)) {
        $num = max($num1);
    }
    $num = (int) $num;
    $query = "UPDATE `strs` SET `count_str` = '$num' WHERE `id` = '$id'";
    mysqli_query($link, $query) or die("3 Не могу обновить пагинацию ".mysqli_error($link));
    // End of pagination

    // Scraped text can contain quotes, so item lookups and inserts go through
    // prepared statements instead of string-built SQL.
    $find_item = mysqli_prepare($link, "SELECT `id` FROM `items` WHERE `url` = ?")
        or die ("Не могу подготовить поиск продукта ".mysqli_error($link));
    $add_item = mysqli_prepare($link, "INSERT INTO `items` (`subcat1_url`, `subcat2_url`, `subcat3_url`, `subcat4_url`, `subcat1_name`, `subcat2_name`, `subcat3_name`, `subcat4_name`, `title`, `price`, `size`, `city`, `image_link`, `url`, `standart`) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
        or die ("4 Не могу добавить продукт ".mysqli_error($link));

    foreach ($pq->find(".view-product") as $item) {
        $pqitem = pq($item);
        $img = $pqitem->find(".pattern-big img")->attr('src');

        // Drop the hidden currency node so it does not leak into the price text.
        $pqitem->find(".price")->find(".currency.g-hidden")->remove();
        $price = $clean_text($pqitem->find(".price")->text());
        $title = $clean_text($pqitem->find(".product-info .title")->text());

        // Pull the "ГОСТ ..." fragment out of the title; empty when there is none.
        preg_match_all('#^.+(ГОСТ\s?.+)(\s|$|,|;|\.|[a-zA-Zа-яА-ЯёЁ]).*$#Uus', $title, $standart);
        $standart = isset($standart[1][0]) ? $standart[1][0] : '';

        $address = $pqitem->find("[itemprop='address']")->text();
        $size = $pqitem->find("[ng-bind]")->text();
        $size = str_replace('Размер ', '', $size);
        $item_url = (string) $pqitem->find(".product-info .title a")->attr("href");

        // Items are de-duplicated by URL.
        mysqli_stmt_bind_param($find_item, 's', $item_url);
        mysqli_stmt_execute($find_item) or die ("Не могу проверить продукт ".mysqli_stmt_error($find_item));
        mysqli_stmt_store_result($find_item);
        $already_saved = mysqli_stmt_num_rows($find_item) > 0;
        mysqli_stmt_free_result($find_item);

        fwrite ($fd, "$url нашли страницу $item_url".date("Y-m-d H:i:s")."\r\n");
        if (!$already_saved) {
            fwrite ($fd, "$url со страницы $item_url добавляем в базу ".date("Y-m-d H:i:s")."\r\n");
            // strval() turns missing (null) fields into '' — the value the
            // interpolated query used to send for them.
            $values = array_map('strval', [
                $subcat1_url, $subcat2_url, $subcat3_url, $subcat4_url,
                $subcat1_name, $subcat2_name, $subcat3_name, $subcat4_name,
                $title, $price, $size, $address, $img, $item_url, $standart,
            ]);
            mysqli_stmt_bind_param($add_item, str_repeat('s', count($values)), ...$values);
            mysqli_stmt_execute($add_item) or die ("4 Не могу добавить продукт ".mysqli_stmt_error($add_item));
        }
    }
    mysqli_stmt_close($find_item);
    mysqli_stmt_close($add_item);

    // Update the `strs` row: finished when the last page has been parsed,
    // otherwise advance to the next page and release the row again.
    if ($next_parsed >= $num) {
        $query = "UPDATE `strs` SET `parse` = 1 WHERE `id` = '$id'";
        mysqli_query($link, $query) or die ("Не могу обновить parse ".mysqli_error($link));
    }
    else {
        $next_parsed = (int) $next_parsed + 1;
        $query = "UPDATE `strs` SET `next_parsed` = $next_parsed WHERE `id` = '$id'";
        mysqli_query($link, $query) or die ("5 Не могу обновить next_parsed ".mysqli_error($link));
        $query = "UPDATE `strs` SET `parse` = 0 WHERE `id` = '$id'";
        mysqli_query($link, $query) or die ("6 Не могу обновить parse ".mysqli_error($link));
    }
    // End of `strs` update
}
else {
    fwrite ($fd, "$url не удалось скачать страницу ".date("Y-m-d H:i:s")."\r\n");
    // NOTE(review): the `strs` row was set to parse = 1 when it was claimed and
    // is not reset here, so a page that fails to download is not picked up
    // again — confirm this is intended.
    // NOTE(review): the proxy is flagged as banned even though the CURLOPT_PROXY
    // option is commented out in the request setup — confirm.
    $banned_host = mysqli_real_escape_string($link, (string) $proxy_host);
    $query = "UPDATE `proxy` SET `banned` = 1 WHERE `host` = '$banned_host'";
    mysqli_query($link, $query) or die ("7 Не могу обновить proxy ".mysqli_error($link));
}
// Release the log file handle and the cURL handle before the script ends.
fclose($fd);
curl_close($curl);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment