Skip to content

Instantly share code, notes, and snippets.

@ivansky
Created May 26, 2015 11:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivansky/b5647efec956eb15385f to your computer and use it in GitHub Desktop.
Save ivansky/b5647efec956eb15385f to your computer and use it in GitHub Desktop.
Yandex Wordstat Collector
<?php
// Connect to the local Redis server and register it with YADWordstat
// as its storage service.
$redis = new Redis();
$redis->connect('127.0.0.1');
YADWordstat::redis($redis);

// API client bound to a single Yandex login.
$yandexLogin = 'some_user';
$yad = new YADWordstat($yandexLogin);

// Script-wide bookkeeping containers.
$words                 = []; // keyword id => keyword data
$reports               = []; // report ids, bound by id
$waiting               = []; // keyword hash => id of the report it is waiting on
$complete              = []; // crc => true for every statistic already collected
$done                  = []; // ids of reports that are done
$report_region         = []; // report id => region id
$words_original_binded = [];

// Signature: YADWord::get($keywordString, $regionId, $stat, $statStrict);
// A keyword registered with both stats is treated as complete and is skipped.
YADWord::get('some existing keyword', 213, 1000, 100);

// A keyword registered without stats is queued for collection.
YADWord::get('prepared keyword', 213);

$keywordTotal = YADWord::count();     // distinct keywords
$pairTotal    = YADWord::count(true); // (keyword + regionId) pairs
printf('Only %d keywords and %d pairs count', $keywordTotal, $pairTotal);

$incomplete = YADWord::findIncomplete();
printf('Left %d pairs of keywords', count($incomplete));

// Deadline: start of the next day minus 10 minutes.
$midnightToday = strtotime(date('Y-m-d'));
$TOMORROW = $midnightToday + (24 * 3600) - (10 * 60);
// Main polling loop.
//
// Every 5 seconds:
//   1. fetch the list of Wordstat reports that exist for the account;
//   2. forget "waiting" keywords whose report no longer exists;
//   3. if fewer than 5 reports exist and there are keywords that still need
//      stats, create one more report (up to 5 keywords, each sent in its
//      plain and its strict form);
//   4. otherwise read every report with status 'Done', store the stats on the
//      matching YADWord and delete the report.
//
// The loop ends at $TOMORROW, on an API error other than 31, or when there is
// nothing left to request and no report left to collect.
while (true) {
    sleep(5);
    if ($TOMORROW <= time()) break;

    echo 'Get active report list... ';
    $report_list = $yad->getReportList();
    echo count($report_list).' шт'."\r\n";

    // Collect the ids of all current reports and of the finished ones.
    $reports_id = array();
    $done_now = array();
    foreach ($report_list as $ro) {
        $current_id = (int)$ro->ReportID;
        if ($ro->StatusReport == 'Done') $done_now[] = $current_id;
        $reports_id[] = $current_id;
    }

    // Drop waiting entries that point to a report which no longer exists,
    // so those keywords become eligible for a new report.
    foreach ($waiting as $crc => $report_id) {
        if (!in_array($report_id, $reports_id, true)) {
            unset($waiting[$crc]);
        }
    }

    echo 'Completed reports count: '.count($done_now)."\r\n";

    // Build the next batch only when there is room for another report.
    $insert = array();
    $hashes = array();
    $region = false;
    if (count($report_list) < 5) {
        // A report is created for a single region: use the region of the
        // first incomplete keyword that is not already waiting on a report.
        $incomplete = YADWord::findIncomplete();
        foreach ($incomplete as $w) {
            if (!isset($waiting[$w->hash])) {
                $region = $w->region_id;
                break;
            }
        }

        if ($region !== false) {
            $incomplete = YADWord::findIncomplete($region);
            $insert_count = 0;
            foreach ($incomplete as $w) {
                if ($w->region_id != $region) {
                    echo 'region_id('.$w->region_id.') != region('.$region.')'."\r\n";
                    continue;
                }
                if (isset($waiting[$w->hash])) continue;

                $hashes[] = $w->hash;
                $insert[] = $w->word;
                $insert[] = $w->strict;
                printf('++ %s && %s'."\r\n", $w->word, $w->strict);

                $insert_count++;
                if ($insert_count >= 5) break;
            }
        }
    }

    if (count($insert) > 0) {
        $report_id = (int)$yad->createReport($insert, array($region));

        // !!! EXIT when a limit is hit.
        // Error 31 (attempt to create a 6th report) is not fatal: keep working.
        if ($report_id <= 0 && $report_id != -31) {
            echo 'Error '.abs($report_id).', maybe limit. Exit from script.'."\r\n";
            die();
        }

        // Only a real report id may be recorded; a negative value is an
        // error code, not a report.
        if ($report_id > 0) {
            $report_region[$report_id] = $region;
            foreach ($hashes as $hash) {
                $waiting[$hash] = $report_id;
            }
        }
    } elseif (count($done_now)) {
        // Nothing to create (list is full, or every keyword is already
        // waiting): collect the finished reports.
        foreach ($done_now as $report_id) {
            $report_result = $yad->getReport($report_id);
            if (!$report_result) {
                // Fetch failed or report is empty: keep the report so that it
                // is retried on a later iteration instead of deleting it.
                printf('! Report %d could not be read'."\r\n", $report_id);
                continue;
            }

            foreach ($report_result as $wordstat) {
                $region_id = (int)$wordstat->GeoID[0];
                $keyword = mb_strtolower($wordstat->Phrase, 'UTF-8');
                echo '['.$keyword.'] Region: '.$region_id."\r\n";

                // The strict form is the one built with '!' operators.
                $strict = (strpos($keyword, '!') !== false);

                $shows = 0;
                $found = false;
                $searched = isset($wordstat->SearchedWith) ? $wordstat->SearchedWith : array();

                // Prefer the row whose phrase equals the requested keyword.
                foreach ($searched as $stat) {
                    if (mb_strtolower($stat->Phrase, 'UTF-8') === $keyword) {
                        $shows = (int)$stat->Shows;
                        $found = true;
                        echo ' * найдено кол. просмотров: '.$shows."\r\n";
                    }
                }

                // Otherwise fall back to the first row, if there is one.
                if (!$found && isset($searched[0])) {
                    $shows = (int)$searched[0]->Shows;
                    echo ' * не найдено кол. просмотров, указываем первый: '.$shows."\r\n";
                }

                $w = YADWord::findByRegionCRC($keyword, $region_id);
                if (!$w) {
                    printf('! Keyword (%s) not found'."\r\n", $keyword);
                    continue;
                }

                $crc = YADWordstat::crc($keyword, $region_id, $strict);
                $complete[$crc] = true;

                if ($strict) {
                    $w->setStatStrict($shows);
                    // Persist the strict stat here if needed, e.g. an upsert of
                    // (region_id, date('Y-m'), stat_strict, stat_date).
                } else {
                    $w->setStat($shows);
                    // Persist the normal stat here if needed, e.g. an upsert of
                    // (region_id, date('Y-m'), stat, stat_date).
                }
            }

            // The report has been consumed: remove it to free a report slot.
            $yad->deleteReport($report_id);
        }
    } elseif (!count($report_list)) {
        // No keyword left to request and no report left to collect.
        echo 'Nothing left to collect. Exit from script.'."\r\n";
        break;
    }
}
<?php
/**
 * In-memory registry of keywords whose Wordstat statistics are collected.
 *
 * Every instance is one (keyword, region) pair. Instances register themselves
 * in two static indexes on construction: by region id and by keyword crc32.
 * A pair is "complete" when both $stat and $stat_strict are >= 0.
 */
class YADWord {
    /** @var array region id => array(keyword crc32 => YADWord) */
    private static $bind_region = array();
    /** @var array keyword crc32 => list of YADWord (one per region) */
    private static $bind_crc = array();

    /** @var int region id */
    public $region_id = 213;
    /** @var string keyword exactly as passed in */
    public $original;
    /** @var string normalized keyword, see prepare() */
    public $word;
    /** @var string strict form: quoted, every word prefixed with '!' */
    public $strict;
    /** @var int crc32 of the normalized keyword */
    public $crc;
    /** @var int crc32 of the normalized keyword plus region id */
    public $hash;
    /** @var int normal stat, -1 while unknown */
    public $stat = -1;
    /** @var int strict stat, -1 while unknown */
    public $stat_strict = -1;

    /**
     * Build a keyword/region pair and register it in the static indexes.
     *
     * A keyword of more than 7 words is left unregistered and only has
     * $region_id, $original and $word set.
     *
     * @param string $w Keyword
     * @param int|string $r Region code
     * @param int|string $s Normal stat (negative = unknown)
     * @param int|string $ss Strict stat (negative = unknown)
     */
    public function __construct($w, $r, $s, $ss){
        $this->region_id = (int)$r;
        $this->original = $w;
        $this->word = self::prepare($w);

        if(count(explode(' ', $this->word)) > 7) return;

        $dashed = str_replace(' ', '-', $this->word);
        $this->strict = '"!'.str_replace(' ', ' !', $this->word).'"';
        $this->crc = crc32($dashed);
        $this->hash = crc32($dashed.'-'.$this->region_id);
        $this->stat = (int)$s;
        $this->stat_strict = (int)$ss;

        if(!isset(self::$bind_region[$this->region_id])){
            self::$bind_region[$this->region_id] = array();
        }
        // Objects are stored by handle, so no reference operator is needed.
        self::$bind_region[$this->region_id][$this->crc] = $this;
        self::$bind_crc[$this->crc][] = $this;
    }

    /**
     * Count registered keywords.
     *
     * @param boolean $repeat false: distinct keywords; true: keyword/region pairs
     * @return int
     */
    public static function count($repeat = false){
        if(!$repeat) return count(self::$bind_crc);
        $total = 0;
        foreach (self::$bind_crc as $instances){
            $total += count($instances);
        }
        return $total;
    }

    /**
     * Set the normal stat.
     * @param int $stat
     */
    public function setStat($stat){
        $this->stat = $stat;
    }

    /**
     * Set the strict stat.
     * @param int $stat
     */
    public function setStatStrict($stat){
        $this->stat_strict = $stat;
    }

    /**
     * Return the registered pair for the keyword and region, creating it when
     * it does not exist yet. Non-negative stats overwrite those of an
     * existing pair.
     *
     * @param string $w Keyword
     * @param int|string $r Region code
     * @param int $stat
     * @param int $stat_strict
     * @return YADWord
     */
    public static function get($w, $r, $stat = -1, $stat_strict = -1){
        $existing = self::findByRegionCRC($w, $r);
        if($existing){
            if($stat >= 0) $existing->stat = $stat;
            if($stat_strict >= 0) $existing->stat_strict = $stat_strict;
            return $existing;
        }
        return new self($w, $r, $stat, $stat_strict);
    }

    /**
     * Pairs that have both stats.
     *
     * @param int|boolean $region_id Region code, or a falsy value for all regions
     * @param int|boolean $limit Maximum number of results, or false for no limit
     * @return YADWord[]
     */
    public static function findComplete($region_id = false, $limit = false){
        return self::findByStatus(true, $region_id, $limit);
    }

    /**
     * Pairs that miss at least one stat.
     *
     * @param int|boolean $region_id Region code, or a falsy value for all regions
     * @param int|boolean $limit Maximum number of results, or false for no limit
     * @return YADWord[]
     */
    public static function findIncomplete($region_id = false, $limit = false){
        return self::findByStatus(false, $region_id, $limit);
    }

    /**
     * Pairs filtered by completeness.
     *
     * Note: any falsy $region_id (including 0) means "all regions".
     *
     * @param boolean $complete true for complete pairs, false for incomplete ones
     * @param int|boolean $region_id Region code, or a falsy value for all regions
     * @param int|boolean $limit Maximum number of results, or false for no limit
     * @return YADWord[]
     */
    public static function findByStatus($complete, $region_id, $limit = false){
        $l = array();
        $complete = (bool)$complete;

        if($region_id){
            $region_id = (int)$region_id;
            if(!isset(self::$bind_region[$region_id])) return $l;
            $groups = array(self::$bind_region[$region_id]);
        }else{
            $groups = self::$bind_crc;
        }

        foreach($groups as $group){
            foreach($group as $c){
                if(self::isComplete($c) === $complete){
                    $l[] = $c;
                }
                if($limit && $limit <= count($l)) break 2;
            }
        }
        return $l;
    }

    /**
     * Return the pairs registered for a region, optionally filtered.
     *
     * Supported filter keys: 'original' (exact original keyword), 'stat'
     * (truthy: complete pairs only, falsy: incomplete pairs only), 'crc' and
     * 'hash'. Unknown keys are ignored. All given conditions must match.
     *
     * @param int|string $r Region code
     * @param array $filter Search options
     * @return array|boolean Without a filter: crc => YADWord map of the region;
     *                       with a filter: list of matching YADWord (may be
     *                       empty); false when the region is unknown
     */
    public static function findByRegion($r, $filter = array()){
        $r = (int)$r;
        if(!isset(self::$bind_region[$r])) return false;
        if(!count($filter)) return self::$bind_region[$r];

        $l = array();
        foreach(self::$bind_region[$r] as $crc => $c){
            if(self::matchesFilter($c, $crc, $filter)){
                $l[] = $c;
            }
        }
        return $l;
    }

    /**
     * Return the registered pair for a keyword and region.
     * The registered instance itself is returned, not a copy.
     *
     * @param string $w Word
     * @param int|string $r Region code
     * @return YADWord|boolean false when no such pair is registered
     */
    public static function findByRegionCRC($w, $r){
        $word = self::prepare($w);
        $r = (int)$r;
        $crc = crc32(str_replace(' ','-',$word));
        if(isset(self::$bind_region[$r][$crc])){
            return self::$bind_region[$r][$crc];
        }
        return false;
    }

    /**
     * Return all pairs (one per region) registered under a keyword crc32.
     *
     * @param int|string $crc
     * @return array|boolean false when the crc is unknown
     */
    public static function findByCRC($crc){
        $crc = (int)$crc;
        if(isset(self::$bind_crc[$crc])){
            return self::$bind_crc[$crc];
        }
        return false;
    }

    /**
     * Normalize a keyword: replace every run of characters outside the
     * allowed set (English, Russian, Turkish, Kazakh letters and digits)
     * with a single space, trim, and lowercase.
     *
     * @link http://www.unicode.org/charts/
     * @param string $w
     * @return string
     */
    public static function prepare($w){
        // Cyrillic block U+0410..U+045F; matching is case-insensitive.
        $RUSSIAN = '\\x{0410}-\\x{045F}';
        // çğışöü ÇĞİŞÖÜ
        $TURKISH = '\\x{00E7}\\x{011F}\\x{0131}\\x{015F}'.
            '\\x{00F6}\\x{00FC}\\x{00C7}\\x{011E}'.
            '\\x{0130}\\x{015E}\\x{00D6}\\x{00DC}';
        // ӘҒҚҢӨҮҰҺІ әғқңөүұһі
        $KAZAKH = '\\x{04A2}\\x{04A3}\\x{0406}\\x{0456}'.
            '\\x{0492}\\x{0493}\\x{049A}\\x{049B}'.
            '\\x{04AE}\\x{04AF}\\x{04B0}\\x{04B1}'.
            '\\x{04BA}\\x{04BB}\\x{04D8}\\x{04D9}'.
            '\\x{04E8}\\x{04E9}';
        // preg_replace() yields null on malformed UTF-8; the cast keeps the
        // following string functions working on a string.
        $w = (string)preg_replace('/[^a-z0-9'.$RUSSIAN.$TURKISH.$KAZAKH.']+/ui', ' ', $w);
        $w = (string)preg_replace('/[\s]+/', ' ', $w);
        return mb_strtolower(trim($w),'UTF-8');
    }

    /**
     * Tell whether a pair has both stats.
     *
     * @param YADWord $c
     * @return boolean
     */
    private static function isComplete($c){
        return $c->stat >= 0 && $c->stat_strict >= 0;
    }

    /**
     * Check one pair against every condition of a findByRegion() filter.
     *
     * @param YADWord $c Candidate
     * @param int $crc Index key of the candidate
     * @param array $filter Search options
     * @return boolean
     */
    private static function matchesFilter($c, $crc, array $filter){
        foreach($filter as $named => $val){
            switch($named){
                case 'original':
                    if($c->original != $val) return false;
                    break;
                case 'stat':
                    if((bool)$val !== self::isComplete($c)) return false;
                    break;
                case 'crc':
                    if($crc != $val) return false;
                    break;
                case 'hash':
                    if($c->hash != $val) return false;
                    break;
            }
        }
        return true;
    }
}
<?php
/**
 * Minimal client for the Wordstat methods of the Yandex Direct JSON API (v4).
 *
 * Requests are POSTed as JSON to JSON_URL and authenticated with a client
 * certificate located at CERT_DIR/<login>/solid-cert.crt.
 */
class YADWordstat {
    const JSON_URL = 'https://api.direct.yandex.ru/v4/json/';
    const CERT_DIR = '/path/to/cert'; // ./path/to/cert/__login__/solid-cert.crt

    /** @var string Yandex login the client works for */
    public $login;

    private static $pdo = null;
    private static $redis = null;

    /** @var array API error code => message */
    public $errors = array(
        30 => 'Массив слов пустой',
        31 => 'Попытка создать 6-ой отчет',
        56 => 'Превышен лимит запросов',
        71 => 'Параметры запроса указаны неверно',
        152 => 'Не достаточно баллов'
    );

    /** @var array Request envelope; method/param/login are set per call */
    private $request = array('locale' => 'ru');
    private $wait = array();
    private $complete = array();

    /**
     * Register a PDO handle shared by all instances.
     * @param mixed $__pdo
     */
    public static function pdo($__pdo){
        self::$pdo = $__pdo;
    }

    /**
     * Register the Redis connection used as cache by all instances.
     * @param Redis $__redis
     */
    public static function redis(Redis $__redis){
        self::$redis = $__redis;
    }

    /**
     * @param string $login Yandex login
     */
    public function __construct($login){
        $this->login = $login;
        $this->request['login'] = $this->login;
    }

    /**
     * Return the remaining API units of the login.
     *
     * The value is cached in Redis for two hours when a Redis connection is
     * registered. A failed request yields 0 and is not cached.
     *
     * @return int
     */
    public function units(){
        $crc = crc32(sprintf('yad_units_%s', $this->login));

        if(self::$redis){
            $cached = self::$redis->get($crc);
            // false means "no such key"; a cached 0 is a valid value.
            if($cached !== false){
                return (int)$cached;
            }
        }

        $this->request['method'] = 'GetClientsUnits';
        $this->request['param'] = array($this->login);
        $this->request['login'] = $this->login;
        $response = $this->request();

        if(!isset($response->data[0]->UnitsRest)){
            return 0;
        }
        $units = (int)$response->data[0]->UnitsRest;

        if(self::$redis){
            self::$redis->set($crc, $units, 3600 * 2);
        }
        return $units;
    }

    /**
     * Return the list of Wordstat reports of the login.
     *
     * @return array Empty when the response carries no data
     */
    public function getReportList(){
        $this->request['method'] = 'GetWordstatReportList';
        $this->request['param'] = array();
        $this->request['login'] = $this->login;
        $response = $this->request();
        return isset($response->data) ? $response->data : [];
    }

    /**
     * Return the content of a Wordstat report.
     *
     * @param int|string $id Report id
     * @return mixed Report data, or false when the response carries no data
     */
    public function getReport($id){
        $this->request['method'] = 'GetWordstatReport';
        $this->request['param'] = (int)$id;
        $response = $this->request();
        return isset($response->data) ? $response->data : false;
    }

    /**
     * Return the data of a report.
     *
     * getReport() already unwraps the response, so its result is returned
     * as is.
     *
     * @param int|string $id Report id
     * @return mixed Report data, or false when the response carries no data
     */
    public function getReportInfo($id){
        return $this->getReport($id);
    }

    /**
     * Delete a Wordstat report.
     *
     * @param int|string $id Report id
     * @return mixed Response data, or false when the response carries no data
     */
    public function deleteReport($id){
        $this->request['method'] = 'DeleteWordstatReport';
        $this->request['param'] = (int)$id;
        $this->request['login'] = $this->login;
        $response = $this->request();
        return isset($response->data) ? $response->data : false;
    }

    /**
     * Alias of deleteReport().
     * @param int|string $id Report id
     * @return mixed
     */
    public function delReport($id){
        return $this->deleteReport($id);
    }

    /**
     * Alias of deleteReport().
     * @param int|string $id Report id
     * @return mixed
     */
    public function removeReport($id){
        return $this->deleteReport($id);
    }

    /**
     * Create a Wordstat report.
     *
     * @param array $keywords Phrases to request
     * @param array $regions Region ids
     * @return mixed Response data on success (the caller casts it to a report
     *               id); the negated API error code on an API error; false
     *               when no keywords were given or the response is unusable
     */
    public function createReport(array $keywords = array(), array $regions = array()){
        if(!count($keywords)){
            return false;
        }

        foreach($keywords as $k => $v){
            $keywords[$k] = self::magicUTF8($v);
        }

        $this->request['method'] = 'CreateNewWordstatReport';
        $this->request['param'] = array(
            'Phrases' => $keywords,
            'GeoID' => $regions
        );
        $this->request['login'] = $this->login;
        $response = $this->request();

        if(isset($response->error_code)){
            $error_code = (int)$response->error_code;
            var_dump($response);
            return (0 - $error_code);
        }
        if(!isset($response->data)){
            var_dump($response);
            return false;
        }
        return $response->data;
    }

    /**
     * Send the current request envelope and decode the JSON response.
     *
     * @return mixed Decoded response, or null when the request failed or the
     *               body is not valid JSON
     */
    private function request(){
        $request = json_encode($this->request);

        // Request parameters.
        $opts = array(
            'http'=>array(
                'method' => 'POST',
                'content' => $request,
                'header' => 'Content-type: application/json; charset=utf-8' . "\r\n"
            )
        );

        // Create the stream context.
        $context = stream_context_create($opts);

        // Attach the certificate that is combined with the private key.
        stream_context_set_option($context, 'ssl', 'local_cert', self::CERT_DIR . '/'.$this->login.'/solid-cert.crt');

        // Send the request and read the server response.
        $result = file_get_contents(self::JSON_URL, false, $context);
        if($result === false){
            return null;
        }
        return json_decode($result);
    }

    /**
     * Re-encode a keyword before it is sent to the API.
     *
     * NOTE(review): the last step interprets an already UTF-8 string as
     * ISO-8859-1, which looks like a double encoding, and the first step
     * fails for characters that do not exist in windows-1251. Kept as is
     * because the intended wire format cannot be confirmed here - verify
     * against the API.
     *
     * @param string $s
     * @return string|boolean false when a conversion fails
     */
    public static function magicUTF8($s){
        $s = iconv('utf-8', 'windows-1251', $s);
        $s = iconv('windows-1251', 'utf-8', $s);
        $s = iconv('ISO-8859-1', 'utf-8', $s);
        return $s;
    }

    /**
     * Build the crc32 key of a (keyword, region, strict flag) statistic.
     *
     * @param string $keyword
     * @param int|string $region
     * @param int|boolean $strict
     * @return int
     */
    public static function crc($keyword, $region, $strict = 0){
        $strict = (int)$strict;
        $dashed = str_replace(' ', '-', mb_strtolower($keyword, 'UTF-8'));
        return crc32('yafreq_'.$dashed.'_'.$region.'_'.$strict);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment