Created
May 26, 2015 11:25
-
-
Save ivansky/b5647efec956eb15385f to your computer and use it in GitHub Desktop.
Yandex Wordstat Collector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// ---------------------------------------------------------------------------
// Bootstrap for the Wordstat collector: wires the storage service, creates the
// API client, seeds the in-memory keyword registry and computes the deadline
// that the polling loop below checks on every pass.
// ---------------------------------------------------------------------------

// Redis connection handed to YADWordstat as its storage service.
$redis = new Redis();
$redis->connect('127.0.0.1');
// Register redis as storage service
YADWordstat::redis($redis);

$yandexLogin = 'some_user';
$yad = new YADWordstat($yandexLogin);

// Bookkeeping arrays. The polling loop below reads/writes $waiting, $complete
// and $report_region; the remaining ones are not referenced in this script.
$words = array();                 // words: key = id, value = data
$reports = array();               // report ids bound by id
$waiting = array();               // keyword/region hash => id of the report it was submitted in
$complete = array();              // crc => true for every phrase whose stats were received
$done = array();                  // list of finished report ids
$report_region = array();         // report id => region id the report was created for
$words_original_binded = array();

// Signature: YADWord::get($keywordString, $regionId, $stat, $statStrict);
// Keywords registered with both stats are treated as complete and are skipped.
YADWord::get('some existing keyword', 213, 1000, 100);

// Keywords registered without stats are the ones the loop will request.
YADWord::get('prepared keyword', 213);

// Each progress line is terminated with CRLF like the rest of the script's
// output; without it the messages run together on one line.
printf(
    'Only %d keywords and %d pairs count' . "\r\n",
    YADWord::count(),     // keywords count
    YADWord::count(true)  // pairs (keyword + regionId) count
);

$incomplete = YADWord::findIncomplete();
printf('Left %d pairs of keywords' . "\r\n", count($incomplete));

// Stop 10 minutes before the next local midnight. strtotime('tomorrow') is
// used instead of "today's midnight + 24h" so the deadline stays correct on
// days where a DST switch makes the day 23 or 25 hours long.
$TOMORROW = strtotime('tomorrow') - (10 * 60);
// ---------------------------------------------------------------------------
// Polling loop. Every 5 seconds it:
//   1. fetches the account's current Wordstat report list,
//   2. forgets "waiting" keywords whose report no longer exists,
//   3. submits a new report (up to 5 keyword pairs of one region) while fewer
//      than 5 reports exist and there is something left to request,
//   4. otherwise reads every finished report, stores the numbers on the
//      matching YADWord and deletes the report,
//   5. stops when the deadline ($TOMORROW) is reached or nothing is left to do.
// ---------------------------------------------------------------------------
while (true) {
    sleep(5);

    if ($TOMORROW <= time()) {
        break;
    }

    echo 'Get active report list... ';
    $report_list = $yad->getReportList();
    echo count($report_list).' шт'."\r\n";

    $reports_id = array(); // ids of every report currently on the account
    $done_now = array();   // ids of the reports that are ready to be read

    // Collect current and completed reports
    foreach ($report_list as $ro) {
        $current_id = (int)$ro->ReportID;
        if ($ro->StatusReport == 'Done') {
            $done_now[] = $current_id;
        }
        $reports_id[] = $current_id;
    }

    // Drop waiting entries whose report disappeared, so that those keywords
    // become eligible for submission again.
    foreach ($waiting as $crc => $report_id) {
        if (!in_array($report_id, $reports_id, true)) {
            unset($waiting[$crc]);
        }
    }

    echo 'Completed reports count: '.count($done_now)."\r\n";

    // Build the next batch only while the account has a free report slot.
    $insert = array(); // phrases to submit: plain form and strict form of each keyword
    $hashes = array(); // hashes of the keyword pairs included in $insert
    $region = false;   // region of the batch (one report is created per region)

    if (count($report_list) < 5) {
        // The first incomplete pair that is not already waiting decides the region.
        foreach (YADWord::findIncomplete() as $w) {
            if (!isset($waiting[$w->hash])) {
                $region = $w->region_id;
                break;
            }
        }

        if ($region !== false) {
            $insert_count = 0;
            foreach (YADWord::findIncomplete($region) as $w) {
                if ($w->region_id != $region) {
                    echo 'region_id('.$w->region_id.') != region('.$region.')'."\r\n";
                    continue;
                }
                if (isset($waiting[$w->hash])) {
                    continue;
                }

                $hashes[] = $w->hash;
                $insert[] = $w->word;
                $insert[] = $w->strict;
                printf('++ %s && %s'."\r\n", $w->word, $w->strict);

                $insert_count++;
                if ($insert_count >= 5) {
                    break;
                }
            }
        }
    }

    if (count($insert)) {
        $report_id = (int)$yad->createReport($insert, array($region));

        // !!! EXIT upon the occurrence limit
        // Error 31 (attempt to create a 6th report) is not fatal: just keep working.
        if ($report_id <= 0 && $report_id != -31) {
            echo 'Error '.abs($report_id).', maybe limit. Exit from script.'."\r\n";
            die();
        }

        // Register the batch only for a real report id; an error code must not
        // be stored as if it were a report.
        if ($report_id > 0) {
            $report_region[$report_id] = $region;
            foreach ($hashes as $hash) {
                $waiting[$hash] = $report_id;
            }
        }
    } elseif (count($done_now)) {
        // Nothing was submitted on this pass (no free slot or nothing left to
        // request): harvest every finished report.
        foreach ($done_now as $report_id) {
            $report_result = $yad->getReport($report_id);

            if ($report_result === false) {
                // Keep the report on the server so its data is not lost; it
                // will be retried on a later pass.
                printf('! Report %d could not be fetched, will retry'."\r\n", $report_id);
                continue;
            }

            foreach ($report_result as $wordstat) {
                $region_id = isset($wordstat->GeoID[0]) ? (int)$wordstat->GeoID[0] : 0;
                $shows = 0;
                $found = false;

                $keyword = mb_strtolower($wordstat->Phrase, 'UTF-8');
                echo '['.$keyword.'] Region: '.$region_id."\r\n";

                // Strict phrases are submitted as "!word !word", so a "!" marks them.
                $strict = (strpos($keyword, '!') !== false);

                // SearchedWith may be missing; treat that as an empty list.
                $searched = (isset($wordstat->SearchedWith) && is_array($wordstat->SearchedWith))
                    ? $wordstat->SearchedWith
                    : array();

                // Prefer the row whose phrase equals the requested phrase.
                foreach ($searched as $stat) {
                    if (mb_strtolower($stat->Phrase, 'UTF-8') == $keyword) {
                        $shows = (int)$stat->Shows;
                        $found = true;
                        echo ' * найдено кол. просмотров: '.$shows."\r\n";
                    }
                }

                // Otherwise fall back to the first row of the report.
                if (!$found && isset($searched[0])) {
                    $shows = (int)$searched[0]->Shows;
                    echo ' * не найдено кол. просмотров, указываем первый: '.$shows."\r\n";
                }

                $w = YADWord::findByRegionCRC($keyword, $region_id);
                if (!$w) {
                    printf('! Keyword (%s) not found'."\r\n", $keyword);
                    continue;
                }

                $crc = YADWordstat::crc($keyword, $region_id, $strict);
                $complete[$crc] = true;

                if ($strict) {
                    // Strict ("!word") statistic.
                    // Persistence hook: store region_id, date('Y-m'), stat_strict
                    // and stat_date here (insert with update-on-duplicate).
                    $w->setStatStrict($shows);
                } else {
                    // Regular (broad) statistic.
                    // Persistence hook: store region_id, date('Y-m'), stat and
                    // stat_date here (insert with update-on-duplicate).
                    $w->setStat($shows);
                }
            }

            // The report has been consumed: free its slot.
            $yad->deleteReport($report_id);
        }
    } elseif (!count($report_list)) {
        // Nothing to submit and no report pending on the account: all work is done.
        echo 'All keyword pairs are complete. Exit from script.'."\r\n";
        break;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * In-memory registry of keyword/region pairs and their Wordstat statistics.
 *
 * Every instance registers itself in two static indexes:
 *  - by region id and keyword crc (one entry per keyword/region pair),
 *  - by keyword crc (list of all pairs sharing the keyword).
 *
 * A pair is "complete" when both $stat and $stat_strict are >= 0; -1 means
 * "not collected yet".
 */
class YADWord {
    /** @var array region id => (keyword crc => YADWord) */
    private static $bind_region = array();
    /** @var array keyword crc => list of YADWord (one per region) */
    private static $bind_crc = array();

    /** @var int Region id of the pair */
    public $region_id = 213;
    /** @var string Keyword exactly as it was passed in */
    public $original;
    /** @var string Normalized keyword (see prepare()) */
    public $word;
    /** @var string Strict phrase form: "!word !word" wrapped in double quotes */
    public $strict;
    /** @var int crc32 of the normalized keyword */
    public $crc;
    /** @var int crc32 of the normalized keyword plus region id */
    public $hash;
    /** @var int Broad statistic, -1 while unknown */
    public $stat = -1;
    /** @var int Strict statistic, -1 while unknown */
    public $stat_strict = -1;

    /**
     * Build a pair and register it in the static indexes.
     *
     * Keywords longer than 7 words are NOT registered: the constructor stops
     * early and leaves $strict, $crc and $hash unset.
     *
     * @param string     $w  Keyword
     * @param int|string $r  Region id
     * @param int        $s  Broad statistic (-1 = unknown)
     * @param int        $ss Strict statistic (-1 = unknown)
     */
    public function __construct($w, $r, $s, $ss){
        $this->region_id = (int)$r;
        $this->original = $w;
        $this->word = self::prepare($w);

        if (count(explode(' ', $this->word)) > 7) {
            return;
        }

        $slug = str_replace(' ', '-', $this->word);
        $this->strict = '"!'.str_replace(' ', ' !', $this->word).'"';
        $this->crc = crc32($slug);
        $this->hash = crc32($slug.'-'.$this->region_id);
        $this->stat = (int)$s;
        $this->stat_strict = (int)$ss;

        if (!isset(self::$bind_region[$this->region_id])) {
            self::$bind_region[$this->region_id] = array();
        }

        // Objects are stored by handle, so a plain assignment already shares
        // the instance; taking a reference to $this is unnecessary.
        self::$bind_region[$this->region_id][$this->crc] = $this;
        self::$bind_crc[$this->crc][] = $this;
    }

    /**
     * Count registered entries.
     *
     * @param bool $repeat false: number of distinct keywords;
     *                     true: number of keyword/region pairs
     * @return int
     */
    public static function count($repeat = false){
        if (!$repeat) {
            return count(self::$bind_crc);
        }

        $total = 0;
        foreach (self::$bind_crc as $pairs) {
            $total += count($pairs);
        }
        return $total;
    }

    /**
     * Set the broad statistic.
     * @param int $stat
     */
    public function setStat($stat){
        $this->stat = $stat;
    }

    /**
     * Set the strict statistic.
     * @param int $stat
     */
    public function setStatStrict($stat){
        $this->stat_strict = $stat;
    }

    /**
     * Return the registered pair for keyword + region, creating it if needed.
     * For an existing pair, statistics are overwritten only by values >= 0.
     *
     * @param string     $w
     * @param int|string $r
     * @param int        $stat
     * @param int        $stat_strict
     * @return YADWord
     */
    public static function get($w, $r, $stat = -1, $stat_strict = -1){
        $existing = self::findByRegionCRC($w, $r);
        if ($existing) {
            if ($stat >= 0) {
                $existing->stat = $stat;
            }
            if ($stat_strict >= 0) {
                $existing->stat_strict = $stat_strict;
            }
            return $existing;
        }

        return new self($w, $r, $stat, $stat_strict);
    }

    /**
     * Pairs that have both statistics.
     * @param int|bool $region_id Region id, or false for all regions
     * @param int|bool $limit     Maximum number of results, or false for no limit
     * @return YADWord[]
     */
    public static function findComplete($region_id = false, $limit = false){
        return self::findByStatus(true, $region_id, $limit);
    }

    /**
     * Pairs that miss at least one statistic.
     * @param int|bool $region_id Region id, or false for all regions
     * @param int|bool $limit     Maximum number of results, or false for no limit
     * @return YADWord[]
     */
    public static function findIncomplete($region_id = false, $limit = false){
        return self::findByStatus(false, $region_id, $limit);
    }

    /**
     * List pairs by completeness.
     *
     * @param bool     $complete  true: complete pairs, false: incomplete pairs
     * @param int|bool $region_id Region id; any falsy value means all regions
     * @param int|bool $limit     Maximum number of results, or false for no limit
     * @return YADWord[] Empty when the region is unknown
     */
    public static function findByStatus($complete, $region_id, $limit = false){
        $result = array();
        $complete = (bool)$complete;

        if ($region_id) {
            $region_id = (int)$region_id;
            if (!isset(self::$bind_region[$region_id])) {
                return $result;
            }
            // Single region: wrap it so both cases share one loop.
            $groups = array(self::$bind_region[$region_id]);
        } else {
            $groups = self::$bind_crc;
        }

        foreach ($groups as $group) {
            foreach ($group as $candidate) {
                if (self::isComplete($candidate) === $complete) {
                    $result[] = $candidate;
                }
                if ($limit && $limit <= count($result)) {
                    break 2;
                }
            }
        }

        return $result;
    }

    /**
     * Return the pairs of a region, optionally filtered.
     *
     * Supported filter keys (all given conditions must hold):
     *  - 'original': the original keyword equals the value,
     *  - 'stat':     truthy = complete pairs only, falsy = incomplete pairs only,
     *  - 'crc':      the keyword crc equals the value,
     *  - 'hash':     the keyword/region hash equals the value.
     * Unknown keys are ignored.
     *
     * @param int|string $r      Region code
     * @param array      $filter Search options
     * @return array|boolean Without filter: map crc => YADWord. With filter:
     *                       list of matching YADWord. false for an unknown region.
     */
    public static function findByRegion($r, $filter = array()){
        $r = (int)$r;
        if (!isset(self::$bind_region[$r])) {
            return false;
        }

        if (!count($filter)) {
            return self::$bind_region[$r];
        }

        $matches = array();
        foreach (self::$bind_region[$r] as $crc => $candidate) {
            if (self::matchesFilter($candidate, $crc, $filter)) {
                $matches[] = $candidate;
            }
        }
        return $matches;
    }

    /**
     * Return the pair registered for keyword + region.
     * The returned object is the registered instance itself, not a copy.
     *
     * @param string     $w Word
     * @param int|string $r Region code
     * @return YADWord|boolean false when nothing is registered
     */
    public static function findByRegionCRC($w, $r){
        $r = (int)$r;
        $crc = crc32(str_replace(' ', '-', self::prepare($w)));

        return isset(self::$bind_region[$r][$crc])
            ? self::$bind_region[$r][$crc]
            : false;
    }

    /**
     * Return all pairs (one per region) registered under a keyword crc.
     *
     * @param int|string $crc
     * @return array|boolean false when the crc is unknown
     */
    public static function findByCRC($crc){
        $crc = (int)$crc;

        return isset(self::$bind_crc[$crc])
            ? self::$bind_crc[$crc]
            : false;
    }

    /**
     * Normalize a keyword: every run of characters outside the allowed set is
     * replaced by one space, whitespace is collapsed, the result is trimmed
     * and lower-cased.
     *
     * Allowed: a-z, 0-9, the Cyrillic range U+0410..U+045F and the listed
     * Turkish and Kazakh letters.
     *
     * @link http://www.unicode.org/charts/
     * @param string $w
     * @return string
     */
    public static function prepare($w){
        // Cyrillic range U+0410..U+045F (covers the Russian alphabet)
        $RUSSIAN = '\\x{0410}-\\x{045F}';

        // Turkish letters: c-cedilla, g-breve, dotless i, s-cedilla, o/u-diaeresis (both cases)
        $TURKISH = '\\x{00E7}\\x{011F}\\x{0131}\\x{015F}'.
            '\\x{00F6}\\x{00FC}\\x{00C7}\\x{011E}'.
            '\\x{0130}\\x{015E}\\x{00D6}\\x{00DC}';

        // Kazakh-specific Cyrillic letters (both cases)
        $KAZAKH = '\\x{04A2}\\x{04A3}\\x{0406}\\x{0456}'.
            '\\x{0492}\\x{0493}\\x{049A}\\x{049B}'.
            '\\x{04AE}\\x{04AF}\\x{04B0}\\x{04B1}'.
            '\\x{04BA}\\x{04BB}\\x{04D8}\\x{04D9}'.
            '\\x{04E8}\\x{04E9}';

        $w = preg_replace('/[^a-z0-9'.$RUSSIAN.$TURKISH.$KAZAKH.']+/ui', ' ', $w);
        $w = preg_replace('/[\s]+/', ' ', $w);

        return mb_strtolower(trim($w), 'UTF-8');
    }

    /**
     * Tell whether both statistics of a pair have been collected.
     *
     * @param YADWord $word
     * @return bool
     */
    private static function isComplete($word){
        return $word->stat >= 0 && $word->stat_strict >= 0;
    }

    /**
     * Check one pair against every condition of a findByRegion() filter.
     *
     * @param YADWord $word
     * @param int     $crc    Index key of the pair (its keyword crc)
     * @param array   $filter
     * @return bool true when no condition rejects the pair
     */
    private static function matchesFilter($word, $crc, array $filter){
        foreach ($filter as $named => $val) {
            switch ($named) {
                case 'original':
                    if ($word->original != $val) {
                        return false;
                    }
                    break;
                case 'stat':
                    if ((bool)$val !== self::isComplete($word)) {
                        return false;
                    }
                    break;
                case 'crc':
                    if ($crc != $val) {
                        return false;
                    }
                    break;
                case 'hash':
                    if ($word->hash != $val) {
                        return false;
                    }
                    break;
            }
        }
        return true;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Minimal client for the Wordstat report methods of the Yandex Direct JSON
 * API (v4). Requests are POSTed as JSON and authenticated with a client
 * certificate stored per login under CERT_DIR.
 */
class YADWordstat {
    const JSON_URL = 'https://api.direct.yandex.ru/v4/json/';
    const CERT_DIR = '/path/to/cert'; // ./path/to/cert/__login__/solid-cert.crt

    /** @var string Yandex login the requests are made for */
    public $login;

    private static $pdo = null;
    private static $redis = null;

    /** @var array API error code => description (runtime text, kept in Russian) */
    public $errors = array(
        30 => 'Массив слов пустой',
        31 => 'Попытка создать 6-ой отчет',
        56 => 'Превышен лимит запросов',
        71 => 'Параметры запроса указаны неверно',
        152 => 'Не достаточно баллов'
    );

    /** @var array Request envelope; method/param/login are filled in per call */
    private $request = array('locale' => 'ru');
    private $wait = array();
    private $complete = array();

    /**
     * Register a PDO handle (stored only; not used by the methods of this class).
     * @param mixed $__pdo
     */
    public static function pdo($__pdo){
        self::$pdo = $__pdo;
    }

    /**
     * Register the Redis client used by units() as a cache.
     * @param Redis $__redis
     */
    public static function redis(Redis &$__redis){
        self::$redis = $__redis;
    }

    /**
     * @param string $login Yandex login; also selects the certificate directory
     */
    public function __construct($login){
        $this->login = $login;
        $this->request['login'] = $this->login;
    }

    /**
     * Remaining API units of the login.
     *
     * With a registered Redis client the value is cached for two hours. A
     * failed or malformed response yields 0 and is not cached.
     *
     * @return int
     */
    public function units(){
        $crc = crc32(sprintf('yad_units_%s', $this->login));

        if (self::$redis) {
            $cached = self::$redis->get($crc);
            // Compare against "no value" explicitly so that a cached balance
            // of 0 is a cache hit too.
            if ($cached !== false && $cached !== null) {
                return (int)$cached;
            }
        }

        $this->request['method'] = 'GetClientsUnits';
        $this->request['param'] = array($this->login);
        $this->request['login'] = $this->login;
        $response = $this->request();

        if (!isset($response->data[0]->UnitsRest)) {
            return 0;
        }

        $units = (int)$response->data[0]->UnitsRest;
        if (self::$redis) {
            self::$redis->set($crc, $units, 3600 * 2);
        }
        return $units;
    }

    /**
     * List the Wordstat reports currently stored for the login.
     * @return array Response data, or an empty array when there is none
     */
    public function getReportList(){
        $this->request['method'] = 'GetWordstatReportList';
        $this->request['param'] = array();
        $this->request['login'] = $this->login;
        $response = $this->request();

        return isset($response->data) ? $response->data : [];
    }

    /**
     * Fetch the contents of a report.
     * @param int|string $id Report id
     * @return mixed Response data, or false when the response carries none
     */
    public function getReport($id){
        $this->request['method'] = 'GetWordstatReport';
        $this->request['param'] = (int)$id;
        $this->request['login'] = $this->login;
        $response = $this->request();

        return isset($response->data) ? $response->data : false;
    }

    /**
     * Same as getReport().
     *
     * getReport() already returns the unwrapped "data" payload, so it is
     * returned directly instead of reading a nested ->data from it.
     *
     * @param int|string $id Report id
     * @return mixed Response data, or false when the response carries none
     */
    public function getReportInfo($id){
        return $this->getReport($id);
    }

    /**
     * Delete a report.
     * @param int|string $id Report id
     * @return mixed Response data, or false when the response carries none
     */
    public function deleteReport($id){
        $this->request['method'] = 'DeleteWordstatReport';
        $this->request['param'] = (int)$id;
        $this->request['login'] = $this->login;
        $response = $this->request();

        return isset($response->data) ? $response->data : false;
    }

    /** Alias of deleteReport(). */
    public function delReport($id){
        return $this->deleteReport($id);
    }

    /** Alias of deleteReport(). */
    public function removeReport($id){
        return $this->deleteReport($id);
    }

    /**
     * Request a new Wordstat report.
     *
     * @param array $keywords Phrases to include
     * @param array $regions  Region ids (GeoID)
     * @return mixed Response data on success (callers cast it to the report
     *               id); the negated API error code on an API error; false
     *               when no keywords were given or the response has no data
     */
    public function createReport(array $keywords = array(), array $regions = array()){
        if (!count($keywords)) {
            return false;
        }

        foreach ($keywords as $k => $v) {
            $keywords[$k] = self::magicUTF8($v);
        }

        $this->request['method'] = 'CreateNewWordstatReport';
        $this->request['param'] = array(
            'Phrases' => $keywords,
            'GeoID' => $regions
        );
        $this->request['login'] = $this->login;
        $response = $this->request();

        if (isset($response->error_code)) {
            var_dump($response);
            // Errors are reported as negative numbers so that callers can tell
            // them apart from report ids.
            return (0 - (int)$response->error_code);
        }

        if (!isset($response->data)) {
            var_dump($response);
            return false;
        }

        return $response->data;
    }

    /**
     * Send the prepared request envelope and decode the JSON answer.
     * @return mixed Decoded response, or null when the HTTP call failed or the
     *               body is not valid JSON
     */
    private function request(){
        $payload = json_encode($this->request);

        // Request parameters
        $opts = array(
            'http' => array(
                'method' => 'POST',
                'content' => $payload,
                'header' => 'Content-type: application/json; charset=utf-8' . "\r\n"
            )
        );

        // Create the stream context
        $context = stream_context_create($opts);

        // Attach the certificate file that is combined with the private key
        stream_context_set_option($context, 'ssl', 'local_cert', self::CERT_DIR . '/'.$this->login.'/solid-cert.crt');

        // Send the request and read the server's answer
        $result = file_get_contents(self::JSON_URL, false, $context);
        if ($result === false) {
            // Transport failure: every caller treats null as "no data".
            return null;
        }

        return json_decode($result);
    }

    /**
     * Re-encode a phrase before it is sent to the API: UTF-8 -> windows-1251
     * -> UTF-8, then the result is converted once more as if it were
     * ISO-8859-1.
     *
     * NOTE(review): the last step double-encodes non-ASCII text, and iconv()
     * returns false for characters that windows-1251 cannot represent;
     * presumably this matches what the API expected - confirm before changing.
     *
     * @param string $s
     * @return string|false
     */
    public static function magicUTF8($s){
        $s = iconv('utf-8', 'windows-1251', $s);
        $s = iconv('windows-1251', 'utf-8', $s);
        $s = iconv('ISO-8859-1', 'utf-8', $s);
        return $s;
    }

    /**
     * Checksum identifying a phrase/region/strictness combination.
     *
     * @param string     $keyword
     * @param int|string $region
     * @param int|bool   $strict
     * @return int
     */
    public static function crc($keyword, $region, $strict = 0){
        $strict = (int)$strict;
        $slug = str_replace(' ', '-', mb_strtolower($keyword, 'UTF-8'));

        return crc32('yafreq_'.$slug.'_'.$region.'_'.$strict);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment