Created
October 18, 2019 06:16
-
-
Save nickfox-taterli/a011ed062b5af12e19a700e68cfd8d91 to your computer and use it in GitHub Desktop.
ZhiHu Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
require 'vendor/autoload.php';

// Connection string for the MongoDB server. It can be overridden with the
// MONGODB_URI environment variable; the literal below is only the fallback so
// that existing invocations keep working unchanged.
// NOTE(review): the fallback embeds a username and password in the source —
// rotate that credential and drop the literal once MONGODB_URI is set everywhere.
$mongo_uri = getenv('MONGODB_URI');
if ($mongo_uri === false || $mongo_uri === '') {
    $mongo_uri = "mongodb://taterli:f697bb912da7f4d210382c67@127.0.0.1:27017";
}
$mongo = new MongoDB\Client($mongo_uri);
$db = $mongo->wa; // select the database
$collection = $db->zhihu; // select the collection that stores member documents
$next_qid = null; /* URL of the next follower page; null until the first page has been fetched */
$first_qid = 0; // not read anywhere in the visible script
/**
 * Sends a single log line to the LogDNA ingestion endpoint (best effort).
 *
 * The line is POSTed as JSON using HTTP basic auth. Any failure — cURL not
 * initialising, the payload not being encodable as JSON, the request timing
 * out or erroring — is ignored, so logging never interrupts the caller.
 *
 * NOTE(review): the ingestion key, hostname, MAC and IP are hard-coded below;
 * the key is a credential and should be moved out of the source and rotated.
 *
 * @param string $level Severity label sent in the "level" field.
 * @param string $desp  Message text sent in the "line" field.
 * @return void
 */
function sc_log($level = '', $desp = '')
{
    // One timestamp for both the query string and the payload keeps them consistent.
    $now = time();

    $payload = json_encode([
        'lines' => [
            [
                'timestamp' => $now,
                'app' => '知乎机器人:',
                'level' => $level,
                'line' => $desp,
            ],
        ],
    ]);
    if ($payload === false) {
        // e.g. invalid UTF-8 in $desp; there is nothing meaningful to send.
        return;
    }

    $ch = curl_init();
    if ($ch === false) {
        return;
    }

    curl_setopt($ch, CURLOPT_URL, 'https://logs.logdna.com/logs/ingest?hostname=robots&mac=0E:33:C8:57:0D:48&ip=172.31.31.209&now=' . $now);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // keep the response body off stdout
    curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
    curl_setopt($ch, CURLOPT_USERPWD, 'dd8e6d59f4af4a87da2cc04225ab453b:');
    curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload); // setting POSTFIELDS makes this a POST
    // Bound the call so a stalled log endpoint cannot hang the caller forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);

    curl_exec($ch); // response is intentionally ignored
    curl_close($ch);
}
/**
 * Performs a GET request against the Zhihu API with the session cookie
 * attached and decodes the JSON response.
 *
 * @param string $url Absolute URL to request.
 * @return array|null Decoded response as an associative array, or null when
 *                    the transfer failed or the body did not decode to an array.
 */
function zhihu_get_json($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): certificate and host verification are switched off while a
    // session cookie is being sent; kept as in the original, but this exposes
    // the cookie to man-in-the-middle interception and should be re-enabled.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): hard-coded session cookie; it is a credential and should
    // live outside the source.
    curl_setopt($ch, CURLOPT_COOKIE, ' z_c0="2|1:0|10:1570446544|4:z_c0|92:Mi4xeVQ3FJWGdBZWR3SVZyLWxGU0JKYkNzNjJFbkxscHdzUFdR|63ce7ad7a3c68bc1aaf9a21f8a60d3031e65012f3e875cf9e81378c9dae0b0ce"');
    // Bound the request so a stalled connection cannot hang the crawl forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $body = curl_exec($ch);
    curl_close($ch);

    if (!is_string($body)) {
        return null;
    }
    $decoded = json_decode($body, true);

    return is_array($decoded) ? $decoded : null;
}

// The member whose followers are crawled comes from the first CLI argument;
// it is the path segment placed after /members/ in the API URL.
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, 'Usage: php ' . basename(__FILE__) . ' <member>' . PHP_EOL);
    exit(1);
}

// Profile fields requested for every follower (loop-invariant, so built once).
$member_fields = "locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,included_answers_count,included_articles_count,included_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,is_org_createpin_white_user,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics";

// Walk the follower list page by page, storing the profile of every follower
// that is not yet in the collection.
for (;;) {
    // Example of the list endpoint shape:
    // https://www.zhihu.com/api/v4/members/zhang-jia-wei/followees?offset=20&limit=20
    if ($next_qid === null || $next_qid === '') {
        $page_url = "https://www.zhihu.com/api/v4/members/" . $argv[1] . "/followers?include=follower_count&offset=0&limit=20";
    } else {
        $page_url = $next_qid;
    }

    $page = zhihu_get_json($page_url);

    if (!isset($page['paging']['next'])) {
        // No usable page (transfer failure, error payload, or unexpected shape):
        // report the API's message when there is one, then wait and retry the
        // same URL. Manual intervention may be needed if this never recovers.
        if (isset($page['error']['message'])) {
            echo $page['error']['message'] . PHP_EOL;
        }
        sleep(30);
        continue;
    }

    // The "next" link may point at the non-API host path; rewrite it to the API path.
    $next_qid = str_replace('https://www.zhihu.com/members/', 'https://www.zhihu.com/api/v4/members/', $page['paging']['next']);

    $members = (isset($page['data']) && is_array($page['data'])) ? $page['data'] : [];
    foreach ($members as $member) {
        if (!isset($member['id'])) {
            continue;
        }
        $member_id = $member['id'];

        // Check the database first so known members cost no extra API request.
        $existing = $collection->findOne(['id' => $member_id]);
        if (isset($existing['name'])) {
            echo $member_id . " 已存在" . PHP_EOL;
            continue;
        }

        $profile = zhihu_get_json("https://www.zhihu.com/api/v4/members/" . $member_id . "?include=" . $member_fields);

        // Never store a failed or error response as if it were a profile; a
        // document without "id" could also never be matched by the lookup above.
        if ($profile === null || isset($profile['error']) || !isset($profile['id'])) {
            echo $member_id . " 获取失败" . PHP_EOL;
            continue;
        }

        $collection->insertOne($profile);
        echo $member_id . " 已插入" . PHP_EOL;
    }

    // NOTE(review): assumes the API sets paging.is_end to true on the last
    // page — confirm. When the key is absent the loop keeps going as before.
    if (!empty($page['paging']['is_end'])) {
        break;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment