Skip to content

Instantly share code, notes, and snippets.

@nickfox-taterli
Created October 18, 2019 06:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nickfox-taterli/a011ed062b5af12e19a700e68cfd8d91 to your computer and use it in GitHub Desktop.
Save nickfox-taterli/a011ed062b5af12e19a700e68cfd8d91 to your computer and use it in GitHub Desktop.
ZhiHu Spider
<?php
require 'vendor/autoload.php';
// MongoDB connection string. It can be supplied through the MONGODB_URI
// environment variable; when that is unset or empty the original literal is
// used so existing invocations keep working.
// NOTE(review): the fallback embeds a username and password in source control;
// rotate that credential and drop the fallback once MONGODB_URI is configured.
$mongo_uri = getenv('MONGODB_URI');
if ($mongo_uri === false || $mongo_uri === '') {
    $mongo_uri = "mongodb://taterli:f697bb912da7f4d210382c67@127.0.0.1:27017";
}
$mongo = new MongoDB\Client($mongo_uri);
$db = $mongo->wa; // select the database
$collection = $db->zhihu; // select the document collection
$next_qid = null; /* first request: no pagination URL has been received yet */
$first_qid = 0; // not referenced by any code visible in this file
/**
 * Send a single log line to the LogDNA ingest endpoint (best effort).
 *
 * The request is a JSON POST authenticated with HTTP basic auth. Failures never
 * interrupt the caller: they are reported through error_log() and the function
 * simply returns.
 *
 * NOTE(review): the ingestion key passed to CURLOPT_USERPWD is a credential
 * committed in source; consider moving it to configuration.
 *
 * @param string $level Value sent as the "level" field of the log line.
 * @param string $desp  Text sent as the "line" field of the log line.
 * @return void
 */
function sc_log($level = '',$desp = '')
{
    // Take the timestamp once so the URL and the payload always agree.
    $now = time();

    $ch = curl_init();
    if ($ch === false) {
        error_log('sc_log: curl_init() failed');
        return;
    }

    $payload = json_encode([
        "lines" => [
            ["timestamp" => $now, "app" => "知乎机器人:", "level" => $level, "line" => $desp],
        ],
    ]);

    curl_setopt($ch, CURLOPT_URL, 'https://logs.logdna.com/logs/ingest?hostname=robots&mac=0E:33:C8:57:0D:48&ip=172.31.31.209&now='.$now);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_USERPWD, 'dd8e6d59f4af4a87da2cc04225ab453b:');
    curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
    // Setting CURLOPT_POSTFIELDS makes cURL issue a POST with this body.
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    // Logging must never stall the caller: bound both connect and total time.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);

    if (curl_exec($ch) === false) {
        error_log('sc_log: request failed: ' . curl_error($ch));
    }
    curl_close($ch);
}
/**
 * GET a Zhihu API URL using the session cookie and decode the JSON response.
 *
 * NOTE(review): TLS peer/host verification is disabled here, exactly as in the
 * original requests, while a session cookie is being sent. That exposes the
 * cookie to interception; enable verification once a CA bundle is available.
 * NOTE(review): the z_c0 cookie is a login credential committed in source.
 *
 * @param string $url Absolute URL to request.
 * @return array|null Decoded JSON as an associative array, or null when the
 *                    transfer failed or the body did not decode to an array.
 */
function zhihu_fetch_json($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Without timeouts a stalled connection would hang the crawler forever.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_COOKIE, ' z_c0="2|1:0|10:1570446544|4:z_c0|92:Mi4xeVQ3FJWGdBZWR3SVZyLWxGU0JKYkNzNjJFbkxscHdzUFdR|63ce7ad7a3c68bc1aaf9a21f8a60d3031e65012f3e875cf9e81378c9dae0b0ce"');
    $body = curl_exec($ch);
    curl_close($ch);

    if (!is_string($body)) {
        return null;
    }
    $decoded = json_decode($body, true);
    return is_array($decoded) ? $decoded : null;
}

// The member whose followers are crawled is given as the first CLI argument.
if (!isset($argv[1]) || $argv[1] === '') {
    echo "Usage: php " . (isset($argv[0]) ? $argv[0] : 'script.php') . " <member_url_token>" . PHP_EOL;
    exit(1);
}

// Field list requested for every member profile.
$member_fields = "?include=locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,avatar_hue,answer_count,articles_count,pins_count,question_count,columns_count,commercial_question_count,favorite_count,favorited_count,logs_count,included_answers_count,included_articles_count,included_text,message_thread_token,account_status,is_active,is_bind_phone,is_force_renamed,is_bind_sina,is_privacy_protected,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,is_org_createpin_white_user,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics";

// Walk the follower list page by page, storing each follower's full profile in
// MongoDB unless a document with that id (and a name) is already present.
for (;;) {
    // Example endpoint shape:
    // https://www.zhihu.com/api/v4/members/zhang-jia-wei/followees?offset=20&limit=20
    if ($next_qid === null || $next_qid === '') {
        $page_url = "https://www.zhihu.com/api/v4/members/" . rawurlencode($argv[1]) . "/followers?include=follower_count&offset=0&limit=20";
    } else {
        $page_url = $next_qid;
    }
    $page = zhihu_fetch_json($page_url);

    if (!isset($page['paging']['next'])) {
        // No usable page (transfer failure, error payload, or unexpected body):
        // print the API's error message when there is one, then wait and retry
        // the same URL. Manual intervention may be needed if this never recovers.
        if (isset($page['error']['message'])) {
            echo $page['error']['message'] . PHP_EOL;
        }
        sleep(30);
        continue;
    }

    // The API hands back a non-API "next" URL; rewrite it to the v4 API path.
    $next_qid = str_replace('https://www.zhihu.com/members/', 'https://www.zhihu.com/api/v4/members/', $page['paging']['next']);

    $members = (isset($page['data']) && is_array($page['data'])) ? $page['data'] : [];
    foreach ($members as $member) {
        if (!isset($member['id'])) {
            continue;
        }
        $member_id = $member['id'];

        // Check the database first so known members cost no HTTP request.
        $existing = $collection->find(['id' => $member_id])->toArray();
        if (isset($existing[0]["name"])) {
            echo $member_id." 已存在".PHP_EOL;
            continue;
        }

        $profile = zhihu_fetch_json("https://www.zhihu.com/api/v4/members/" . $member_id . $member_fields);
        if (!isset($profile['id'])) {
            // Never store a failed transfer or an API error body as a member.
            $reason = isset($profile['error']['message']) ? $profile['error']['message'] : 'no usable response';
            echo $member_id . " fetch failed: " . $reason . PHP_EOL;
            continue;
        }

        $collection->insertOne($profile);
        echo $member_id." 已插入".PHP_EOL;
    }

    // NOTE(review): assumes Zhihu's paging object carries an `is_end` flag on
    // the final page while still returning a `next` URL; without this check the
    // loop would keep requesting empty pages with no delay. If the field is
    // absent the loop continues exactly as before. Confirm against the API.
    if (!empty($page['paging']['is_end'])) {
        break;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment