Skip to content

Instantly share code, notes, and snippets.

@kaid
Last active June 6, 2019 04:08
Show Gist options
  • Save kaid/9931456 to your computer and use it in GitHub Desktop.
Save kaid/9931456 to your computer and use it in GitHub Desktop.
知乎热门问题抓取脚本
//==== Displayer ====
/**
 * Floating overlay that lists collected questions together with their
 * follower and answer counts, plus a small badge showing how many rows are
 * currently in the list.
 *
 * Constructing a Displayer prepends the panel markup to <body>, caches the
 * list and badge elements, and fades the panel in.
 */
var Displayer = function() {
  this.$el = jQuery("body").prepend(
    "<div class=\"hotqs-list\">" +
      "<div class=\"head\">" +
        "<div>关注</div>" +
        "<div>回答</div>" +
      "</div>" +
      "<div class=\"loading\">开始抓取问题</div>" +
      "<ul></ul>" +
    "</div>"
  ).find(".hotqs-list");
  this.$list = this.$el.find("ul");
  // The badge must be closed with </div>; an unclosed <div> would nest a
  // second, empty div inside the badge.
  this.$count = this.$el.append("<div class=\"hotqs-count\">0</div>").find(".hotqs-count");
  this.$el.fadeIn();
};

/** Removes the "loading" placeholder from the panel. */
Displayer.prototype.start = function() {
  this.$el.find(".loading").remove();
};

/** Refreshes the badge with the current number of rows. */
Displayer.prototype.increment_count = function() {
  this.$count.text(this.count());
};

/**
 * @returns {number} Number of rows currently in the list.
 */
Displayer.prototype.count = function() {
  return this.$list.children().length;
};

/** Slides the last row up and removes it once the animation completes. */
Displayer.prototype.remove_lowest = function() {
  this.$list.children().last().slideUp(function() {
    jQuery(this).remove();
  });
};

/**
 * Adds a row for `question` in front of the row currently at `index`.
 *
 * @param {{url: string, title: string, followers: number, answers: number}} question
 *   Record to render.
 * @param {number} index
 *   Position of the existing row to insert before. Pass -1 to append; any
 *   index without a matching row is also appended rather than dropped.
 */
Displayer.prototype.insert_before = function(question, index) {
  var $li = jQuery("<li></li>").hide();
  var $anchor = jQuery("<a target=\"_blank\"></a>").attr("href", question.url).text(question.title);
  var $followers = jQuery("<div></div>").text(question.followers);
  var $answers = jQuery("<div></div>").text(question.answers);
  $li.append($anchor, $followers, $answers);

  // Undefined for -1, for an empty list, and for any out-of-range index.
  var $sibling = this.$list.children()[index];
  if (!$sibling) {
    this.start();
    this.$list.append($li);
  } else {
    $li.insertBefore($sibling);
  }

  $li.slideDown();
  this.increment_count();
  if (this.count() > 64) this.remove_lowest();
};
//==== Queue ====
/**
 * Serial, throttled runner for jQuery.ajax settings objects.
 *
 * Tasks are executed one at a time with a fixed pause before each request.
 * Instance state:
 *   tasks      - pending ajax settings, next task first
 *   waiting    - true while no exec chain is running
 *   completed  - number of finished requests (successful or not)
 *   stopped    - true after stop(), until start() is called
 *   started_at - creation time, used to compute `time`
 *   time       - elapsed minutes since creation as a 2-decimal string,
 *                updated after every finished request
 *
 * The pause flag and the creation time live in `stopped` / `started_at` so
 * they do not shadow the stop() and start() methods on the prototype.
 */
var Queue = function() {
  this.tasks = [];
  this.waiting = true;
  this.completed = 0;
  this.stopped = false;
  this.started_at = new Date();
};

/**
 * Adds a task and starts processing if the queue is idle.
 *
 * @param {Object} ajax_setting Settings object handed to jQuery.ajax.
 * @param {{priority: string}} [options] Pass {priority: "high"} to put the
 *   task at the front of the queue instead of the back.
 * @returns {Queue} This queue, for chaining.
 */
Queue.prototype.enqueue = function(ajax_setting, options) {
  if (options && "high" === options.priority) {
    this.tasks.unshift(ajax_setting);
  } else {
    this.tasks.push(ajax_setting);
  }
  if (this.waiting) {
    this.waiting = false;
    this.exec();
  }
  return this;
};

/**
 * Runs the next task, then schedules itself again once the request settles.
 * Goes idle (waiting = true) when there is nothing to do or the queue is
 * stopped.
 */
Queue.prototype.exec = function() {
  if (0 === this.tasks.length || this.stopped) {
    this.waiting = true;
    return;
  }
  this.waiting = false;
  var self = this;
  var ajax_setting = this.tasks.shift();
  this.sleep(640, function() {
    jQuery.ajax(ajax_setting).fail(function(xhr) {
      // A 404 is treated as final; every other failure is retried later at
      // normal priority.
      if (404 == xhr.status) return;
      self.enqueue(ajax_setting);
    }).always(function() {
      var now = new Date();
      self.completed++;
      self.time = ((now - self.started_at) / (1000 * 60)).toFixed(2);
      self.exec();
    });
  });
};

/** Pauses the queue; a request already in flight still completes. */
Queue.prototype.stop = function() {
  this.stopped = true;
};

/** Resumes a stopped queue. */
Queue.prototype.start = function() {
  this.stopped = false;
  // Only kick off a new chain when none is running, otherwise two requests
  // would be processed concurrently.
  if (this.waiting) this.exec();
};

/**
 * Calls `fn` after `millis` milliseconds.
 *
 * @param {number} millis Delay in milliseconds.
 * @param {Function} fn Callback invoked without arguments.
 */
Queue.prototype.sleep = function(millis, fn) {
  setTimeout(function() { fn(); }, millis);
};
//==== RequestParamsBuilder ====
/**
 * Builds request parameters for the paginated endpoints used by the
 * fetchers.
 */
var RequetParamsBuilder = {
  /**
   * Parameters for a POST request, including the `_xsrf` token read from
   * document.cookie.
   *
   * @param {string} id Value sent as `start`.
   * @returns {{start: string, offset: number, _xsrf: string}}
   * @throws {Error} When document.cookie has no `_xsrf` entry.
   */
  post_param : function(id) {
    var xsrf;
    var found = document.cookie.split("; ").some(function(pair) {
      // Split on the first "=" only: the value itself may contain "=".
      var eq = pair.indexOf("=");
      if (-1 === eq || "_xsrf" !== pair.slice(0, eq)) return false;
      xsrf = pair.slice(eq + 1);
      return true;
    });
    if (!found) {
      throw new Error("_xsrf cookie not found; cannot build POST parameters");
    }
    return {start: id, offset: 15, _xsrf: xsrf};
  },

  /**
   * Query string for a GET request: a single `params` field holding the
   * JSON-encoded offset and the fixed type "month".
   *
   * @param {number} offset Value sent as `offset`.
   * @returns {string} Result of jQuery.param for the `params` field.
   */
  get_param : function(offset) {
    return jQuery.param({
      params: JSON.stringify({
        offset : offset,
        type : "month"
      })
    });
  }
};
//==== Question ====
/**
 * Plain record describing one question.
 *
 * @param {string} url Question or answer path; only the leading
 *   "/question/<digits>" part is kept. Throws a TypeError when the path
 *   does not start with that pattern.
 * @param {string} title Question title, stored as given.
 * @param {*} followers Follower count; coerced to a 32-bit integer.
 * @param {*} answers Answer count; coerced to a 32-bit integer.
 */
var Question = function(url, title, followers, answers) {
  var question_path = /^(\/question\/[0-9]+)(\/answer\/.+$)?/;
  var canonical_url = url.match(question_path)[1];

  this.url = canonical_url;
  this.title = title;
  // Double bitwise NOT truncates to an integer and maps non-numeric input
  // to 0.
  this.followers = ~~followers;
  this.answers = ~~answers;
};
//==== Recorder ====
/**
 * Keeps up to 64 question records ordered by follower count (highest
 * first) and mirrors every insertion to `window.displayer`.
 *
 * Instance state:
 *   list    - the ordered records
 *   highest - first record of the list after the last insertion
 *   lowest  - last record of the list after the last insertion
 */
var Recorder = function() {
  this.list = [];
};

/**
 * Stores a record, or refreshes the counts of an already known question.
 *
 * A record is ignored when the list is full and it does not beat the
 * current lowest entry. A known question only has its `followers` and
 * `answers` updated; its position is left unchanged.
 *
 * @param {{url: string, followers: number, answers: number}} record
 */
Recorder.prototype.store = function(record) {
  if (this.insufficient(record)) return;

  var found = this.find(record.url);
  if (found) {
    found.followers = record.followers;
    found.answers = record.answers;
    return;
  }

  // Position of the first entry with fewer followers, or -1 when the new
  // record belongs at the end.
  var index = -1;
  for (var i = 0; i < this.list.length; i++) {
    if (this.list[i].followers < record.followers) {
      index = i;
      break;
    }
  }

  if (-1 !== index) {
    this.list.splice(index, 0, record);
  } else {
    this.list.push(record);
  }
  window.displayer.insert_before(record, index);

  if (this.list.length > 64) this.list.pop();
  this.highest = this.list[0];
  this.lowest = this.list[this.list.length - 1];
};

/**
 * @param {{followers: number}} record
 * @returns {*} Truthy when the list is full and `record` has no more
 *   followers than the current lowest entry.
 */
Recorder.prototype.insufficient = function(record) {
  return this.list.length >= 64 && this.lowest && this.lowest.followers >= record.followers;
};

/**
 * Looks up a stored record by question or answer path.
 *
 * @param {string} url Path starting with "/question/<digits>".
 * @returns {Object|undefined} The stored record, or undefined when nothing
 *   matches or `url` is not a question path.
 */
Recorder.prototype.find = function(url) {
  var matched = "string" === typeof url
    ? url.match(/^(\/question\/[0-9]+)(\/answer\/.+$)?/)
    : null;
  // A path that is not a question cannot be in the list.
  if (!matched) return undefined;

  var question_url = matched[1];
  return this.list.filter(function(q) { return q.url === question_url; })[0];
};
//==== QuestionsFetcher ====
/**
 * Collects question links from the explore feed and fetches each question
 * page to record its follower and answer counts.
 *
 * @param {Queue} queue Queue that executes the ajax requests.
 * @param {Recorder} [recorder] Target for fetched questions; falls back to
 *   `window.recorder`.
 * @param {boolean} [deep] When truthy, questions already recorded are
 *   skipped and the followers page of every fetched question is crawled
 *   as well.
 */
var QuestionsFetcher = function(queue, recorder, deep) {
  this.queue = queue;
  this.recorder = recorder || window.recorder;
  this.deep = deep;
  this.feed_url = "http://www.zhihu.com/explore";
  this.more_url = "http://www.zhihu.com/node/ExploreAnswerListV2";
};

/** Requests the feed page, then starts paging through further entries. */
QuestionsFetcher.prototype.run = function() {
  var self = this;
  var ajax_setting = {
    url : this.feed_url,
    type : "GET",
    success : function(res) {
      console.log("start!");
      self.$questions = jQuery(res).find("[data-type=monthly]").children();
      self.more(5);
    }
  };
  this.queue.enqueue(ajax_setting);
};

/**
 * Requests one more page of entries and appends them to `$questions`.
 * Keeps paging in steps of 5 until at least 100 entries are collected or a
 * page comes back empty, then hands over to load().
 *
 * @param {number} offset Offset sent with the request.
 */
QuestionsFetcher.prototype.more = function(offset) {
  var self = this;
  var ajax_setting = {
    url : this.more_url,
    type : "GET",
    data : RequetParamsBuilder.get_param(offset),
    success : function(res) {
      var $more = jQuery(res);
      self.$questions = self.$questions.add($more);
      if (100 <= self.$questions.length || 0 == $more.length) {
        self.load();
        return;
      }
      self.more(offset + 5);
    }
  };
  this.queue.enqueue(ajax_setting);
};

/**
 * Schedules a fetch for every pending question URL, draining
 * `question_urls`. The list is derived from `$questions` unless a caller
 * has already assigned it.
 */
QuestionsFetcher.prototype.load = function() {
  if (!this.question_urls) this.question_urls = this.urls();
  while (this.question_urls.length > 0) {
    this.on(this.question_urls.shift());
  }
};

/**
 * @returns {string[]} The "h2 a" href of every collected entry.
 */
QuestionsFetcher.prototype.urls = function() {
  return this.$questions.map(function() {
    return jQuery(this).find("h2 a").attr("href");
  }).toArray();
};

/**
 * Queues a high-priority request for one question page and records the
 * result.
 *
 * @param {string} url Question or answer path. Anything that is not a
 *   "/question/<digits>" path is skipped, so one unexpected link does not
 *   abort the remaining URLs in load().
 */
QuestionsFetcher.prototype.on = function(url) {
  var matched = "string" === typeof url
    ? url.match(/^(\/question\/[0-9]+)(\/answer\/.+$)?/)
    : null;
  if (!matched) return;

  var question_url = matched[1];
  if (this.deep && this.recorder.find(question_url)) return;

  var self = this;
  var ajax_setting = {
    url : question_url,
    type : "GET",
    success : function(res) {
      var $res = jQuery(res);
      var $anchor = $res.find(".zh-question-followers-sidebar").find("a");
      var title = $res.find("#zh-question-title h2.zm-item-title").text();
      var followers = $anchor.text();
      var answers = $res.find("#zh-question-answer-num").data("num");
      self.recorder.store(new Question(question_url, title, followers, answers));

      // Without a followers link there is nothing to crawl further.
      var followers_url = $anchor.attr("href");
      if (self.deep && followers_url) {
        var followers_fetcher = new FollowersFetcher(followers_url, self.queue);
        followers_fetcher.run();
      }
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};
//==== FollowersFetcher ====
/**
 * Reads the followers page of a question and starts an activities crawl
 * for every follower profile found on it.
 *
 * @param {string} url Address of the followers page.
 * @param {Queue} queue Queue that executes the ajax requests.
 */
var FollowersFetcher = function(url, queue) {
  this.url = url;
  this.queue = queue;
};

/**
 * Queues the request for the followers page; on success the avatar link of
 * each profile card is collected into `follower_urls` and load() is called.
 */
FollowersFetcher.prototype.run = function() {
  var fetcher = this;

  function collect_profile_links(res) {
    var $cards = jQuery(res).find(".zm-profile-card.zm-profile-section-item");
    var $links = $cards.map(function() {
      return jQuery(this).find(".zm-item-link-avatar").attr("href");
    });
    fetcher.follower_urls = $links.toArray();
    fetcher.load();
  }

  this.queue.enqueue({
    url : this.url,
    type : "GET",
    success : collect_profile_links
  });
};

/**
 * Starts a FollowerActivitiesFetcher for each pending profile URL, in
 * order, draining `follower_urls`.
 */
FollowersFetcher.prototype.load = function() {
  while (0 != this.follower_urls.length) {
    var profile_url = this.follower_urls.shift();
    var activities_fetcher = new FollowerActivitiesFetcher(profile_url, this.queue);
    activities_fetcher.run();
  }
};
//==== FollowerActivitiesFetcher ====
/**
 * Pages through the activity items of one user profile and hands the
 * question links found in them to a deep QuestionsFetcher.
 *
 * @param {string} url Address of the profile page.
 * @param {Queue} queue Queue that executes the ajax requests.
 */
var FollowerActivitiesFetcher = function(url, queue) {
  this.url = url;
  this.more_url = url + "/activities";
  this.queue = queue;
  this.questions_fetcher = new QuestionsFetcher(queue);
  this.questions_fetcher.deep = true;
};

/**
 * Queues a high-priority request for the profile page and starts paging
 * from the last activity item on it. A profile without any activity items
 * is skipped.
 */
FollowerActivitiesFetcher.prototype.run = function() {
  var self = this;
  var ajax_setting = {
    url : this.url,
    type : "GET",
    success : function(res) {
      self.$activities = jQuery(res).find(".zm-profile-section-item.zm-item");
      // attr() yields undefined on an empty set; nothing to page through.
      var last_id = self.$activities.last().attr("id");
      if (!last_id) return;
      // The request id is the element id without its first four characters.
      self.more(last_id.slice(4));
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};

/**
 * Requests the next batch of activity items. Paging stops once at least
 * 100 items are collected, the response reports a count of 0, or no last
 * item id is available; the collected question links are then loaded.
 *
 * @param {string} id Id of the last item seen so far, sent as `start`.
 */
FollowerActivitiesFetcher.prototype.more = function(id) {
  var self = this;
  var ajax_setting = {
    url : this.more_url,
    type : "POST",
    data : RequetParamsBuilder.post_param(id),
    success : function(res) {
      self.$activities = self.$activities.add(jQuery(res.msg[1]));
      var last_id = self.$activities.last().attr("id");
      if (100 <= self.$activities.length || 0 == res.msg[0] || !last_id) {
        if (!self.questions_fetcher.question_urls) self.questions_fetcher.question_urls = self.urls();
        self.questions_fetcher.load();
        return;
      }
      self.more(last_id.slice(4));
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};

/**
 * @returns {string[]} The "a.question_link" href of every collected
 *   activity item that has one.
 */
FollowerActivitiesFetcher.prototype.urls = function() {
  return this.$activities.map(function() {
    return jQuery(this).find("a.question_link").attr("href");
  }).toArray().filter(function(i) { return i; });
};
//==== Ready! Steady! Go! ====
// Entry point: once the DOM is ready, inject the panel stylesheet, create
// the queue, displayer and recorder, expose them on `window`, and start a
// deep crawl from the explore feed.
jQuery(function() {
  // Use the `jQuery` name like the rest of the script; `$` is not
  // guaranteed to refer to jQuery on the host page.
  jQuery("head").append(
    "<style>" +
      ".hotqs-list {" +
        "position:fixed;top:0;bottom:0;left:0;right:0;margin:auto;display:none;z-index:9999;" +
        "height:480px;width:640px;" +
        "border:8px solid #ccc;background:white;" +
        "color:#888;font-size:20px;" +
        "box-shadow:0 0 16px 0px #888;" +
      "}" +
      ".hotqs-list ul {" +
        "height:432px;width:100%;" +
        "overflow:auto;overflow-x:hidden;" +
      "}" +
      ".hotqs-list .head, .hotqs-list ul li {" +
        "height:48px;width:624px;padding:0 8px 0 8px;" +
        "border-bottom:1px solid #ddd;" +
      "}" +
      ".hotqs-list ul li a {" +
        "padding:8px 0 8px 0;display:inline-block;height:32px;width:400px;text-overflow:ellipsis;overflow:hidden;white-space:nowrap;" +
      "}" +
      ".hotqs-list .head div, .hotqs-list ul li div {" +
        "height:32px;width:80px;float:right;margin-right:12px;padding:8px 0 8px 0;text-align:right;font-family:monospace;" +
      "}" +
      ".hotqs-list .hotqs-count {" +
        "position:absolute;right:-32px;top:-8px;display:inline-block;" +
        "height:24px;width:24px;" +
        "background:orange;color:white;font-size:14px;" +
        "text-align:center;" +
      "}" +
      ".hotqs-list .loading {" +
        "width:100%;margin-top:160px;text-align:center;" +
      "}" +
    "</style>"
  );

  var queue = new Queue();
  var displayer = new Displayer();
  var recorder = new Recorder();
  var question_fetcher = new QuestionsFetcher(queue, recorder, true);

  // Recorder.store and the nested fetchers reach these through `window`,
  // so they must be published before the crawl starts.
  window.displayer = displayer;
  window.recorder = recorder;
  window.queue = queue;

  question_fetcher.run();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment