Last active
August 29, 2015 14:04
-
-
Save bigeagle/a7083f292eded54217db to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# -*- coding:utf-8 -*- | |
import requests | |
import re | |
import time | |
import json | |
from pyquery import PyQuery as pq | |
from urlparse import urlparse, parse_qs | |
# Path template for per-page JSON backups; {} is filled with the page number.
backup_file = "backup/page_{}.json"
# Mobile (3g) Renren site root; also used as the Referer header below.
base_url = "http://3g.renren.com/"
# Status-feed listing endpoint; {} slots are: session id (sid), page number.
status_list_url = base_url + "/status/getdoing.do?sid={}&curpage={}"
# Static request headers mimicking a desktop Firefox browser.
headers = {
    "Host": "3g.renren.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Referer": base_url,
}
# Renren account credentials -- fill these in before running the script.
login_info = {
    "email": "",
    "password": "",
}
def login():
    """Log in to 3g.renren.com and return the session id (sid).

    Fetches the login page, fills the login form's hidden fields plus the
    credentials from ``login_info``, posts it, and extracts ``sid`` from the
    query string of the URL we are finally redirected to.

    Returns:
        str: the session id to embed in subsequent request URLs.

    Raises:
        RuntimeError: if the post-login URL carries no ``sid`` parameter
            (i.e. the login did not succeed).
    """
    login_page = requests.get(base_url).content
    dom = pq(login_page)
    login_action_url = dom(".sec > form").attr("action")
    # Carry over every input of the form (including hidden anti-CSRF fields),
    # then overlay our credentials on top.
    login_data = {pq(x).attr("name"): pq(x).attr("value")
                  for x in dom(".sec > form input")}
    login_data.update(login_info)
    r = requests.post(login_action_url, headers=headers, data=login_data)
    # On success Renren redirects to a URL containing ?sid=...; parse it out.
    # (The original also read r.history[0].cookies into an unused local,
    # which would raise IndexError when no redirect happened -- removed.)
    queries = parse_qs(urlparse(r.url).query)
    if 'sid' not in queries:
        raise RuntimeError("login failed: no sid in redirect URL %r" % r.url)
    return queries['sid'][0]
def next_page(dom):
    """Return the href of the 'next page' link inside *dom*, or None if absent."""
    link = dom.find(u"a[title='下一页']")
    return link.attr("href")
def read_replies(reply_list, replies_url):
    """Collect all replies of one status across its paginated reply pages.

    Each reply is appended to *reply_list* as ``{"time": ..., "content": ...}``
    after stripping the author link and decorations from the timestamp node.

    Args:
        reply_list: list to append reply dicts to (mutated in place).
        replies_url: URL of the first reply page.

    Returns:
        The same *reply_list*, for convenience.

    Note: the original implementation recursed once per reply page, so a very
    long thread could exhaust the recursion limit; this version iterates.
    """
    url = replies_url
    while url is not None:
        r = requests.get(url)
        dom = pq(r.content)
        for reply_dom in dom(".list > div:not(.l)").items():
            # Drop the author link and decorations inside the time node,
            # then detach the time node itself so only the body text remains.
            reply_dom.children(".time").children("a").remove()
            reply_dom.children(".time").children("em").remove()
            t = reply_dom.children(".time").remove().text()
            reply_list.append({"time": t, "content": reply_dom.text()})
        # The pager lives in the .l div; None means this was the last page.
        url = next_page(dom(".list > div.l"))
    return reply_list
if __name__ == "__main__": | |
sid = login() | |
last_page = False | |
page = 0 | |
status_list_url = status_list_url.format(sid, page) | |
while not last_page: | |
page += 1 | |
r = requests.get(status_list_url) | |
dom = pq(r.content) | |
next_page_url = next_page(dom(".list > div.l")) | |
if next_page_url is None: | |
last_page = True | |
else: | |
print dom(".list > div.l > span.gray").text() | |
status_list = [] | |
for i, status_dom in enumerate(dom(".list > div:not(.l)").items()): | |
_links = status_dom.children("a") | |
replies_dom = status_dom.children("a").eq(len(_links)-3) | |
has_replies = re.match(r".*\d", replies_dom.text()) | |
replies_url = replies_dom.attr("href") | |
t = status_dom.children(".time").remove().text() | |
status_dom.children("a").remove() | |
status_dom.children("em").remove() | |
status_text = status_dom.text() | |
if has_replies: | |
replies_list = read_replies([], replies_url) | |
else: | |
replies_list = [] | |
status_list.append({"time": t, "content": status_text, "replies": replies_list}) | |
jcontent = json.dumps(status_list, indent=4, ensure_ascii=False).encode("utf-8") | |
with open(backup_file.format(page), 'w') as f: | |
f.write(jcontent) | |
time.sleep(3) | |
print "完了" | |
# vim: ts=4 sw=4 sts=4 expandtab |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment