#!/usr/bin/env python2
# -*- coding:utf-8 -*-
# Back up Renren statuses (and their replies) via the mobile site 3g.renren.com,
# writing one JSON file per page of the status list.
import requests
import re
import time
import json
from pyquery import PyQuery as pq
from urlparse import urlparse, parse_qs
# one JSON file per page of statuses
backup_file = "backup/page_{}.json"
base_url = "http://3g.renren.com/"
# mobile-site status list, parameterised by session id and page number
status_list_url = base_url + "status/getdoing.do?sid={}&curpage={}"
headers = {
    "Host": "3g.renren.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Referer": base_url,
}
# fill in your Renren credentials before running
login_info = {
    "email": "",
    "password": "",
}

def login():
    # fetch the login page and submit its form with the configured credentials
    loginPage = requests.get(base_url).content
    dom = pq(loginPage)
    login_action_url = dom(".sec > form").attr("action")
    # start from the form's hidden fields, then fill in email/password
    login_data = {pq(x).attr("name"): pq(x).attr("value") for x in dom(".sec > form input")}
    for k, v in login_info.iteritems():
        login_data[k] = v
    r = requests.post(login_action_url, headers=headers, data=login_data)
    cookies = r.history[0].cookies  # unused; the rest of the script relies on sid instead
    # the session id is carried in the redirect URL's query string
    queries = parse_qs(urlparse(r.url).query)
    sid = queries['sid'][0]
    return sid

def next_page(dom):
    # href of the "下一页" ("next page") link, or None on the last page
    return dom.find(u"a[title='下一页']").attr("href")

def read_replies(reply_list, replies_url):
    # collect all replies of one status, following "next page" links recursively
    r = requests.get(replies_url)
    dom = pq(r.content)
    for reply_dom in dom(".list > div:not(.l)").items():
        # strip links and emphasis from the timestamp, keep only its text
        reply_dom.children(".time").children("a").remove()
        reply_dom.children(".time").children("em").remove()
        t = reply_dom.children(".time").remove().text()
        reply_text = reply_dom.text()
        reply_list.append({"time": t, "content": reply_text})

    next_page_url = next_page(dom(".list > div.l"))
    if next_page_url is None:
        return reply_list
    else:
        return read_replies(reply_list, next_page_url)

if __name__ == "__main__":
    sid = login()
    last_page = False
    page = 0
    while not last_page:
        # fetch the current page of the status list (curpage counts from 0)
        r = requests.get(status_list_url.format(sid, page))
        page += 1
        dom = pq(r.content)
        next_page_url = next_page(dom(".list > div.l"))
        if next_page_url is None:
            last_page = True
        else:
            # print the pager text as a progress indicator
            print dom(".list > div.l > span.gray").text()

        status_list = []
        for status_dom in dom(".list > div:not(.l)").items():
            # the third-from-last link of a status holds the reply count and the replies URL
            _links = status_dom.children("a")
            replies_dom = status_dom.children("a").eq(len(_links) - 3)
            # a digit in that link's text means the status has replies
            has_replies = re.match(r".*\d", replies_dom.text())
            replies_url = replies_dom.attr("href")
            # take the timestamp, then drop links/emphasis so only the status text remains
            t = status_dom.children(".time").remove().text()
            status_dom.children("a").remove()
            status_dom.children("em").remove()
            status_text = status_dom.text()
            if has_replies:
                replies_list = read_replies([], replies_url)
            else:
                replies_list = []
            status_list.append({"time": t, "content": status_text, "replies": replies_list})

        jcontent = json.dumps(status_list, indent=4, ensure_ascii=False).encode("utf-8")
        with open(backup_file.format(page), 'w') as f:
            f.write(jcontent)
        time.sleep(3)  # be polite to the server

    print "完了"  # "done"

# vim: ts=4 sw=4 sts=4 expandtab
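Usage note (a sketch; the filename backup_renren.py is only an example, the gist itself is unnamed): fill in login_info with a Renren email and password, make sure a backup/ directory exists next to the script, and run it with Python 2 (it depends on the requests and pyquery packages). Each page of statuses, together with the replies of each status, is written to backup/page_N.json, with a three-second pause between pages.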