Skip to content

Instantly share code, notes, and snippets.

@seckcoder
Last active December 10, 2015 06:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save seckcoder/4397413 to your computer and use it in GitHub Desktop.
Save seckcoder/4397413 to your computer and use it in GitHub Desktop.
Crawl a Weibo repost timeline asynchronously in Python. Requires gevent and the Sina Weibo Python client (https://github.com/michaelliao/sinaweibopy) to be installed.
import math
import gevent
import gevent.greenlet
from gevent import monkey
from functools import partial
monkey.patch_all()
from weibo import APIClient
import time
class ActionCounter(object):
    """Hold two plain counters, ``count`` and ``finished``.

    The class is not referenced elsewhere in the visible code; presumably it
    was meant to track started versus completed crawl actions (unconfirmed).

    Args:
        count: Initial value stored in ``self.count`` (default 0).
        finished: Initial value stored in ``self.finished`` (default 0).
    """

    def __init__(self, count=0, finished=0):
        self.count = count
        self.finished = finished
def main(mid, access_token, max_depth=5):
    """Crawl the repost tree of a Weibo status and return the reposts found.

    The base62 ``mid`` is resolved to a status id, the root status is loaded,
    and its repost timeline is fetched page by page. Every repost that has
    not been seen before is recorded and its own repost timeline is crawled
    in turn, until ``max_depth`` is reached.

    Args:
        mid: Base62 ``mid`` of the root status (sent to ``statuses/queryid``).
        access_token: Access token handed to the Weibo API client.
        max_depth: Depth at which crawling stops. The root is crawled at
            depth 1 and a status is skipped once its depth is >= this value.

    Returns:
        A dict keyed by repost id. Each value is a dict with the keys
        ``"id"``, ``"text"`` and ``"reposts_count"``. The root status itself
        is not added to the result.
    """
    client = APIClient(None, None)
    # Second argument is presumably the token expiry timestamp
    # (now + 3600 * 60 seconds) -- confirm against the client library.
    client.set_access_token(access_token, time.time() + 3600 * 60)
    weibo_id = client.get.statuses__queryid(
        access_token=access_token, mid=mid, type=1, isBase62=1)['id']
    root_tweet = client.get.statuses__show(
        access_token=access_token, id=weibo_id)
    nreposts_per_page = 200
    tweets = {}

    def get_reposts(tweet_id, page, callback=None):
        """Fetch one page of reposts of ``tweet_id`` and pass it to ``callback``."""
        # Best effort: a failure on one page (in the request or in the
        # callback) is reported and must not abort the other pages.
        try:
            reply = client.get.statuses__repost_timeline(
                id=tweet_id, count=nreposts_per_page, page=page)
            if callback:
                callback(reply)
        except Exception as e:
            print("Exception %s happended" % e)

    def add_tweet(tweet):
        """Record ``tweet`` if it is new; return True only when it was added."""
        if tweet.id in tweets or "reposts_count" not in tweet:
            return False
        tweets[tweet.id] = {
            "id": tweet.id,
            "text": tweet.text,
            "reposts_count": tweet.reposts_count,
        }
        return True

    def crawl_callback(depth, reply):
        """Store every new repost in ``reply`` and descend into it."""
        if reply and "reposts" in reply:
            for repost in reply["reposts"]:
                if add_tweet(repost):
                    crawl_reposts(repost, depth + 1)

    def crawl_reposts(tweet, depth):
        """Fetch all repost pages of ``tweet`` concurrently."""
        if depth >= max_depth:
            return
        if not (tweet and "reposts_count" in tweet):
            return
        total_reposts_count = tweet.reposts_count
        page_num = int(
            math.ceil(float(total_reposts_count) / nreposts_per_page))
        # NOTE(review): statuses with exactly one repost are skipped by the
        # ">= 2" test -- confirm that this is intended.
        if page_num > 0 and total_reposts_count >= 2:
            # The callback is invoked directly by the worker rather than via
            # Greenlet.link, because spawning greenlets from a link handler
            # did not seem to work.
            # Query the timeline of *this* tweet (not always the root) so
            # that the recursion actually walks down the repost tree.
            on_page = partial(crawl_callback, depth)
            gs = [gevent.spawn(get_reposts, tweet.id, page, on_page)
                  for page in range(1, page_num + 1)]
            gevent.joinall(gs)

    crawl_reposts(root_tweet, 1)
    return tweets
if __name__ == "__main__":
    # Example run: crawl one hard-coded status and report how many reposts
    # were collected and how long the crawl took.
    start_time = time.time()
    mid = "zbQaYdw5V"
    access_token = "xxx"  # placeholder -- replace with a real access token
    print("This will take some time...")
    try:
        tweets = main(mid, access_token)
    except Exception as e:
        # Top-level boundary: show the error message instead of a traceback.
        print(e)
    else:
        print("Total number of tweets: %s" % len(tweets))
        print("Take %s seconds" % (time.time() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment