Skip to content

Instantly share code, notes, and snippets.

Last active December 10, 2015 06:58
Show Gist options
  • Save seckcoder/4397413 to your computer and use it in GitHub Desktop.
Save seckcoder/4397413 to your computer and use it in GitHub Desktop.
Crawl a weibo's repost timeline asynchronously in Python. It requires gevent and the Sina Weibo Python client (the `weibo` module that provides `APIClient`) to be installed.
import math
import gevent
import gevent.greenlet
from gevent import monkey
from functools import partial
from weibo import APIClient
import time
class ActionCounter:
    """Plain holder for two numbers, ``count`` and ``finished``.

    NOTE(review): presumably meant to track started vs. completed crawl
    actions, but no visible code reads or updates an instance -- confirm
    before relying on it.
    """

    def __init__(self, count=0, finished=0):
        """Store the initial values.

        Args:
            count: initial value of ``self.count`` (default 0).
            finished: initial value of ``self.finished`` (default 0).
        """
        self.count = count
        self.finished = finished
def main(mid, access_token):
    """Crawl the reposts of one weibo and return the ones collected.

    Args:
        mid: base62 "mid" string of the root weibo; it is resolved to an id
            with ``statuses__queryid`` (``isBase62=1``).
        access_token: access token handed to the weibo ``APIClient``.

    Returns:
        dict mapping repost id -> ``{"text": ..., "reposts_count": ...}``.
        The root weibo itself is never added, only entries found in
        ``reply["reposts"]``.

    Raises:
        Whatever the client raises for the two initial lookups
        (``statuses__queryid`` / ``statuses__show``); those calls are not
        guarded. Errors while fetching repost pages are printed and
        swallowed inside ``get_reposts``.

    NOTE(review): this file imports ``gevent.monkey`` but no patch call is
    visible. If the client does blocking I/O and nothing patches it, the
    spawned greenlets would presumably run one after another -- confirm.
    """
    client = APIClient(None, None)
    # Expiry is set 3600 * 60 seconds (60 hours) from now.
    client.set_access_token(access_token, time.time() + 3600 * 60)
    weibo_id = client.get.statuses__queryid(
        access_token=access_token, mid=mid, type=1, isBase62=1)['id']
    root_tweet = client.get.statuses__show(
        access_token=access_token, id=weibo_id)
    nreposts_per_page = 200
    tweets = {}

    def get_reposts(page, callback=None):
        """Fetch one repost-timeline page and hand the reply to *callback*.

        Any exception (from the request or from the callback) is printed
        and swallowed so one failing page does not abort the crawl.
        """
        # NOTE(review): every request uses id=weibo_id (the root weibo), even
        # when crawl_reposts() was called for a nested repost. That looks
        # unintended -- confirm whether the repost's own id should be used.
        try:
            reply = client.get.statuses__repost_timeline(
                id=weibo_id, count=nreposts_per_page, page=page)
            if callback:
                callback(reply)
        except Exception as e:
            print("Exception %s happended" % e)

    def add_tweet(tweet):
        """Record *tweet* once; return True only when it was newly added."""
        # NOTE(review): the dict key was lost from the source text
        # ("tweets.has_key(" / "tweets[]"); tweet.id is the assumed key --
        # confirm against the original gist.
        if tweet.id not in tweets and "reposts_count" in tweet:
            tweets[tweet.id] = {
                "text": tweet.text,
                "reposts_count": tweet.reposts_count,
            }
            return True
        return False

    def crawl_callback(depth, reply):
        """Store each repost in *reply* and recurse into the new ones."""
        if reply and "reposts" in reply:
            for repost in reply["reposts"]:
                # Only descend into reposts not seen before; this is what
                # stops the recursion from revisiting the same entries.
                if add_tweet(repost):
                    crawl_reposts(repost, depth + 1)

    def crawl_reposts(tweet, depth):
        """Spawn one greenlet per repost page of *tweet* and wait for them."""
        # The root is crawled at depth 1; nothing is fetched at depth >= 5.
        if depth >= 5:
            return
        if not (tweet and "reposts_count" in tweet):
            return
        total_reposts_count = tweet.reposts_count
        page_num = int(
            math.ceil(float(total_reposts_count) / nreposts_per_page))
        if page_num > 0 and total_reposts_count >= 2:
            # The callback is passed straight to get_reposts instead of
            # using Greenlet.link, because spawning greenlets from a link
            # callback did not seem to work (original author's note).
            gs = [
                gevent.spawn(get_reposts, page, partial(crawl_callback, depth))
                for page in range(1, page_num + 1)
            ]
            # NOTE(review): the visible source builds ``gs`` but never waits
            # on it; without a join, main() would return before any spawned
            # greenlet runs. joinall is the assumed missing line -- confirm.
            gevent.joinall(gs)

    crawl_reposts(root_tweet, 1)
    return tweets
if __name__ == "__main__":
    # Script entry point: crawl one hard-coded weibo, then report how many
    # reposts were collected and how long the crawl took.
    now = time.time()
    mid = "zbQaYdw5V"
    # Presumably a placeholder; a real access token is needed to run this.
    access_token = "xxx"
    print("This will take some time...")
    # NOTE(review): the source shows an ``except`` with no matching ``try``;
    # the try is assumed to wrap the crawl and the two report lines.
    try:
        tweets = main(mid, access_token)
        # Single %-formatted argument keeps the output identical under both
        # the Python 2 print statement and the Python 3 print function.
        print("Total number of tweets: %s" % len(tweets))
        print("Take %s seconds" % (time.time() - now))
    except Exception as e:
        # Top-level boundary of the script: report the failure and exit.
        print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment