Skip to content

Instantly share code, notes, and snippets.

@kazhang
Created August 16, 2012 02:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kazhang/3366012 to your computer and use it in GitHub Desktop.
Save kazhang/3366012 to your computer and use it in GitHub Desktop.
A simple crawler for Renren.com
#coding:utf-8
import urllib,urllib2,cookielib,re,json
class Renren:
    '''A simple crawler for renren.com (written for Python 2: urllib2/cookielib).

    Constructing an instance installs a process-wide urllib2 opener that
    carries this instance's cookie jar, so every request made afterwards
    through urllib2.urlopen shares the session. Most requests made by the
    methods below re-save the cookie jar to ``cookieFile``.
    '''

    # Browser-like User-Agent sent with every request.
    header = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.75 Safari/537.1'}
    email = ''
    password = ''
    # Please change to your own proxy setting (only used when needProxy is
    # true). NOTE(review): credentials are embedded in this URL; override the
    # attribute on the class or instance rather than committing real secrets.
    proxyUrl = 'http://zakir:zakir@10.214.52.16:808'

    def __init__(self, email, passwd, needProxy=False, cookieFile='./cookie.dat'):
        '''Store the credentials and install a cookie-aware urllib2 opener.

        email      -- account e-mail posted by login()
        passwd     -- account password posted by login()
        needProxy  -- when true, HTTP traffic is routed through ``proxyUrl``
        cookieFile -- path the cookie jar is saved to after requests
        '''
        self.email = email
        self.password = passwd
        self.cookieFile = cookieFile
        self.cookie = cookielib.LWPCookieJar()
        handlers = [urllib2.HTTPCookieProcessor(self.cookie)]
        if needProxy:
            # Keep the proxy handler first, as in the original opener setup.
            handlers.insert(0, urllib2.ProxyHandler({'http': self.proxyUrl}))
        urllib2.install_opener(urllib2.build_opener(*handlers))

    def _fetch(self, url, postData=None, saveCookie=True):
        '''Request ``url`` with the shared headers and return the raw body.

        A dict given as ``postData`` is form-encoded and sent as a POST;
        without it the request is a GET. The cookie jar is written to
        ``cookieFile`` afterwards unless ``saveCookie`` is false.
        '''
        if postData is not None:
            postData = urllib.urlencode(postData)
        request = urllib2.Request(url=url, data=postData, headers=self.header)
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        if saveCookie:
            self.cookie.save(self.cookieFile)
        return body

    @staticmethod
    def _parseMyFriends(page):
        '''Extract (id, name) string pairs from the "friends=[{...}]" blob in ``page``.'''
        blob = re.search(r'friends=\[{.*}\]', page)
        if blob is None:
            # Page did not contain the friend data (e.g. not logged in).
            return []
        return re.findall(r'"id":(.*?),.*?,"name":"(.*?)"', blob.group())

    @staticmethod
    def _parseFriendPage(page):
        '''Extract (id, name) string pairs from the <dd><a ...id=N">name</a> links in ``page``.'''
        return re.findall(r'<dd><a href.*?id=(\d+)">(.*?)</a>', page)

    @staticmethod
    def _parseProfile(ID, page):
        '''Build the profile dict for ``ID`` from the first <ul class="u..."> list in ``page``.'''
        profile = {'ID': ID}
        section = re.search(r'<ul class="u.*?</ul>', page, re.S)
        if section is None:
            return profile
        # Drop <span> wrappers so each <li> body is plain text.
        text = re.sub(r'<span.*?>', '', section.group())
        text = re.sub(r'</span>', '', text)
        # Each <li class="KEY">VALUE</li> becomes profile[KEY] = VALUE.
        profile.update(re.findall(r'<li class="(.*?)">(.*?)</li>', text))
        return profile

    def login(self, code=''):
        '''Sign in to renren.com with the stored e-mail and password.

        code -- captcha text sent as the 'icode' form field (default: empty).

        Prints the outcome. When the response does not contain "true" the
        response is printed and SystemExit(1) is raised.

        Please notice that if you keep failing to login, you are possibly
        banned by Renren. You can use isBanned() to detect that, and
        getCode() to fetch the captcha and pass its text in as ``code``.
        Good Luck!
        '''
        result = self._fetch('http://www.renren.com/ajaxLogin/login', {
            'email': self.email,
            'password': self.password,
            'origURL': 'http://www.renren.com/home',
            'domain': 'renren.com',
            'captcha_type': 'web_login',
            'key_id': '1',
            'icode': code,
        })
        result = str(result)
        # Single-argument print(...) behaves the same as the print statement.
        if "true" in result:
            print('Login successfully!')
        else:
            print('Failed to login')
            print(result)
            # Same effect as exit(1), without depending on the site module.
            raise SystemExit(1)

    def getCode(self):
        '''Download the login captcha to tmp.jpg and ask the user to type it.

        Returns the text entered on stdin.
        '''
        # Visit the home page first; its response is not used, only whatever
        # cookies it sets.
        self._fetch('http://www.renren.com/')
        # 'Math.random()' is sent literally, exactly as the original code did.
        image = self._fetch('http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()')
        # The captcha is binary image data: write it in binary mode and close
        # the file even if the write fails.
        with open('tmp.jpg', 'wb') as fl:
            fl.write(image)
        return raw_input('Input the damn captcha->')

    def isBanned(self):
        '''Check whether Renren currently treats this client as banned.

        Posts a dummy login form to the ShowCaptcha endpoint and returns True
        when the response body is exactly '1', False otherwise. Unlike the
        other requests, this one does not re-save the cookie file.
        '''
        result = self._fetch('http://www.renren.com/ajax/ShowCaptcha', {
            'email': 'test',
            'password': '',
            'icode': '',
            'origURL': 'http://www.renren.com/home',
            'domain': 'renren.com',
            'key_id': '1',
            'captcha_type': 'web_login',
        }, saveCookie=False)
        return result == '1'

    def getMyFriendList(self):
        '''Get the friend list of the current user.

        @return [(ID1,name1),(ID2,name2),...] as string pairs; an empty list
        when the page contains no friend data.
        '''
        page = self._fetch('http://friend.renren.com/myfriendlistx.do')
        return self._parseMyFriends(page)

    def getFriendListOnPage(self, ID, page=0):
        '''Get one page of the friend list of the user with the given ID.

        ID and page may be strings or integers.
        @return [(ID1,name1),(ID2,name2),...] as string pairs
        '''
        # str() rather than repr(): repr adds an 'L' suffix to longs and
        # quotes to strings, either of which would corrupt the URL.
        url = 'http://friend.renren.com/GetFriendList.do?curpage=' + str(page) + '&id=' + str(ID)
        return self._parseFriendPage(self._fetch(url))

    def getStatusOnPage(self, ID, page=0):
        '''Get one page of statuses of the user with the given ID.

        ID and page may be strings or integers.
        @return the 'doingArray' entry of the decoded JSON response
        '''
        url = 'http://status.renren.com/GetSomeomeDoingList.do?userId=' + str(ID) + '&curpage=' + str(page)
        data = json.loads(self._fetch(url))
        return data['doingArray']

    def getProfile(self, ID):
        '''Get the profile of the user with the given ID.

        @return a dict holding 'ID' (as passed in) plus one entry per
        <li class="KEY">VALUE</li> found in the profile list; only 'ID' when
        the list is missing from the page.
        '''
        page = self._fetch('http://www.renren.com/' + str(ID) + '/profile')
        return self._parseProfile(ID, page)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment