Created
August 16, 2012 02:58
-
-
Save kazhang/3366012 to your computer and use it in GitHub Desktop.
A simple crawler for Renren.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8
import cookielib
import json
import re
import urllib
import urllib2
class Renren:
    '''A simple crawler for renren.com.

    Written for Python 2 (urllib2/cookielib). Every public method performs
    HTTP requests through the opener installed by __init__; the pure
    text-parsing steps live in the _parse* static helpers.
    '''

    # Browser-like headers sent with every request.
    header={'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.75 Safari/537.1'}
    email=''
    password=''

    def __init__(self, email, passwd, needProxy=False, cookieFile='./cookie.dat',
                 proxyUrl='http://zakir:zakir@10.214.52.16:808'):
        '''Store the credentials and build the cookie-aware opener.

        @param email      account e-mail used by login()
        @param passwd     account password used by login()
        @param needProxy  when true, route HTTP traffic through proxyUrl
        @param cookieFile path the cookie jar is saved to after requests
        @param proxyUrl   HTTP proxy URL, only used when needProxy is true
                          (please change to your own proxy setting)

        NOTE: the opener is installed globally with urllib2.install_opener,
        so it affects every later urllib2.urlopen call in the process.
        '''
        self.email = email
        self.password = passwd
        self.cookieFile = cookieFile
        self.cookie = cookielib.LWPCookieJar()
        handlers = []
        if needProxy:
            handlers.append(urllib2.ProxyHandler({'http': proxyUrl}))
        handlers.append(urllib2.HTTPCookieProcessor(self.cookie))
        urllib2.install_opener(urllib2.build_opener(*handlers))

    def _fetch(self, url, data=None, saveCookie=True):
        '''Request url with the class headers and return the raw body.

        @param data       url-encoded POST body, or None for a GET request
        @param saveCookie when true, save the cookie jar to self.cookieFile
                          once the body has been read
        '''
        request = urllib2.Request(url=url, data=data, headers=self.header)
        response = urllib2.urlopen(request)
        try:
            result = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        if saveCookie:
            self.cookie.save(self.cookieFile)
        return result

    @staticmethod
    def _parseMyFriendList(page):
        '''Extract [(ID, name), ...] from the "friends=[{...}]" payload of page.

        Returns an empty list when the payload is not present.
        '''
        found = re.search(r'friends=\[{.*}\]', page)
        if found is None:
            return []
        return re.findall(r'"id":(.*?),.*?,"name":"(.*?)"', found.group())

    @staticmethod
    def _parseFriendPage(page):
        '''Extract [(ID, name), ...] from the <dd><a ...id=N">name</a> links.'''
        return re.findall(r'<dd><a href.*?id=(\d+)">(.*?)</a>', page)

    @staticmethod
    def _parseProfile(page, ID):
        '''Build the profile dict for ID from the first <ul class="u..."> list.

        Each <li class="KEY">VALUE</li> inside that list becomes info[KEY]=VALUE
        after <span> tags are stripped. Returns {'ID': ID} alone when the list
        is not found.
        '''
        info = {'ID': ID}
        block = re.search(r'<ul class="u.*?</ul>', page, re.S)
        if block is None:
            return info
        text = re.sub(r'<span.*?>', '', block.group())
        text = re.sub(r'</span>', '', text)
        info.update(re.findall(r'<li class="(.*?)">(.*?)</li>', text))
        return info

    def login(self):
        '''To sign in renren.com.

        Prints the outcome; on failure it also prints the server response and
        raises SystemExit(1).
        Please notice that if you keep failing to login, you are possibly
        banned by Renren. You can use the isBanned() function to detect if you
        are banned and use the getCode() function to handle this manually.
        Good Luck!'''
        code = ''   # no captcha is submitted by default
        postData = urllib.urlencode({
            'email': self.email,
            'password': self.password,
            'origURL': 'http://www.renren.com/home',
            'domain': 'renren.com',
            'captcha_type': 'web_login',
            'key_id': '1',
            'icode': code,
        })
        result = str(self._fetch('http://www.renren.com/ajaxLogin/login', postData))
        if "true" in result:
            print('Login successfully!')
        else:
            print('Failed to login')
            print(result)
            raise SystemExit(1)

    def getCode(self):
        '''To get the captcha on login page.

        Downloads the captcha image to tmp.jpg in the working directory and
        asks the user to type it in.
        @return the text entered by the user'''
        # Load the home page first, as the original flow does, before asking
        # for the captcha image.
        self._fetch('http://www.renren.com/')
        # NOTE(review): "Math.random()" is sent literally as the rnd value;
        # it is kept unchanged here.
        image = self._fetch('http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()')
        # The image is binary data, so it must be written in binary mode.
        with open('tmp.jpg', 'wb') as fl:
            fl.write(image)
        return raw_input('Input the damn captcha->')

    def isBanned(self):
        '''To check if be banned by Renren.

        @return True when the ShowCaptcha endpoint answers '1', else False'''
        postData = urllib.urlencode({
            'email': 'test',
            'password': '',
            'icode': '',
            'origURL': 'http://www.renren.com/home',
            'domain': 'renren.com',
            'key_id': '1',
            'captcha_type': 'web_login',
        })
        # The cookie jar is deliberately not saved for this probe request.
        result = self._fetch('http://www.renren.com/ajax/ShowCaptcha', postData,
                             saveCookie=False)
        return result == '1'

    def getMyFriendList(self):
        '''To get the friendlist of current user.

        @return [(ID1,name1),(ID2,name2),...]; empty when the response holds
                no friend data'''
        result = self._fetch('http://friend.renren.com/myfriendlistx.do')
        return self._parseMyFriendList(result)

    def getFriendListOnPage(self, ID, page=0):
        '''To get the friend list of user with required ID.

        @param ID   user id (string or integer)
        @param page zero-based page number
        @return [(ID1,name1),(ID2,name2)...]'''
        # str() rather than repr(): repr would add quotes or an "L" suffix.
        url = ('http://friend.renren.com/GetFriendList.do?curpage=' + str(page)
               + '&id=' + str(ID))
        return self._parseFriendPage(self._fetch(url))

    def getStatusOnPage(self, ID, page=0):
        '''To get the status of user with required ID.

        @param ID   user id (string or integer)
        @param page zero-based page number
        @return the 'doingArray' entry of the decoded JSON response'''
        url = ('http://status.renren.com/GetSomeomeDoingList.do?userId=' + str(ID)
               + '&curpage=' + str(page))
        data = json.loads(self._fetch(url))
        return data['doingArray']

    def getProfile(self, ID):
        '''To get the profile of user with required ID.

        @param ID user id (string or integer)
        @return {'ID': ID, <li class>: <li text>, ...}; only the 'ID' key
                when no profile list is found on the page'''
        result = self._fetch('http://www.renren.com/' + str(ID) + '/profile')
        return self._parseProfile(result, ID)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment