# kazhang/fetchRenren.py

## fetchRenren.py
#coding:utf-8
import urllib,urllib2,cookielib,re,json

class Renren:
	'''A simple crawler for renren.com, written against Python 2's urllib2/cookielib.

	Constructing an instance installs a process-wide urllib2 opener that carries
	the instance's cookie jar, so every request made afterwards shares its cookies.
	Every public method except isBanned() writes the jar to self.cookieFile after
	its request.
	'''
	# Headers sent with every request; only a browser-like User-Agent is set.
	header={'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.75 Safari/537.1'}
	email=''
	password=''

	def __init__(self,email,passwd,needProxy=False,cookieFile='./cookie.dat',proxyUrl='http://zakir:zakir@10.214.52.16:808'):
		'''Store the credentials and build/install the cookie-aware opener.

		email, passwd -- account credentials used by login().
		needProxy     -- when true, HTTP traffic is routed through proxyUrl.
		cookieFile    -- path the cookie jar is saved to after requests.
		proxyUrl      -- proxy to use when needProxy is true; change the default
		                 to your own proxy setting or pass one explicitly.
		'''
		self.email=email
		self.password=passwd
		self.cookieFile=cookieFile
		self.cookie=cookielib.LWPCookieJar()

		handlers=[urllib2.HTTPCookieProcessor(self.cookie)]
		if needProxy:
			handlers.insert(0,urllib2.ProxyHandler({'http':proxyUrl}))
		# Kept on the instance for inspection; urlopen() uses the installed copy.
		self.opener=urllib2.build_opener(*handlers)
		urllib2.install_opener(self.opener)

	def _fetch(self,url,data=None,saveCookie=True):
		'''Request url (POST when data is given) and return the raw response body.'''
		request=urllib2.Request(url=url,data=data,headers=self.header)
		response=urllib2.urlopen(request)
		try:
			body=response.read()
		finally:
			# Release the connection even if reading fails.
			response.close()
		if saveCookie:
			self.cookie.save(self.cookieFile)
		return body

	def login(self):
		'''Sign in to renren.com with the stored email and password.

		Prints the outcome. On failure the server response is printed and
		SystemExit(1) is raised. If logins keep failing, isBanned() reports
		whether the captcha endpoint answers '1', and getCode() downloads a
		captcha so it can be handled manually.
		'''
		postData=urllib.urlencode({
			'email': self.email,
			'password': self.password,
			'origURL': 'http://www.renren.com/home',
			'domain':'renren.com',
			'captcha_type':'web_login',
			'key_id': '1',
			# No captcha code is supplied by this method.
			'icode':''
			})
		# The cookie jar is saved before the response is inspected.
		result=str(self._fetch('http://www.renren.com/ajaxLogin/login',postData))
		if "true" in result:
			print('Login successfully!')
		else:
			print('Failed to login')
			print(result)
			# Same exception exit(1) raised, without depending on the site module.
			raise SystemExit(1)

	def getCode(self):
		'''Download the login captcha to tmp.jpg and return the code typed by the user.

		The home page is requested first and the captcha second; the cookie
		jar is saved after each request. login() always sends an empty
		'icode', so the returned value is not used by this class itself.
		'''
		self._fetch('http://www.renren.com/')
		# 'rnd=Math.random()' is sent verbatim, exactly as written here.
		image=self._fetch('http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()')
		# Binary mode: the payload is image data, not text.
		with open('tmp.jpg','wb') as fl:
			fl.write(image)
		return raw_input('Input the damn captcha->')

	def isBanned(self):
		'''Return True when the ShowCaptcha endpoint answers with exactly '1'.

		A dummy login form is posted. The cookie jar is not saved by this call.
		NOTE(review): presumably '1' means the site now demands a captcha for
		this client -- confirm against the service.
		'''
		postData=urllib.urlencode({
			'email':'test',
			'password':'',
			'icode':'',
			'origURL':'http://www.renren.com/home',
			'domain':'renren.com',
			'key_id':'1',
			'captcha_type':'web_login'
			})
		result=self._fetch('http://www.renren.com/ajax/ShowCaptcha',postData,saveCookie=False)
		return result=='1'

	@staticmethod
	def _parseMyFriends(page):
		'''Extract [(ID, name), ...] from the "friends=[{...}]" literal in page.'''
		match=re.search(r'friends=\[{.*}\]',page)
		if match is None:
			# No friend data in the response; report no friends rather than
			# failing on None.
			return []
		return re.findall(r'"id":(.*?),.*?,"name":"(.*?)"',match.group())

	def getMyFriendList(self):
		'''Get the friend list of the current user.

		@return [(ID1,name1),(ID2,name2),...]; an empty list when the response
			contains no friend data.'''
		return self._parseMyFriends(self._fetch('http://friend.renren.com/myfriendlistx.do'))

	def getFriendListOnPage(self,ID,page=0):
		'''Get one page of the friend list of the user with the given ID.

		ID and page may be strings or integers; both are formatted with str().
		@return [(ID1,name1),(ID2,name2)...]'''
		# str(), not repr(): repr() would add quotes to strings and 'L' to longs.
		url='http://friend.renren.com/GetFriendList.do?curpage='+str(page)+'&id='+str(ID)
		return re.findall(r'<dd><a href.*?id=(\d+)">(.*?)</a>',self._fetch(url))

	def getStatusOnPage(self,ID,page=0):
		'''Get one page of statuses of the user with the given ID.

		ID and page may be strings or integers; both are formatted with str().
		@return the 'doingArray' entry of the decoded JSON response
		Raises ValueError when the response is not JSON and KeyError when it
		has no 'doingArray' entry.'''
		url='http://status.renren.com/GetSomeomeDoingList.do?userId='+str(ID)+'&curpage='+str(page)
		return json.loads(self._fetch(url))['doingArray']

	@staticmethod
	def _parseProfile(ID,page):
		'''Build the profile dict for ID from the first <ul class="u..."> list in page.'''
		info={'ID':ID}
		block=re.search(r'<ul class="u.*?</ul>',page,re.S)
		if block is None:
			return info
		# Drop <span> wrappers so only the text of each <li> remains.
		text=re.sub('<span.*?>','',block.group())
		text=re.sub('</span>','',text)
		# Each <li class="key">value</li> becomes one entry of the result.
		for key,value in re.findall('<li class="(.*?)">(.*?)</li>',text):
			info[key]=value
		return info

	def getProfile(self,ID):
		'''Get the profile of the user with the given ID.

		@return dict with key 'ID' (the ID as passed in) plus one entry per
			<li class="..."> item found on the profile page; only {'ID': ID}
			when the page has no profile list.'''
		page=self._fetch('http://www.renren.com/'+str(ID)+'/profile')
		return self._parseProfile(ID,page)
	#coding:utf-8
	import urllib,urllib2,cookielib,re,json

	class Renren:
		'''A simple crawler for renren.com, written against Python 2's urllib2/cookielib.

		Constructing an instance installs a process-wide urllib2 opener that carries
		the instance's cookie jar, so every request made afterwards shares its cookies.
		Every public method except isBanned() writes the jar to self.cookieFile after
		its request.
		'''
		# Headers sent with every request; only a browser-like User-Agent is set.
		header={'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.75 Safari/537.1'}
		email=''
		password=''

		def __init__(self,email,passwd,needProxy=False,cookieFile='./cookie.dat',proxyUrl='http://zakir:zakir@10.214.52.16:808'):
			'''Store the credentials and build/install the cookie-aware opener.

			email, passwd -- account credentials used by login().
			needProxy     -- when true, HTTP traffic is routed through proxyUrl.
			cookieFile    -- path the cookie jar is saved to after requests.
			proxyUrl      -- proxy to use when needProxy is true; change the default
			                 to your own proxy setting or pass one explicitly.
			'''
			self.email=email
			self.password=passwd
			self.cookieFile=cookieFile
			self.cookie=cookielib.LWPCookieJar()

			handlers=[urllib2.HTTPCookieProcessor(self.cookie)]
			if needProxy:
				handlers.insert(0,urllib2.ProxyHandler({'http':proxyUrl}))
			# Kept on the instance for inspection; urlopen() uses the installed copy.
			self.opener=urllib2.build_opener(*handlers)
			urllib2.install_opener(self.opener)

		def _fetch(self,url,data=None,saveCookie=True):
			'''Request url (POST when data is given) and return the raw response body.'''
			request=urllib2.Request(url=url,data=data,headers=self.header)
			response=urllib2.urlopen(request)
			try:
				body=response.read()
			finally:
				# Release the connection even if reading fails.
				response.close()
			if saveCookie:
				self.cookie.save(self.cookieFile)
			return body

		def login(self):
			'''Sign in to renren.com with the stored email and password.

			Prints the outcome. On failure the server response is printed and
			SystemExit(1) is raised. If logins keep failing, isBanned() reports
			whether the captcha endpoint answers '1', and getCode() downloads a
			captcha so it can be handled manually.
			'''
			postData=urllib.urlencode({
				'email': self.email,
				'password': self.password,
				'origURL': 'http://www.renren.com/home',
				'domain':'renren.com',
				'captcha_type':'web_login',
				'key_id': '1',
				# No captcha code is supplied by this method.
				'icode':''
				})
			# The cookie jar is saved before the response is inspected.
			result=str(self._fetch('http://www.renren.com/ajaxLogin/login',postData))
			if "true" in result:
				print('Login successfully!')
			else:
				print('Failed to login')
				print(result)
				# Same exception exit(1) raised, without depending on the site module.
				raise SystemExit(1)

		def getCode(self):
			'''Download the login captcha to tmp.jpg and return the code typed by the user.

			The home page is requested first and the captcha second; the cookie
			jar is saved after each request. login() always sends an empty
			'icode', so the returned value is not used by this class itself.
			'''
			self._fetch('http://www.renren.com/')
			# 'rnd=Math.random()' is sent verbatim, exactly as written here.
			image=self._fetch('http://icode.renren.com/getcode.do?t=web_login&rnd=Math.random()')
			# Binary mode: the payload is image data, not text.
			with open('tmp.jpg','wb') as fl:
				fl.write(image)
			return raw_input('Input the damn captcha->')

		def isBanned(self):
			'''Return True when the ShowCaptcha endpoint answers with exactly '1'.

			A dummy login form is posted. The cookie jar is not saved by this call.
			NOTE(review): presumably '1' means the site now demands a captcha for
			this client -- confirm against the service.
			'''
			postData=urllib.urlencode({
				'email':'test',
				'password':'',
				'icode':'',
				'origURL':'http://www.renren.com/home',
				'domain':'renren.com',
				'key_id':'1',
				'captcha_type':'web_login'
				})
			result=self._fetch('http://www.renren.com/ajax/ShowCaptcha',postData,saveCookie=False)
			return result=='1'

		@staticmethod
		def _parseMyFriends(page):
			'''Extract [(ID, name), ...] from the "friends=[{...}]" literal in page.'''
			match=re.search(r'friends=\[{.*}\]',page)
			if match is None:
				# No friend data in the response; report no friends rather than
				# failing on None.
				return []
			return re.findall(r'"id":(.*?),.*?,"name":"(.*?)"',match.group())

		def getMyFriendList(self):
			'''Get the friend list of the current user.

			@return [(ID1,name1),(ID2,name2),...]; an empty list when the response
				contains no friend data.'''
			return self._parseMyFriends(self._fetch('http://friend.renren.com/myfriendlistx.do'))

		def getFriendListOnPage(self,ID,page=0):
			'''Get one page of the friend list of the user with the given ID.

			ID and page may be strings or integers; both are formatted with str().
			@return [(ID1,name1),(ID2,name2)...]'''
			# str(), not repr(): repr() would add quotes to strings and 'L' to longs.
			url='http://friend.renren.com/GetFriendList.do?curpage='+str(page)+'&id='+str(ID)
			return re.findall(r'<dd><a href.*?id=(\d+)">(.*?)</a>',self._fetch(url))

		def getStatusOnPage(self,ID,page=0):
			'''Get one page of statuses of the user with the given ID.

			ID and page may be strings or integers; both are formatted with str().
			@return the 'doingArray' entry of the decoded JSON response
			Raises ValueError when the response is not JSON and KeyError when it
			has no 'doingArray' entry.'''
			url='http://status.renren.com/GetSomeomeDoingList.do?userId='+str(ID)+'&curpage='+str(page)
			return json.loads(self._fetch(url))['doingArray']

		@staticmethod
		def _parseProfile(ID,page):
			'''Build the profile dict for ID from the first <ul class="u..."> list in page.'''
			info={'ID':ID}
			block=re.search(r'<ul class="u.*?</ul>',page,re.S)
			if block is None:
				return info
			# Drop <span> wrappers so only the text of each <li> remains.
			text=re.sub('<span.*?>','',block.group())
			text=re.sub('</span>','',text)
			# Each <li class="key">value</li> becomes one entry of the result.
			for key,value in re.findall('<li class="(.*?)">(.*?)</li>',text):
				info[key]=value
			return info

		def getProfile(self,ID):
			'''Get the profile of the user with the given ID.

			@return dict with key 'ID' (the ID as passed in) plus one entry per
				<li class="..."> item found on the profile page; only {'ID': ID}
				when the page has no profile list.'''
			page=self._fetch('http://www.renren.com/'+str(ID)+'/profile')
			return self._parseProfile(ID,page)