import mechanize
import cookielib
import urlparse
import re
import time
import random
import csv
import pandas as pd
import pickle
import datetime
import os
user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
'Mozilla/4.61 [ja] (X11; I; Linux 2.6.13-33cmc1 i686)',
'Opera/9.63 (X11; Linux x86_64; U; ru) Presto/2.1.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20100121 Firefox/3.5.6 Wyzo/3.5.6.1'
]
#DOWNLOAD DIRECTORY
DIRECTORY='YOUR DIRECTORY HERE'
def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
class MiniBatchIterator(object):
    ''' mini-batch iterator '''
    def __init__(self, x, batch_size=4):
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        n_samples = len(self.x)
        for i in xrange((n_samples + self.batch_size - 1) / self.batch_size):
            yield self.x[i * self.batch_size:(i + 1) * self.batch_size]
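
# Quick sanity check of MiniBatchIterator (illustrative only, not part of the original
# gist): a list of 9 terms with the default batch_size of 4 yields batches of 4, 4 and 1.
#
#   >>> list(MiniBatchIterator(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']))
#   [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i']]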
class TrendBrowser(object):
    '''browser must have a valid gmail account, a valid gmail password, a list of user agents,
    and a valid directory'''
    def __init__(self, gmail_account, gmail_password, user_agents=user_agents, directory=DIRECTORY,
                 listofstuff=None, dictionary=None, dataframe=None):
        #time.sleep(random.randint(0,15))
        #Create the basic browser object
        os.chdir(directory)
        self.directory = directory
        if dictionary is None:
            self.dic = {}
            self.max_value = None
        else:
            self.dic = dictionary
            self.max_value = max(self.dic, key=self.dic.get)
        #if you had to stop and had an existing dataframe,
        #this will help
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()  #blank dataframe
        self.LIST = listofstuff
        self.error_log = {}
        #adds user agents
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        #Create a handler for cookies, this class can load and save cookies
        cookies = cookielib.LWPCookieJar()
        #Add it to browser
        self.browser.set_cookiejar(cookies)
        #Ignore robots.txt, so we don't miss anything while scraping
        self.browser.set_handle_robots(False)
        #Allow refresh redirections
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        #Add a user agent header to our browser
        #if you want proxies that work, this may do the trick
        #browser.set_proxies( {'http': proxies[random.randrange(0, len(proxies) )]} )
        #browser.addheaders = [('User-agent', ('Mozilla/5.0 (compatible; MSIE 9.0;','Windows NT 6.1; Trident/5.0)'))]
        self.browser.addheaders = [('User-Agent', self.user_agents[random.randrange(0, len(self.user_agents))])]
        response = self.browser.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        forms = mechanize.ParseResponse(response)
        form = forms[0]
        form['Email'] = gmail_account
        form['Passwd'] = gmail_password
        response = self.browser.open(form.click())
    #the following three functions are basic functions to simply query google
    #trends for trend information
    def get(self, website):
        '''this will get the html of any website you wish to go to'''
        source = self.browser.open(website).read()
        return source
    def trends_query(self, terms=[]):  #hard limit of 5 search items
        '''this function is exclusive to google trends'''
        if len(terms) > 5:
            return 'can only search for 5 items at a time'
        else:
            strings = ",".join(terms)
            query1 = 'https://www.google.com/trends/trendsReport?hl=en-US&q=' + strings + '&content=1&export=1'
            #this downloads the CSV export once and returns its contents
            return self.browser.open(query1).read()
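
    # Illustrative example (not part of the original gist): trends_query joins the terms
    # with commas and hits the (now legacy) trendsReport CSV export, so
    #   browser.trends_query(terms=['python', 'ruby'])
    # requests
    #   https://www.google.com/trends/trendsReport?hl=en-US&q=python,ruby&content=1&export=1
    # and returns the raw CSV text that the parsing methods below slice with repo[4:575].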
    def trend_to_pandas(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            #first remaining line is the CSV header row, the rest are weekly data rows
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            return self.df
        else:
            log('error in trend_to_pandas')
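
    # Sketch of the CSV layout assumed above (hypothetical sample, not from the original
    # gist): the export starts with a few title/metadata lines, then a header row of the
    # form 'Week,term1,term2' followed by weekly rows like
    #   2014-01-05 - 2014-01-11,55,12
    # which is why the code slices repo[4:575] and treats the first remaining line as the
    # header.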
    def START(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            for i in self.df.columns[1:]:  #skip the week column
                self.dic[i] = self.df[i].max()
            self.max_value = max(self.dic, key=self.dic.get)
            #save the dictionary after every iteration
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen
            log('error in START, look at error log')
    def UPDATER(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            updatedf = pd.DataFrame(rows, columns=repo_columns)
            updates = {}
            #convert columns to floats
            for column in updatedf.columns[1:]:
                updatedf[column] = updatedf[column].apply(lambda x: float(x))
            for i in updatedf.columns[1:]:  #skip the week column
                updates[i] = updatedf[i].max()
            minibatch_max_value = max(updates, key=updates.get)
            if minibatch_max_value != self.max_value:
                old_max = self.max_value  #for the updating after the for loop
                for k, v in self.dic.items():
                    if k == self.max_value:
                        #update dataframe
                        self.df[self.max_value] = updatedf[self.max_value]
                        #update max dictionary
                        self.dic[self.max_value] = updates[self.max_value]
                    else:
                        #update dataframe
                        factor = self.dic[k] / updates[self.max_value]
                        self.df[k] = self.df[k] * factor
                        #update max dictionary
                        new_value = factor * self.dic[k]
                        self.dic[k] = new_value
                #update the max-value
                self.max_value = minibatch_max_value
                for k, v in updates.items():
                    if k == old_max:  #already updated, don't need to do it again.
                        pass
                    else:
                        self.dic[k] = v
            else:
                #only need to update the dic items normally
                for k, v in updates.items():
                    #update max dictionary
                    self.dic[k] = v
                for x in updatedf.columns[1:]:
                    self.df[x] = updatedf[x]
            #self.df.append(updates)
            #save pickle file
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen
            log('ERROR in UPDATER, check error log')
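
    # Worked example of the rescaling above (hypothetical numbers, not from the original
    # gist): Google Trends scales each batch so its top term peaks at 100, so every
    # minibatch is queried together with the running max term to keep batches comparable.
    # Say dic is {'python': 100, 'ruby': 40} with max_value 'python', and a new batch
    # reports updates = {'python': 50, 'django': 100}. Then 'python' is overwritten with
    # 50, 'ruby' is rescaled by factor = 40 / 50 = 0.8 (column multiplied, stored value
    # becomes 32), and 'django' becomes the new max_value at 100.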
    def start_process(self):
        '''only use this method if you're starting the process,
        otherwise DON'T use it as it will mess with your list.'''
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        #process 2: create minibatches
        self.minibatches = MiniBatchIterator(self.LIST)
        #wait 15-20 seconds for the item to download
        #before starting with the next process.
        time.sleep(random.randint(15, 20))
        log('searching for ' + str(New))
        self.START(self.trends_query(terms=New))
    def create_minibatches(self):
        #only use this minibatch creator if you DID NOT use the start_process function
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater_process(self):
        for minibatch in self.minibatches:
            #sleep between 1.5 - 2 min per download, this way you don't hit their
            #search quotas
            time.sleep(random.randint(90, 120))
            #the sleep function is important, need to wait for the item to download
            time.sleep(random.randint(15, 20))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            self.UPDATER(self.trends_query(terms=minibatch + [self.max_value]))
            if abs(random.gauss(0, 1)) < 0.1:
                [self.random_wiki() for n_time in range(5, 15)]
        return self.dic
    #goes to a random page of wikipedia, then waits a random time
    #to again fool das google
    def random_wiki(self):
        self.get('http://en.wikipedia.org/wiki/Main_page')
        self.get('http://en.wikipedia.org/wiki/Special:Random')
        time.sleep(abs(random.gauss(10, 3)))

    def pd_sorter(self):
        #TODO
        #can only be used when the top item is found.
        return ''
if __name__ == "__main__":
    os.chdir(DIRECTORY)
    SEARCH_TERMS = pickle.load(open('search.p', 'rb'))
    random.shuffle(SEARCH_TERMS)
    browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD', user_agents=user_agents,
                           directory=DIRECTORY, listofstuff=SEARCH_TERMS, dictionary=None, dataframe=None)
    print('loading the sorter')
    #sorter=Sorter(list_of_terms,browser)
    print('starting the process')
    browser.start_process()
    time.sleep(5)
    print('updating the searcher, this will take some time')
    browser.updater_process()
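
# Resuming an interrupted run (illustrative sketch, not part of the original gist): the
# pickled dic.p / df.p written after each batch can be passed back in via the
# constructor's dictionary= and dataframe= arguments, then create_minibatches() is used
# instead of start_process(). 'remaining_terms' here is hypothetical, i.e. whichever
# search terms were not yet processed:
#
#   dic = pickle.load(open('dic.p', 'rb'))
#   df = pickle.load(open('df.p', 'rb'))
#   browser = TrendBrowser('EMAIL', 'PASSWORD', listofstuff=remaining_terms,
#                          dictionary=dic, dataframe=df)
#   browser.create_minibatches()
#   browser.updater_process()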