import mechanize
import cookielib
import urlparse
import re
import time
import random
import csv
import pandas as pd
import pickle
import datetime
import os
user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
'Mozilla/4.61 [ja] (X11; I; Linux 2.6.13-33cmc1 i686)',
'Opera/9.63 (X11; Linux x86_64; U; ru) Presto/2.1.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20100121 Firefox/3.5.6 Wyzo/3.5.6.1'
]
#DOWNLOAD DIRECTORY
DIRECTORY='YOUR DIRECTORY HERE'
def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
class MiniBatchIterator(object):
    ''' mini-batch iterator '''
    def __init__(self, x, batch_size=4):
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        n_samples = len(self.x)
        for i in xrange((n_samples + self.batch_size - 1) / self.batch_size):
            yield self.x[i * self.batch_size:(i + 1) * self.batch_size]
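
# Quick sanity check of MiniBatchIterator (illustrative only, not part of the original
# gist): a list of 9 terms with the default batch_size of 4 yields batches of 4, 4 and 1.
#
#   >>> list(MiniBatchIterator(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']))
#   [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i']]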
class TrendBrowser(object):
    '''browser must have a valid gmail account, a valid gmail password, a list of user agents,
    and a valid directory'''
    def __init__(self, gmail_account, gmail_password, user_agents=user_agents, directory=DIRECTORY,
                 listofstuff=None, dictionary=None, dataframe=None):
        #time.sleep(random.randint(0,15))
        #Create the basic browser object
        os.chdir(directory)
        self.directory = directory
        if dictionary is None:
            self.dic = {}
            self.max_value = None
        else:
            self.dic = dictionary
            self.max_value = max(self.dic, key=self.dic.get)
        #if you had to stop and had an existing dataframe,
        #this will help
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()  #blank dataframe
        self.LIST = listofstuff
        self.error_log = {}
        #adds user agents
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        #Create a handler for cookies, this class can load and save cookies
        cookies = cookielib.LWPCookieJar()
        #Add it to browser
        self.browser.set_cookiejar(cookies)
        #Ignore robots.txt, so we don't miss anything while scraping
        self.browser.set_handle_robots(False)
        #Allow refresh redirections
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        #Add a user agent header to our browser
        #if you want proxies that work, this may do the trick
        #browser.set_proxies( {'http': proxies[random.randrange(0, len(proxies) )]} )
        #browser.addheaders = [('User-agent', ('Mozilla/5.0 (compatible; MSIE 9.0;','Windows NT 6.1; Trident/5.0)'))]
        self.browser.addheaders = [('User-Agent', self.user_agents[random.randrange(0, len(self.user_agents))])]
        response = self.browser.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        forms = mechanize.ParseResponse(response)
        form = forms[0]
        form['Email'] = gmail_account
        form['Passwd'] = gmail_password
        response = self.browser.open(form.click())
    #the following three functions are basic functions to simply query google
    #trends for trend information
    def get(self, website):
        '''this will get the html of any website you wish to go to'''
        source = self.browser.open(website).read()
        return source
    def trends_query(self, terms=[]):  #hard limit of 5 search items
        '''this function is exclusive to google trends'''
        if len(terms) > 5:
            return 'can only search for 5 items at a time'
        else:
            strings = ",".join(terms)
            query1 = 'https://www.google.com/trends/trendsReport?hl=en-US&q=' + strings + '&content=1&export=1'
            #this downloads the CSV export once and returns its contents
            return self.browser.open(query1).read()
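
    # Illustrative example (not part of the original gist): trends_query joins the terms
    # with commas and hits the (now legacy) trendsReport CSV export, so
    #   browser.trends_query(terms=['python', 'ruby'])
    # requests
    #   https://www.google.com/trends/trendsReport?hl=en-US&q=python,ruby&content=1&export=1
    # and returns the raw CSV text that the parsing methods below slice with repo[4:575].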
    def trend_to_pandas(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            #first remaining line is the CSV header row, the rest are weekly data rows
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            return self.df
        else:
            log('error in trend_to_pandas')
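
    # Sketch of the CSV layout assumed above (hypothetical sample, not from the original
    # gist): the export starts with a few title/metadata lines, then a header row of the
    # form 'Week,term1,term2' followed by weekly rows like
    #   2014-01-05 - 2014-01-11,55,12
    # which is why the code slices repo[4:575] and treats the first remaining line as the
    # header.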
    def START(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            for i in self.df.columns[1:]:  #skip the week column
                self.dic[i] = self.df[i].max()
            self.max_value = max(self.dic, key=self.dic.get)
            #save the dictionary after every iteration
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen
            log('error in START, look at error log')
    def UPDATER(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [line.split(',') for line in repo[1:]]
            updatedf = pd.DataFrame(rows, columns=repo_columns)
            updates = {}
            #convert columns to floats
            for column in updatedf.columns[1:]:
                updatedf[column] = updatedf[column].apply(lambda x: float(x))
            for i in updatedf.columns[1:]:  #skip the week column
                updates[i] = updatedf[i].max()
            minibatch_max_value = max(updates, key=updates.get)
            if minibatch_max_value != self.max_value:
                old_max = self.max_value  #for the updating after the for loop
                for k, v in self.dic.items():
                    if k == self.max_value:
                        #update dataframe
                        self.df[self.max_value] = updatedf[self.max_value]
                        #update max dictionary
                        self.dic[self.max_value] = updates[self.max_value]
                    else:
                        #update dataframe
                        factor = self.dic[k] / updates[self.max_value]
                        self.df[k] = self.df[k] * factor
                        #update max dictionary
                        new_value = factor * self.dic[k]
                        self.dic[k] = new_value
                #update the max-value
                self.max_value = minibatch_max_value
                for k, v in updates.items():
                    if k == old_max:  #already updated, don't need to do it again.
                        pass
                    else:
                        self.dic[k] = v
            else:
                #only need to update the dic items normally
                for k, v in updates.items():
                    #update max dictionary
                    self.dic[k] = v
                for x in updatedf.columns[1:]:
                    self.df[x] = updatedf[x]
            #self.df.append(updates)
            #save pickle file
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen
            log('ERROR in UPDATER, check error log')
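
    # Worked example of the rescaling above (hypothetical numbers, not from the original
    # gist): Google Trends scales each batch so its top term peaks at 100, so every
    # minibatch is queried together with the running max term to keep batches comparable.
    # Say dic is {'python': 100, 'ruby': 40} with max_value 'python', and a new batch
    # reports updates = {'python': 50, 'django': 100}. Then 'python' is overwritten with
    # 50, 'ruby' is rescaled by factor = 40 / 50 = 0.8 (column multiplied, stored value
    # becomes 32), and 'django' becomes the new max_value at 100.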
    def start_process(self):
        '''only use this method if you're starting the process,
        otherwise DON'T use it as it will mess with your list.'''
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        #process 2: create minibatches
        self.minibatches = MiniBatchIterator(self.LIST)
        #wait 15-20 seconds for the item to download
        #before starting with the next process.
        time.sleep(random.randint(15, 20))
        log('searching for ' + str(New))
        self.START(self.trends_query(terms=New))
    def create_minibatches(self):
        #only use this minibatch creator if you DID NOT use the start_process function
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater_process(self):
        for minibatch in self.minibatches:
            #sleep between 1.5 - 2 min per download, this way you don't hit their
            #search quotas
            time.sleep(random.randint(90, 120))
            #the sleep function is important, need to wait for the item to download
            time.sleep(random.randint(15, 20))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            self.UPDATER(self.trends_query(terms=minibatch + [self.max_value]))
            if abs(random.gauss(0, 1)) < 0.1:
                [self.random_wiki() for n_time in range(5, 15)]
        return self.dic
    #goes to a random page of wikipedia, then waits a random time
    #to again fool das google
    def random_wiki(self):
        self.get('http://en.wikipedia.org/wiki/Main_page')
        self.get('http://en.wikipedia.org/wiki/Special:Random')
        time.sleep(abs(random.gauss(10, 3)))

    def pd_sorter(self):
        #TODO
        #can only be used when the top item is found.
        return ''
if __name__ == "__main__":
    os.chdir(DIRECTORY)
    SEARCH_TERMS = pickle.load(open('search.p', 'rb'))
    random.shuffle(SEARCH_TERMS)
    browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD', user_agents=user_agents,
                           directory=DIRECTORY, listofstuff=SEARCH_TERMS, dictionary=None, dataframe=None)
    print('loading the sorter')
    #sorter=Sorter(list_of_terms,browser)
    print('starting the process')
    browser.start_process()
    time.sleep(5)
    print('updating the searcher, this will take some time')
    browser.updater_process()
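
# Resuming an interrupted run (illustrative sketch, not part of the original gist): the
# pickled dic.p / df.p written after each batch can be passed back in via the
# constructor's dictionary= and dataframe= arguments, then create_minibatches() is used
# instead of start_process(). 'remaining_terms' here is hypothetical, i.e. whichever
# search terms were not yet processed:
#
#   dic = pickle.load(open('dic.p', 'rb'))
#   df = pickle.load(open('df.p', 'rb'))
#   browser = TrendBrowser('EMAIL', 'PASSWORD', listofstuff=remaining_terms,
#                          dictionary=dic, dataframe=df)
#   browser.create_minibatches()
#   browser.updater_process()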