Skip to content

Instantly share code, notes, and snippets.

@KayneWest
Last active August 29, 2015 14:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save KayneWest/bac279438e67cf2e8fdf to your computer and use it in GitHub Desktop.
Save KayneWest/bac279438e67cf2e8fdf to your computer and use it in GitHub Desktop.
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import random
import csv
import pandas as pd
import pickle
import random
import datetime
import os
# Download directory: used as the default ``browser.download.dir`` Firefox
# preference, as the location of the downloaded ``report.csv``, and as the
# working directory the script switches to when run directly.
# The value below is a placeholder; replace it with an absolute path ending in
# a path separator so that paths built from it resolve to the same files
# before and after the working directory changes.
DIRECTORY="DIRECTORY"
def log(msg):
    """Print ``msg`` to stdout, prefixed with the current local date and time."""
    stamp = str(datetime.datetime.now())
    print("{} {}".format(stamp, msg))
def removeNonAscii(s):
    """Return ``s`` without the characters whose code point is 128 or higher."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_chars)
class MiniBatchIterator(object):
    """Iterate over a sliceable sequence in consecutive fixed-size batches.

    Each batch is the slice ``x[i * batch_size:(i + 1) * batch_size]``, so the
    final batch is shorter when ``len(x)`` is not a multiple of
    ``batch_size``.  A new pass starts every time the object is iterated.

    Args:
        x: sequence supporting ``len()`` and slicing.
        batch_size: number of items per batch (default 4).
    """

    def __init__(self, x, batch_size=4):
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        n_samples = len(self.x)
        # Ceiling division.  ``//`` keeps the result an integer on Python 3
        # (plain ``/`` would produce a float there) and gives the same value
        # as ``/`` did on Python 2; ``range`` exists on both versions.
        n_batches = (n_samples + self.batch_size - 1) // self.batch_size
        for i in range(n_batches):
            yield self.x[i * self.batch_size:(i + 1) * self.batch_size]
class RealTrendsBrowswer(object):
    """Drive a real Firefox window through Google Trends.

    A real browser is used so that Google is less likely to block the
    requests.  Firefox needs a display: on a remote machine (for example an
    AWS instance) that means something like X11 forwarding.

    NOTE(review): the ``find_element_by_*`` helpers and the positional
    profile argument to ``webdriver.Firefox`` belong to the older Selenium
    API this script was written against - confirm the installed version
    still provides them.
    """

    # Selenium key codes, sent as raw characters.
    _ENTER = u'\ue007'
    _ARROW_DOWN = u'\ue015'

    # Colours of the five "add term" pills, in the order terms are entered.
    _PILL_COLOURS = ('blue', 'red', 'yellow', 'green', 'purple')

    # Download-related Firefox preferences.  The original code set several of
    # these twice; each one appears here once with the value that was
    # effectively applied (the last one written).
    _DOWNLOAD_PREFERENCES = (
        # Firefox only honours ``browser.download.dir`` when folderList is 2
        # ("use the custom directory"); without it downloads go to the
        # default folder.
        ('browser.download.folderList', 2),
        ('browser.helperApps.neverAsk.saveToDisk', "text/csv"),
        ('browser.helperApps.alwaysAsk.force', False),
        ('browser.download.alertOnEXEOpen', False),
        ('browser.download.manager.alertOnEXEOpen', False),
        ('browser.download.manager.closeWhenDone', False),
        ('browser.download.manager.focusWhenStarting', False),
        ('browser.download.manager.showAlertOnComplete', False),
        ('browser.download.manager.showWhenStarting', False),
        ('browser.download.manager.useWindow', False),
        ('services.sync.prefs.sync.browser.download.manager.showWhenStarting', False),
        ('browser.download.show_plugins_in_list', False),
        ('pdfjs.disabled', True),
    )

    def __init__(self, gmail_account, gmail_password, directory=DIRECTORY):
        """Start Firefox, sign in to Google and open the Trends page.

        Args:
            gmail_account: Google account e-mail typed into the login form.
            gmail_password: password typed into the login form.
            directory: folder Firefox should save downloads into.

        Random 1-5 second pauses separate the steps.  Any Selenium error
        (element not found, browser failing to start, ...) propagates.
        """
        self.email = gmail_account
        self.password = gmail_password
        self.profile = webdriver.FirefoxProfile()
        self.profile.set_preference('browser.download.dir', directory)
        for name, value in self._DOWNLOAD_PREFERENCES:
            self.profile.set_preference(name, value)
        self.driver = webdriver.Firefox(self.profile)
        self._pause()
        self.driver.get('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        self._pause()
        self.driver.find_element_by_id("Email").send_keys(self.email)
        self._pause()
        self.driver.find_element_by_id("Passwd").send_keys(self.password)
        self._pause()
        self.driver.find_element_by_id("signIn").click()
        self._pause()
        self.driver.get('http://www.google.com/trends/')
        time.sleep(5)

    @staticmethod
    def _pause():
        """Sleep for a random whole number of seconds between 1 and 5."""
        time.sleep(random.randint(1, 5))

    def _add_term(self, colour, term):
        """Click the "add term" pill of ``colour``, type ``term``, press Enter."""
        pill = self.driver.find_element_by_css_selector(
            'div.pill.add.add-term.' + colour)
        pill.click()
        self._pause()
        typing = ActionChains(self.driver).move_to_element(pill)
        typing.send_keys(term).send_keys(self._ENTER).perform()
        self._pause()

    def query_items(self, item1, item2, item3, item4, item5):
        """Open the Trends explore page and enter five search terms.

        The terms go into the blue, red, yellow, green and purple pills, in
        that order.  Selenium errors propagate to the caller.
        """
        self.driver.get('http://www.google.com/trends/explore#cmpt=q')
        time.sleep(5)
        terms = (item1, item2, item3, item4, item5)
        for colour, term in zip(self._PILL_COLOURS, terms):
            # Each term is typed while hovering its own pill; the original
            # hovered the green pill again when entering the fifth term.
            self._add_term(colour, term)

    def download(self):
        """Trigger a download through the page's settings menu.

        Hovers the settings button, clicks the menu button that appears and
        sends down-arrow followed by Enter to it.  Presumably this selects
        the CSV download entry - confirm against the live page.
        """
        menu = self.driver.find_element_by_css_selector(
            'div#settings-menu-button.app-bar-buttons')
        ActionChains(self.driver).move_to_element(menu).send_keys("").perform()
        submenu = self.driver.find_element_by_css_selector(
            'div.goog-inline-block.goog-flat-menu-button.goog-flat-menu-button-hover')
        submenu.click()
        submenu.send_keys(self._ARROW_DOWN)
        submenu.send_keys(self._ENTER)
class Sorter(object):
    """Put a list of search terms on one common Google Trends scale.

    ``starter`` queries five random terms and remembers the one with the
    highest peak (``max_value``).  ``updater`` then queries that term together
    with four new terms at a time, so every batch shares a reference term and
    all peaks can be expressed relative to each other.

    Because ``starter`` consumes five terms and ``updater`` needs four per
    query, the list should hold ``4 * k + 5`` terms; a shorter final batch is
    skipped and recorded in ``error_log``.

    Attributes:
        dic: term -> peak value on the current common scale.
        df: dataframe holding one column per term.
        max_value: the term (a key of ``dic``) with the highest peak, or None.
        LIST: terms still waiting to be queried.
        browser: object providing ``query_items``, ``download`` and ``driver``.
        error_log: running number -> batch whose report could not be used.
    """

    # Slice of the downloaded report that is parsed: the row at index 4 is
    # used as the header and the rows after it as data.  Presumably this is
    # the "interest over time" table - confirm against a current report.
    _HEADER_ROW = 4
    _END_ROW = 570

    def __init__(self, listofstuff, browser, dictionary=None, dataframe=None):
        """Store the term list and browser, optionally resuming earlier state.

        Args:
            listofstuff: list of search terms; ``starter`` removes items from it.
            browser: browser wrapper used for querying and downloading.
            dictionary: previously saved term -> peak mapping, if any.
            dataframe: previously saved dataframe, if any.
        """
        self.dic = {} if dictionary is None else dictionary
        # An empty mapping has no top term yet.
        self.max_value = max(self.dic, key=self.dic.get) if self.dic else None
        # The original tested ``dictionary`` here, so a supplied dataframe
        # was always discarded.
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()
        self.LIST = listofstuff
        self.browser = browser
        self.error_log = {}
        # Batch being processed; stored in error_log when its report fails.
        self._current_batch = None

    def open_file(self, path_to_file, attempts=0, timeout=12, sleep_int=10):
        """Open ``path_to_file`` for reading, retrying while opening fails.

        Args:
            path_to_file: path of the file to open.
            attempts: number of attempts already made.
            timeout: maximum number of attempts.
            sleep_int: seconds to wait after a failed attempt.

        Returns:
            The open file object (the caller closes it), or None when the
            path is not an existing file or every attempt failed.
        """
        while attempts < timeout and os.path.isfile(path_to_file):
            try:
                # Plain 'r': Python 3 text mode already applies universal
                # newlines, and the 'U' flag was removed in Python 3.11.
                return open(path_to_file, 'r')
            except (IOError, OSError):
                time.sleep(sleep_int)
                attempts += 1
        return None

    def _read_report(self, path):
        """Parse the report at ``path`` into a dataframe, or return None.

        The first column is kept as text; every other column is converted to
        float (a non-numeric cell raises ValueError).  None is returned when
        the file cannot be opened or holds no header plus data row.
        """
        handle = self.open_file(path)
        if handle is None:
            return None
        with handle:
            rows = list(csv.reader(handle, delimiter=','))
        table = []
        for row in rows[self._HEADER_ROW:self._END_ROW]:
            if not row:
                # A blank line ends the table; rows after it would not match
                # the header.
                break
            table.append(row)
        if len(table) < 2:
            return None
        frame = pd.DataFrame(table[1:], columns=table[0])
        for column in frame.columns[1:]:
            frame[column] = frame[column].astype(float)
        return frame

    def _record_failure(self, path, message):
        """Store the current batch in ``error_log`` and drop a leftover report."""
        self.error_log[len(self.error_log)] = self._current_batch
        if os.path.isfile(path):
            log(message)
            os.remove(path)

    def _save_state(self):
        """Pickle ``dic`` and ``df`` to ``dic.p`` / ``df.p`` in the working directory."""
        with open('dic.p', 'wb') as out:
            pickle.dump(self.dic, out)
        with open('df.p', 'wb') as out:
            pickle.dump(self.df, out)

    def _merge_batch(self, batch_df, batch_peaks):
        """Fold one batch into ``dic`` / ``df`` on a common scale.

        The batch contains the current reference term (``max_value``).  The
        ratio between its peak in the batch and its stored peak converts
        stored values to the batch's scale, so every term that is not part of
        the batch is multiplied by that ratio before the batch values are
        written.  The batch's top term becomes the new ``max_value``.

        Returns:
            True when merged; False when the batch is empty or lacks the
            reference term (nothing is changed in that case).
        """
        if not batch_peaks:
            return False
        anchor = self.max_value
        if self.dic and anchor not in batch_peaks:
            return False
        old_peak = self.dic.get(anchor)
        if old_peak:  # skipped on the first batch and for an all-zero scale
            scale = batch_peaks[anchor] / old_peak
            if scale != 1:
                for term in list(self.dic):
                    if term in batch_peaks:
                        continue  # overwritten with batch values below
                    self.dic[term] = self.dic[term] * scale
                    if term in self.df.columns:
                        self.df[term] = self.df[term] * scale
        self.dic.update(batch_peaks)
        for term in batch_df.columns[1:]:
            self.df[term] = batch_df[term]
        self.max_value = max(batch_peaks, key=batch_peaks.get)
        return True

    def STARTwaiter(self, filen):
        """Load the first report at ``filen`` as the baseline.

        Replaces ``df`` with the report, stores each term's peak in ``dic``,
        sets ``max_value``, deletes the report and pickles the state.  When
        the report is missing or unusable the current batch is recorded in
        ``error_log`` instead.
        """
        frame = self._read_report(filen)
        if frame is None:
            self._record_failure(
                filen, 'removing report.csv during else of startwaiter')
            return
        log('got to start waiter')
        log('file opened during start waiter')
        self.df = frame
        for term in frame.columns[1:]:  # skip the week column
            self.dic[term] = frame[term].max()
        log('removing report.csv during startwaiter')
        os.remove(filen)
        if self.dic:
            self.max_value = max(self.dic, key=self.dic.get)
        # save the state after every iteration
        self._save_state()

    def UPDATERwaiter(self, filen):
        """Merge the follow-up report at ``filen`` into the stored state.

        Deletes the report, rescales and merges the values (see
        ``_merge_batch``) and pickles the state.  When the report is missing,
        unusable or lacks the reference term, the current batch is recorded
        in ``error_log`` instead.
        """
        batch_df = self._read_report(filen)
        if batch_df is None:
            self._record_failure(
                filen, 'removing report.csv in else of updater')
            return
        log('got inside updatewaiter')
        log('opened file')
        batch_peaks = {}
        for term in batch_df.columns[1:]:  # skip the week column
            batch_peaks[term] = batch_df[term].max()
        log('removing report.csv during Update waiter')
        os.remove(filen)
        if not self._merge_batch(batch_df, batch_peaks):
            log('report does not contain ' + str(self.max_value) + ', batch skipped')
            self.error_log[len(self.error_log)] = self._current_batch
            return
        self._save_state()

    def _query_and_download(self, terms, retry_delays):
        """Query ``terms`` and start the download, retrying on failure.

        The page sometimes fails to expose the expected elements, so the
        whole sequence is retried once per entry of ``retry_delays`` (seconds
        to wait before that retry).  Failures are logged, not raised.

        Returns:
            True when an attempt succeeded, False otherwise.
        """
        delays = (0,) + tuple(retry_delays)
        for attempt, delay in enumerate(delays, 1):
            if delay:
                time.sleep(delay)
            try:
                self.browser.query_items(*terms)
                self.browser.download()
                return True
            except Exception as exc:
                log('query attempt {} of {} failed: {!r}'.format(
                    attempt, len(delays), exc))
        return False

    def starter(self):
        """Run the first query; use only when starting from scratch.

        Removes five random terms from ``LIST``, builds the minibatches from
        the remainder, queries the five terms and loads the resulting report
        with ``STARTwaiter``.

        Raises:
            ValueError: if ``LIST`` holds fewer than five terms.
        """
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        self.minibatches = MiniBatchIterator(self.LIST)
        self._current_batch = New
        log('searching for ' + str(New))
        self._query_and_download(New, retry_delays=(0, 0))
        # give the download time to finish before reading it
        time.sleep(random.randint(15, 20))
        log('starting download finder thing')
        self.STARTwaiter(os.path.join(DIRECTORY, 'report.csv'))

    def create_minibatches(self):
        """Build the minibatches from ``LIST``; use when ``starter`` was not run."""
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater(self):
        """Query every minibatch against the current top term.

        Returns:
            ``dic``, the term -> peak mapping, after all batches were tried.
        """
        report = os.path.join(DIRECTORY, 'report.csv')
        for minibatch in self.minibatches:
            self._current_batch = minibatch
            if len(minibatch) < 4:
                # query_items needs exactly five terms; previously this case
                # raised IndexError, which was silently swallowed.
                log('skipping short batch ' + str(minibatch))
                self.error_log[len(self.error_log)] = minibatch
                continue
            # sleep between 1.5 - 2 min per download
            time.sleep(random.randint(90, 120))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            terms = [self.max_value] + list(minibatch[:4])
            self._query_and_download(terms, retry_delays=(10, 5))
            # give the download time to finish before reading it
            time.sleep(random.randint(15, 20))
            self.UPDATERwaiter(report)
            # now and then wander through ten random Wikipedia pages
            if abs(random.gauss(0, 1)) < 0.1:
                for _ in range(5, 15):
                    self.random_wiki()
        return self.dic

    def random_wiki(self):
        """Open Wikipedia, click the random-page link, then wait a random time."""
        self.browser.driver.get('http://en.wikipedia.org/wiki/Main_page')
        self.browser.driver.find_elements_by_id('n-randompage')[0].click()
        time.sleep(abs(random.gauss(10, 3)))

    def bubble_sorter(self):
        """Placeholder (TODO); meant for use once the top item is found."""
        return ''
if __name__ == "__main__":
    # 'EMAIL' / 'PASSWORD' are placeholders for real Google credentials.
    browser = RealTrendsBrowswer('EMAIL', 'PASSWORD')
    os.chdir(DIRECTORY)
    # NOTE: unpickling can execute arbitrary code - only load a 'search.p'
    # you created yourself.  Binary mode is required by pickle on Python 3
    # and the ``with`` block closes the file again.
    with open('search.p', 'rb') as handle:
        skills = pickle.load(handle)
    random.shuffle(skills)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('loading the sorter')
    sorter = Sorter(skills, browser)
    print('starting the process')
    sorter.starter()
    time.sleep(5)
    print('updating the searcher, this will take some time')
    sorter.updater()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment