-
-
Save congkhoa/d62f9c36490c0cbc05ad to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from selenium import webdriver | |
from selenium.webdriver.support.wait import WebDriverWait | |
from selenium.webdriver.common.action_chains import ActionChains | |
import random | |
import csv | |
import pandas as pd | |
import pickle | |
import random | |
import datetime | |
import os | |
# Download directory used for the Firefox profile and the report.csv lookups.
DIRECTORY = "DIRECTORY"
def log(msg):
    """Print msg prefixed with the current timestamp."""
    print("%s %s" % (datetime.datetime.now(), msg))
def removeNonAscii(s):
    """Return s with every non-ASCII character (ord >= 128) removed."""
    return "".join(c for c in s if ord(c) < 128)
class MiniBatchIterator(object):
    """Basic mini-batch iterator.

    Iterating yields consecutive slices of ``x`` of length ``batch_size``;
    the final batch may be shorter when ``len(x)`` is not a multiple of
    the batch size.
    """

    def __init__(self, x, batch_size=4):
        # x -- any sliceable sequence of items to batch.
        # batch_size -- number of items per yielded batch.
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        # A stepped range replaces the original's Py2-only xrange and its
        # ceiling-division index arithmetic; behavior is identical and it
        # runs under both Python 2 and Python 3.
        for start in range(0, len(self.x), self.batch_size):
            yield self.x[start:start + self.batch_size]
class RealTrendsBrowswer(object):
    '''Drives Google Trends through a real Firefox browser so Google does
    not block the automated requests.

    Requires a Gmail account/password for the initial login.  If this runs
    on a headless machine (e.g. AWS) you will need X11 forwarding so the
    selenium-controlled browser can actually be displayed.
    '''

    def __init__(self, gmail_account, gmail_password, directory=DIRECTORY):
        """Log in to Google with the given credentials and land on Trends.

        gmail_account / gmail_password -- Google login credentials.
        directory -- download directory for the exported CSV reports.
        """
        self.email = gmail_account
        self.password = gmail_password
        # Configure Firefox to save text/csv downloads straight into
        # `directory` with no download-manager prompts or popups.  The
        # original set several of these preferences more than once with
        # conflicting values (e.g. closeWhenDone True then False); only the
        # last write took effect, so the effective values are kept here.
        self.profile = webdriver.FirefoxProfile()
        self.profile.set_preference('browser.download.dir', directory)
        self.profile.set_preference('browser.helperApps.neverAsk.saveToDisk', "text/csv")
        self.profile.set_preference("browser.download.manager.showWhenStarting", False)
        self.profile.set_preference("browser.download.manager.closeWhenDone", False)
        self.profile.set_preference("browser.download.manager.showAlertOnComplete", False)
        self.profile.set_preference("browser.download.alertOnEXEOpen", False)
        self.profile.set_preference("browser.download.manager.focusWhenStarting", False)
        self.profile.set_preference("browser.helperApps.alwaysAsk.force", False)
        self.profile.set_preference("browser.download.manager.alertOnEXEOpen", False)
        self.profile.set_preference("browser.download.manager.useWindow", False)
        self.profile.set_preference("services.sync.prefs.sync.browser.download.manager.showWhenStarting", False)
        self.profile.set_preference("pdfjs.disabled", True)
        self.profile.set_preference("browser.download.show_plugins_in_list", False)
        self.driver = webdriver.Firefox(self.profile)
        time.sleep(random.randint(1, 5))
        # Google sign-in flow, pausing a random few seconds between steps
        # to look less like a bot.
        self.driver.get('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        time.sleep(random.randint(1, 5))
        emailid = self.driver.find_element_by_id("Email")
        emailid.send_keys(self.email)
        time.sleep(random.randint(1, 5))
        passw = self.driver.find_element_by_id("Passwd")
        passw.send_keys(self.password)
        time.sleep(random.randint(1, 5))
        signin = self.driver.find_element_by_id("signIn")
        signin.click()
        time.sleep(random.randint(1, 5))
        self.driver.get('http://www.google.com/trends/')
        time.sleep(5)

    def _enter_term(self, color, term):
        """Click the 'add term' pill of the given color and type `term`."""
        pill = self.driver.find_element_by_css_selector('div.pill.add.add-term.' + color)
        pill.click()
        time.sleep(random.randint(1, 5))
        # u'\ue007' is selenium's Enter key (Keys.RETURN).
        hover = ActionChains(self.driver).move_to_element(pill).send_keys(term).send_keys(u'\ue007')
        hover.perform()
        time.sleep(random.randint(1, 5))

    def query_items(self, item1, item2, item3, item4, item5):
        """Open the Trends explore page and enter the five search terms,
        one per colored comparison pill."""
        self.driver.get('http://www.google.com/trends/explore#cmpt=q')
        time.sleep(5)
        # BUGFIX: the original hovered the *green* pill while typing the
        # fifth (purple) term; each term now targets its own pill.
        terms = (item1, item2, item3, item4, item5)
        colors = ('blue', 'red', 'yellow', 'green', 'purple')
        for color, term in zip(colors, terms):
            self._enter_term(color, term)

    def download(self):
        """Open the settings menu and trigger the 'Download as CSV' entry
        via keyboard navigation."""
        menu = self.driver.find_element_by_css_selector('div#settings-menu-button.app-bar-buttons')
        hover = ActionChains(self.driver).move_to_element(menu).send_keys("")
        hover.perform()
        submenu = self.driver.find_element_by_css_selector('div.goog-inline-block.goog-flat-menu-button.goog-flat-menu-button-hover')
        submenu.click()
        submenu.send_keys(u'\ue015')  # down-arrow
        submenu.send_keys(u'\ue007')  # press enter
class Sorter(object):  # list length should be divisible by 4, plus the 5 seed items.
    '''Finds the highest-searched item in a list via Google Trends so that
    every item's search index can be expressed relative to the others.
    This is ideal for estimating relative demand.'''

    def __init__(self, listofstuff, browser, dictionary=None, dataframe=None):
        """listofstuff -- search terms to rank (mutated by starter()).
        browser -- a RealTrendsBrowswer used to run queries and downloads.
        dictionary -- optional saved {term: max-index} dict to resume from.
        dataframe -- optional saved DataFrame of weekly indexes to resume from.
        """
        if dictionary is None:
            self.dic = {}
            self.max_value = None
        else:
            self.dic = dictionary
            self.max_value = max(self.dic, key=self.dic.get)
        # BUGFIX: the original tested isinstance(dictionary, pd.DataFrame),
        # so any supplied `dataframe` was silently discarded.
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()  # blank dataframe
        self.LIST = listofstuff
        self.browser = browser
        self.error_log = {}

    def open_file(self, path_to_file, attempts=0, timeout=12, sleep_int=10):
        """Try to open path_to_file, retrying up to `timeout` times with a
        `sleep_int`-second pause so an in-progress download can finish.

        Returns an open file object, or None when the file does not exist
        or never becomes readable.
        """
        if attempts < timeout and os.path.exists(path_to_file) and os.path.isfile(path_to_file):
            try:
                # BUGFIX: mode was 'rU', which was removed in Python 3.11;
                # plain 'r' behaves the same for this CSV use.
                return open(path_to_file, 'r')
            except (IOError, OSError):
                # File is probably still being written; back off and retry.
                # BUGFIX: the original called the bare names sleep() and
                # open_file() (NameErrors) and dropped the recursive result.
                time.sleep(sleep_int)
                return self.open_file(path_to_file, attempts + 1, timeout, sleep_int)
        return None

    def STARTwaiter(self, filen):
        """Parse the first downloaded report.csv and seed self.df/self.dic.

        filen -- path of the expected report; recorded in error_log when
        the downloaded report turns out to be empty.
        """
        # NOTE(review): DIRECTORY+'report.csv' assumes DIRECTORY ends with
        # a path separator -- confirm against the configured directory.
        ff = self.open_file(DIRECTORY + 'report.csv')
        if ff:
            log('got to start waiter')
            log('file opened during start waiter')
            file1 = csv.reader(ff, delimiter=',')
            repo = []
            for line in file1:
                repo.append(line)
            ff.close()
            if repo != []:
                # Rows 4..569 hold the weekly-index table (header + data).
                repo = repo[4:570]
                repo_columns = [x for x in repo[0]]
                self.df = pd.DataFrame(repo[1:], columns=repo_columns)
                # Convert index columns to floats (skip the week column).
                for column in self.df.columns[1:]:
                    self.df[column] = self.df[column].apply(lambda x: float(x))
                for i in self.df.columns[1:]:  # skip the week column
                    self.dic[i] = self.df[i].max()
                log('removing report.csv during startwaiter')
                os.remove(DIRECTORY + 'report.csv')
                self.max_value = max(self.dic, key=self.dic.get)
                # Persist progress after every iteration.
                pickle.dump(self.dic, open('dic.p', 'wb'))
                pickle.dump(self.df, open('df.p', 'wb'))
            else:
                # BUGFIX: the original logged the undefined name `minibatch`.
                self.error_log[len(self.error_log)] = filen
                log('removing report.csv during else of startwaiter')
                os.remove(DIRECTORY + 'report.csv')

    def UPDATERwaiter(self, filen):
        """Parse a subsequent report.csv and fold it into self.df/self.dic,
        rescaling existing entries when the mini-batch revealed a new
        global maximum.  filen is recorded in error_log on empty reports."""
        ff = self.open_file(DIRECTORY + 'report.csv')
        if ff:
            log('got inside updatewaiter')
            log('opened file')
            file1 = csv.reader(ff, delimiter=',')
            repo = []
            for line in file1:
                repo.append(line)
            ff.close()
            if repo != []:
                repo = repo[4:570]
                repo_columns = [x for x in repo[0]]
                updatedf = pd.DataFrame(repo[1:], columns=repo_columns)
                updates = {}
                # Convert index columns to floats (skip the week column).
                for column in updatedf.columns[1:]:
                    updatedf[column] = updatedf[column].apply(lambda x: float(x))
                for i in updatedf.columns[1:]:  # skip the week column
                    updates[i] = updatedf[i].max()
                log('removing report.csv during Update waiter')
                os.remove(DIRECTORY + 'report.csv')
                minibatch_max_value = max(updates, key=updates.get)
                if minibatch_max_value != self.max_value:
                    # A new global maximum: rescale everything that was
                    # relative to the old maximum.
                    old_max = self.max_value  # for the updating after the for loop
                    for k, v in self.dic.items():
                        if k == self.max_value:
                            # Replace the old max column/value directly.
                            self.df[self.max_value] = updatedf[self.max_value]
                            self.dic[self.max_value] = updates[self.max_value]
                        else:
                            factor = self.dic[k] / updates[self.max_value]
                            # BUGFIX: the original computed self.df[k]*factor
                            # and threw the result away; assign the rescale.
                            self.df[k] = self.df[k] * factor
                            # NOTE(review): scaling dic[k] by a factor derived
                            # from dic[k] itself looks suspect, but it matches
                            # the original's arithmetic -- verify the math.
                            self.dic[k] = factor * self.dic[k]
                    # Record the new global maximum.
                    self.max_value = minibatch_max_value
                    for k, v in updates.items():
                        if k == old_max:
                            pass  # already updated above, skip it
                        else:
                            self.dic[k] = v
                else:
                    # Maximum unchanged: just merge the new values in.
                    for k, v in updates.items():
                        self.dic[k] = v
                    for x in updatedf.columns[1:]:
                        self.df[x] = updatedf[x]
                # Persist progress after every iteration.
                pickle.dump(self.dic, open('dic.p', 'wb'))
                pickle.dump(self.df, open('df.p', 'wb'))
            else:
                # BUGFIX: the original logged the undefined name `minibatch`.
                self.error_log[len(self.error_log)] = filen
                log('removing report.csv in else of updater')
                os.remove(DIRECTORY + 'report.csv')

    def starter(self):
        '''Seed the process by querying 5 random terms; only use this when
        starting the process from scratch, otherwise DON'T use it.'''
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        # process 2: create minibatches from the remaining terms
        self.minibatches = MiniBatchIterator(self.LIST)
        log('searching for ' + str(New))
        # process 3: start query, download/analyze start point.  Google
        # sometimes misses the css selector, so try up to three times
        # before giving up (matches the original nested-try behavior).
        for _ in range(3):
            try:
                self.browser.query_items(New[0], New[1], New[2], New[3], New[4])
                self.browser.download()
                break
            except Exception:
                continue
        # Wait for the CSV to download before starting the next process.
        time.sleep(random.randint(15, 20))
        log('starting download finder thing')
        self.STARTwaiter(DIRECTORY + 'report.csv')

    def create_minibatches(self):
        # Only use this minibatch creator if you DID NOT use starter().
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater(self):
        """Run every minibatch against the current maximum term, folding
        each result in via UPDATERwaiter.  Returns the final
        {term: max-index} dictionary."""
        for minibatch in self.minibatches:
            # sleep between 1.5 - 2 min per download to avoid rate limits
            time.sleep(random.randint(90, 120))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            # Google sometimes misses the css selector, so this hacky loop
            # retries with the original's 10s then 5s pauses before giving up.
            # NOTE(review): assumes every minibatch has 4 items -- the last
            # batch is shorter unless len(LIST) is divisible by 4.
            for pause in (0, 10, 5):
                try:
                    time.sleep(pause)
                    self.browser.query_items(self.max_value, minibatch[0],
                                             minibatch[1], minibatch[2],
                                             minibatch[3])
                    self.browser.download()
                    break
                except Exception:
                    continue
            # Important: wait for the item to finish downloading.
            time.sleep(random.randint(15, 20))
            self.UPDATERwaiter(DIRECTORY + 'report.csv')
            # Occasionally wander around Wikipedia to look human.
            if abs(random.gauss(0, 1)) < 0.1:
                [self.random_wiki() for n_time in range(5, 15)]
        return self.dic

    def random_wiki(self):
        # Goes to a random page of Wikipedia, then waits a random time.
        self.browser.driver.get('http://en.wikipedia.org/wiki/Main_page')
        self.browser.driver.find_elements_by_id('n-randompage')[0].click()
        time.sleep(abs(random.gauss(10, 3)))

    def bubble_sorter(self):
        # TODO: can only be used when the top item is found.
        return ''
if __name__ == "__main__":
    # Replace with real credentials before running.
    browser = RealTrendsBrowswer('EMAIL', 'PASSWORD')
    os.chdir(DIRECTORY)
    # BUGFIX: pickle files must be opened in binary mode under Python 3;
    # binary mode is also correct (and recommended) under Python 2.
    # (The redundant inner `import pickle` was dropped -- pickle is already
    # imported at the top of the file.)
    skills = pickle.load(open('search.p', 'rb'))
    random.shuffle(skills)
    # Parenthesized print works as a statement under Py2 and a call under Py3.
    print('loading the sorter')
    sorter = Sorter(skills, browser)
    print('starting the process')
    sorter.starter()
    time.sleep(5)
    print('updating the searcher, this will take some time')
    sorter.updater()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment