import time
from bs4 import BeautifulSoup
from selenium import webdriver
import threading, multiprocessing
def create_driver():
    """Build and return a Chrome WebDriver configured to run headless."""
    options = webdriver.ChromeOptions()
    # Headless mode: the browser runs without showing a window.
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    return driver
def try_click_random_link(driver):
    """Attempt to click one link on the page currently loaded in ``driver``.

    The link is picked by position (one third of the way through the matched
    anchors) rather than truly at random, so that threads/processes visiting
    the same page choose the same element.  The click is best-effort: when
    the page has no matching link, or the browser rejects the click, the
    function simply returns.

    Args:
        driver: A Selenium WebDriver with a page already loaded.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import WebDriverException

    # ``a:link`` is a CSS selector, not a tag name, so use the CSS strategy.
    # ``find_elements(by, value)`` exists in both Selenium 3 and 4, unlike the
    # ``find_elements_by_*`` helpers that Selenium 4 removed.
    anchors = driver.find_elements("css selector", "a:link")
    if not anchors:
        return  # nothing to click on this page

    target = anchors[len(anchors) // 3]  # deterministic pick for threads/processes
    try:
        target.click()
    except WebDriverException:
        # Hidden, stale or obscured links cannot be clicked; this is only a
        # "try", so give up quietly instead of aborting the walk.
        pass
def get_title(url, webdriver=None):
    """Load ``url`` and return the text of its HTML ``<title>``.

    The page is fetched with Selenium and its source parsed with
    BeautifulSoup.  The title is also printed.

    Args:
        url: Address of the page to load.
        webdriver: An existing WebDriver to reuse.  When ``None``, a new
            headless Chrome driver is created for this call and ``quit()``
            afterwards; a driver supplied by the caller is left open.
            (The parameter shadows the ``selenium.webdriver`` module inside
            this function, so the module is not referenced here.)

    Returns:
        The stripped title text, or ``None`` when the page has no
        ``<title>`` element.
    """
    def print_title(driver):
        driver.get(url)
        # NOTE: try_click_random_link(driver) can be called here a few times
        # to click-walk through pages via links found on each one.
        soup = BeautifulSoup(driver.page_source, "lxml")
        item = soup.find('title')
        title = item.get_text(strip=True) if item is not None else None
        print(title)
        return title

    if webdriver is not None:
        # Caller owns the driver: use it and leave it running.
        return print_title(webdriver)

    # No driver supplied: create one and make sure it is always shut down,
    # even if loading or parsing the page raises.
    owned_driver = create_driver()
    try:
        return print_title(owned_driver)
    finally:
        owned_driver.quit()
# URLs iterated by the benchmark functions; one entry per simulated click.
links = [
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
]
def main_sequentially():
    """Fetch every entry of ``links`` one after another with a single driver.

    Returns:
        Elapsed wall-clock time in seconds (float), including driver start-up
        and shutdown.
    """
    # perf_counter is monotonic, so the interval is immune to clock changes.
    start_time = time.perf_counter()
    driver = create_driver()
    try:
        for link in links:  # simulation clicks
            get_title(link, driver)
    finally:
        # The driver is owned here; always shut the browser down, otherwise a
        # Chrome process is left behind after every run.
        driver.quit()
    return time.perf_counter() - start_time
def main_threads():
    """Fetch every entry of ``links`` concurrently, one thread per link.

    Each thread calls ``get_title`` without a driver, so every thread starts
    its own browser.

    Returns:
        Elapsed wall-clock time in seconds (float) until all threads finish.
    """
    # perf_counter is monotonic, so the interval is immune to clock changes.
    start_time = time.perf_counter()
    threads = []
    for link in links:  # each thread is a new 'click'
        th = threading.Thread(target=get_title, args=(link,))
        th.start()  # could sleep 1 between 'clicks' with `time.sleep(1)`
        # Keep a handle on the thread; without this the join loop below has
        # nothing to wait on and the measured time only covers start-up.
        threads.append(th)
    for th in threads:
        th.join()  # main thread waits for the workers to finish
    return time.perf_counter() - start_time
def main_multiprocessing():
    """Fetch every entry of ``links`` concurrently, one process per link.

    Each process calls ``get_title`` without a driver, so every process
    starts its own browser.

    Returns:
        Elapsed wall-clock time in seconds (float) until all processes finish.
    """
    # perf_counter is monotonic, so the interval is immune to clock changes.
    start_time = time.perf_counter()
    processes = []
    for link in links:  # each process is a new 'click'
        ps = multiprocessing.Process(target=get_title, args=(link,))
        ps.start()  # could sleep 1 between 'clicks' with `time.sleep(1)`
        # Keep a handle on the process; without this the join loop below has
        # nothing to wait on and the measured time only covers start-up.
        processes.append(ps)
    for ps in processes:
        ps.join()  # main process waits for the workers to finish
    return time.perf_counter() - start_time
def run_nget_times():
    """Run each benchmark once and return the three durations.

    Meant for statistical measurements when this file is used as a module.
    The result is the tuple ``(sequential, threads, multiprocessing)``.
    """
    sequential_time = main_sequentially()
    threads_time = main_threads()
    processes_time = main_multiprocessing()
    return sequential_time, threads_time, processes_time
if __name__ == '__main__':
    # Run the three benchmarks in order (sequential, threads, processes),
    # then print one report line per benchmark.
    report_templates = (
        "sequential {:0} seconds ---",
        "multithreads {:0} seconds ---",
        "multiprocessing {:0} seconds ---",
    )
    timings = run_nget_times()
    for template, elapsed in zip(report_templates, timings):
        print(template.format(elapsed))
