

@wooddar
Last active May 8, 2024 03:59
Easy Python script to run selenium web workers/browsers in parallel
"""
This is an adaptable example script for using selenium across multiple web browsers simultaneously. It makes use of
two queues - one to store idle web workers and another to store data to pass to any idle web worker in a selenium function
"""
from multiprocessing import Queue, cpu_count
from threading import Thread
from selenium import webdriver
from time import sleep
from numpy.random import randint
import logging
logger = logging.getLogger(__name__)
# Some example data to pass to the selenium processes; each item i simply makes the worker sleep for i seconds
# This data can be a list of any datatype that can be pickled
selenium_data = [4, 2, 3, 3, 4, 3, 4, 3, 1, 2, 3, 2, 'STOP']
# Create the two queues to hold the data and the IDs for the selenium workers
selenium_data_queue = Queue()
worker_queue = Queue()
# Create Selenium processes and assign them a worker ID
# This ID is what needs to be put on the queue as Selenium workers cannot be pickled
# By default, make one selenium process per cpu core with cpu_count
# TODO: Change the worker creation code to be your webworker of choice e.g. PhantomJS (a headless-Chrome sketch follows the script below)
worker_ids = list(range(cpu_count()))
selenium_workers = {i: webdriver.Chrome() for i in worker_ids}
for worker_id in worker_ids:
    worker_queue.put(worker_id)
def selenium_task(worker, data):
    """
    This is a demonstration selenium function that takes a worker and data and then does something with the worker
    and data.
    TODO: change the below code to be whatever it is you want your worker to do e.g. scrape webpages or run browser tests
    :param worker: A selenium web worker NOT a worker ID
    :type worker: webdriver.XXX
    :param data: Any data for your selenium function (must be pickleable)
    :rtype: None
    """
    worker.set_window_size(randint(100, 200), randint(200, 400))
    logger.info("Fetching page")
    worker.get('https://ytroulette.com')
    logger.info("Sleeping")
    sleep(data)
def selenium_queue_listener(data_queue, worker_queue):
    """
    Monitor a data queue and assign new pieces of data to any available web workers to action
    :param data_queue: The python FIFO queue containing the data to run on the web worker
    :type data_queue: Queue
    :param worker_queue: The queue that holds the IDs of any idle workers
    :type worker_queue: Queue
    :rtype: None
    """
    logger.info("Selenium func worker started")
    while True:
        current_data = data_queue.get()
        if current_data == 'STOP':
            # If a stop is encountered then kill the current worker and put the stop back onto the queue
            # to poison other workers listening on the queue
            logger.warning("STOP encountered, killing worker thread")
            data_queue.put(current_data)
            break
        else:
            logger.info(f"Got the item {current_data} on the data queue")
            # Get the ID of any currently free workers from the worker queue
            worker_id = worker_queue.get()
            worker = selenium_workers[worker_id]
            # Assign current worker and current data to your selenium function
            selenium_task(worker, current_data)
            # Put the worker back into the worker queue as it has completed its task
            worker_queue.put(worker_id)
    return
# Create one new queue listener thread per selenium worker and start them
logger.info("Starting selenium background processes")
selenium_processes = [Thread(target=selenium_queue_listener,
args=(selenium_data_queue, worker_queue)) for _ in worker_ids]
for p in selenium_processes:
    p.daemon = True
    p.start()
# Add each item of data to the data queue; this could be done over time so long as the queue listener
# threads are still running
logger.info("Adding data to data queue")
for d in selenium_data:
    selenium_data_queue.put(d)
# Wait for all queue listener threads to complete; this happens when the queue listener returns
logger.info("Waiting for Queue listener threads to complete")
for p in selenium_processes:
    p.join()
# Quit all the web workers gracefully in the background
logger.info("Tearing down web workers")
for b in selenium_workers.values():
    b.quit()
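As the TODO above notes, the worker-creation line is meant to be swapped for your driver of choice. One possible variation (a sketch, not part of the original gist) is headless Chrome; the --headless=new flag assumes a recent Chrome/chromedriver, while older builds take plain --headless:

from selenium.webdriver.chrome.options import Options

def make_headless_worker():
    # Hypothetical helper: builds a Chrome worker that runs without a visible window
    opts = Options()
    opts.add_argument('--headless=new')  # plain '--headless' on older Chrome builds
    return webdriver.Chrome(options=opts)

# selenium_workers = {i: make_headless_worker() for i in worker_ids}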
ghost commented May 20, 2021

Wow wow woW

@julienlambert42

Thanks for this script.
How should I collect data from each thread and write it to a CSV file?
I can't get threading.Lock() to work ... :(
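
One minimal sketch of an answer (not from the thread itself): create a single threading.Lock in the main thread and hold it for each row write. The csv_lock, write_row, and results.csv names below are hypothetical:

    import csv
    from threading import Lock

    csv_lock = Lock()  # one shared lock, created once in the main thread

    def write_row(row, path='results.csv'):
        # Hold the lock for the whole write so rows from different threads never interleave
        with csv_lock:
            with open(path, 'a', newline='') as f:
                csv.writer(f).writerow(row)

Called from inside selenium_task, each thread then appends its rows without clobbering the others; the usual pitfall is creating a new Lock per thread instead of sharing one.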

@DolevAlgam

DolevAlgam commented Nov 29, 2022

This is great, thanks.
I'm also returning the results with

class ThreadWithReturnValue(Thread):

    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs=None):
        Thread.__init__(self, group, target, name, args, kwargs)
        self._return = None

    def run(self):
        # Capture the target's return value instead of discarding it
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, *args):
        Thread.join(self, *args)
        return self._return

Got it from this post: StackOverflow

For this to work you need line 81 to return a value; I'm returning a list that I extend with the return value of line 78.
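
Put together, a hypothetical usage might look like this, assuming selenium_queue_listener has been modified to collect the return values of each selenium_task call into a list and return it:

    # Sketch only: swap Thread for ThreadWithReturnValue in the gist's listener setup
    selenium_processes = [ThreadWithReturnValue(target=selenium_queue_listener,
                                                args=(selenium_data_queue, worker_queue))
                          for _ in worker_ids]
    for p in selenium_processes:
        p.daemon = True
        p.start()

    all_results = []
    for p in selenium_processes:
        # join() now hands back whatever the listener returned
        all_results.extend(p.join())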
