@himan94
Forked from miraculixx/example.MD
Created June 9, 2020 14:45
Python multiprocess parallel Selenium web scraping with improved performance
beautifulsoup4==4.6.3
certifi==2018.10.15
chardet==3.0.4
idna==2.7
lxml==4.2.5
requests==2.20.1
selenium==3.141.0
urllib3==1.24.1
# answer to https://stackoverflow.com/q/53475578/890242
import threading
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool, Pool  # Pool allows a process-based variant (see sketch below)

import requests
from bs4 import BeautifulSoup
from selenium import webdriver


def get_links(link):
    # Collect the question URLs from the tag listing page with plain requests.
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    titles = [urljoin(link, item.get("href"))
              for item in soup.select(".summary .question-hyperlink")]
    return titles


# One webdriver per thread, created lazily and reused for every URL that thread handles.
threadLocal = threading.local()


def get_driver():
    driver = getattr(threadLocal, 'driver', None)
    if driver is None:
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_argument("--headless")
        driver = webdriver.Chrome(options=chromeOptions)
        setattr(threadLocal, 'driver', driver)
    return driver


def get_title(url):
    # Reuse the calling thread's driver instead of starting a new Chrome per URL.
    driver = get_driver()
    driver.get(url)
    sauce = BeautifulSoup(driver.page_source, "lxml")
    item = sauce.select_one("h1 a").text
    print(item)
    return item


if __name__ == '__main__':
    url = "https://stackoverflow.com/questions/tagged/web-scraping"
    ThreadPool(5).map(get_title, get_links(url))
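
The description says multiprocess, and Pool is imported alongside ThreadPool but never used; a minimal sketch of the process-based variant, assuming the same get_links, get_title and get_driver definitions as above, only swaps the pool in the __main__ block (each worker process then creates and reuses its own headless Chrome, just as each thread does above; the pool size of 4 is an arbitrary choice):

# Process-based variant (sketch), reusing the functions defined above.
if __name__ == '__main__':
    url = "https://stackoverflow.com/questions/tagged/web-scraping"
    pool = Pool(4)
    try:
        pool.map(get_title, get_links(url))
    finally:
        pool.close()
        pool.join()

Both variants open one Chrome instance per worker; the process pool additionally sidesteps the GIL for any CPU-bound parsing, at the cost of higher startup overhead. Neither variant calls driver.quit(), so the per-worker browsers stay open until the program ends.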