@tarekziade · May 18, 2023
extract fully rendered web pages
"""
Demo of extracting text and links from a rendered web page.
$ brew install geckodriver
$ python3 -m venv .
$ bin/pip install bs4 selenium
$ bin/python scrap.py
The script looks for an element of a specific id on the page.
This can be used to make sure we wait for all JS to execute, and
fall back on waiting a few seconds.
The website can set such value at the end of its JS execution.
This demo uses BeautifulSoup to extract the content and links.
"""
import contextlib
import io
import sys
import time
import traceback

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


@contextlib.contextmanager
def silence():
    """Swallow stdout/stderr output, re-raising any exception with its traceback."""
    old_stderr = sys.stderr
    sys.stderr = io.StringIO()  # text-mode streams; BytesIO would break print()
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    last_exc = None
    try:
        yield
    except Exception as exc:
        last_exc = exc
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        if last_exc is not None:
            traceback.print_tb(last_exc.__traceback__)
            raise last_exc


def bs_parse(content="", parser="html.parser"):
    """Parse HTML, returning (elapsed seconds, node count, (tag, text) pairs, links)."""
    start = time.time()
    with silence():
        soup = BeautifulSoup(content, parser)
        text = soup.find_all(string=True)
        lines = [(t.parent.name, str(t).lower().strip()) for t in text]
        links = [a_tag.attrs.get("href") for a_tag in soup.find_all("a")]
    return (time.time() - start, len(text), lines, links)
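

# A minimal sketch of calling bs_parse() directly (hypothetical input, shown
# only to illustrate the shape of the returned tuple):
#
#   elapsed, count, lines, links = bs_parse("<p>Hi <a href='/x'>go</a></p>")
#   # count -> 2, lines -> [("p", "hi"), ("a", "go")], links -> ["/x"]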


def get_page_source(url, element_id, timeout=5):
    """Load the page in Firefox and return its source once element_id appears."""
    driver = webdriver.Firefox()
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, element_id))
        )
    except TimeoutException:
        # the element never appeared; give up after `timeout` seconds
        pass
    finally:
        source = driver.page_source
        driver.quit()
    return source


print(bs_parse(content=get_page_source("https://www.elastic.co", "myBar")))
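

# Optional refinement (a sketch, not part of the original demo): find_all(
# string=True) also returns text inside <script>/<style>/<head> nodes, which
# is rarely useful. A stricter variant filters on the parent tag name:


def bs_parse_visible(content="", parser="html.parser"):
    """Like bs_parse(), but keeps only human-visible text nodes."""
    skipped = {"script", "style", "noscript", "head", "title", "meta"}
    start = time.time()
    with silence():
        soup = BeautifulSoup(content, parser)
        text = [
            t for t in soup.find_all(string=True) if t.parent.name not in skipped
        ]
        lines = [(t.parent.name, str(t).lower().strip()) for t in text]
        links = [a_tag.attrs.get("href") for a_tag in soup.find_all("a")]
    return (time.time() - start, len(text), lines, links)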