@tarekziade · May 18, 2023
extract fully rendered web pages
"""
Demo of extracting text and links from a rendered web page.
$ brew install geckodriver
$ python3 -m venv .
$ bin/pip install bs4 selenium
$ bin/python scrap.py
The script looks for an element of a specific id on the page.
This can be used to make sure we wait for all JS to execute, and
fall back on waiting a few seconds.
The website can set such value at the end of its JS execution.
This demo uses BeautifulSoup to extract the content and links.
"""
import contextlib
import io
import sys
import time
import traceback

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


@contextlib.contextmanager
def silence():
    """Swallow stdout/stderr output, re-raising any exception with its traceback."""
    old_stderr = sys.stderr
    sys.stderr = io.StringIO()  # text-mode streams; BytesIO would break print()
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    last_exc = None
    try:
        yield
    except Exception as exc:
        last_exc = exc
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        if last_exc is not None:
            traceback.print_tb(last_exc.__traceback__)
            raise last_exc


def bs_parse(content="", parser="html.parser"):
    """Parse HTML, returning (elapsed seconds, node count, (tag, text) pairs, links)."""
    start = time.time()
    with silence():
        soup = BeautifulSoup(content, parser)
        text = soup.find_all(string=True)
        lines = [(t.parent.name, str(t).lower().strip()) for t in text]
        links = [a_tag.attrs.get("href") for a_tag in soup.find_all("a")]
    return (time.time() - start, len(text), lines, links)
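

# A minimal sketch of calling bs_parse() directly (hypothetical input, shown
# only to illustrate the shape of the returned tuple):
#
#   elapsed, count, lines, links = bs_parse("<p>Hi <a href='/x'>go</a></p>")
#   # count -> 2, lines -> [("p", "hi"), ("a", "go")], links -> ["/x"]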


def get_page_source(url, element_id, timeout=5):
    """Load the page in Firefox and return its source once element_id appears."""
    driver = webdriver.Firefox()
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, element_id))
        )
    except TimeoutException:
        # the element never appeared; give up after `timeout` seconds
        pass
    finally:
        source = driver.page_source
        driver.quit()
    return source


print(bs_parse(content=get_page_source("https://www.elastic.co", "myBar")))
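

# Optional refinement (a sketch, not part of the original demo): find_all(
# string=True) also returns text inside <script>/<style>/<head> nodes, which
# is rarely useful. A stricter variant filters on the parent tag name:


def bs_parse_visible(content="", parser="html.parser"):
    """Like bs_parse(), but keeps only human-visible text nodes."""
    skipped = {"script", "style", "noscript", "head", "title", "meta"}
    start = time.time()
    with silence():
        soup = BeautifulSoup(content, parser)
        text = [
            t for t in soup.find_all(string=True) if t.parent.name not in skipped
        ]
        lines = [(t.parent.name, str(t).lower().strip()) for t in text]
        links = [a_tag.attrs.get("href") for a_tag in soup.find_all("a")]
    return (time.time() - start, len(text), lines, links)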