from __future__ import absolute_import

import gzip
import json
import os
import random
import sys
import tempfile
import time
import traceback
from collections import namedtuple
from glob import glob
from hashlib import md5

import six
from PIL import Image
from selenium.common.exceptions import (MoveTargetOutOfBoundsException,
                                        TimeoutException,
                                        WebDriverException)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from six.moves import range

from ..MPLogger import loggingclient
from ..SocketInterface import clientsocket
from .utils.banner_utils import fetch_banner_list, find_banners_by_selectors
from .utils.firefox_profile import get_cookies
from .utils.lso import get_flash_cookies
from .utils.webdriver_extensions import (scroll_down,
                                         wait_until_loaded,
                                         get_intra_links,
                                         execute_in_all_frames,
                                         execute_script_with_retry)

# Constants for bot mitigation
NUM_MOUSE_MOVES = 10    # number of times to randomly move the mouse
RANDOM_SLEEP_LOW = 1    # low (in sec) for random sleep between page loads
RANDOM_SLEEP_HIGH = 7   # high (in sec) for random sleep between page loads


def bot_mitigation(webdriver):
    """ performs three commands to mitigate bot detection
    when getting a site """

    # bot mitigation 1: move the mouse randomly around a number of times
    window_size = webdriver.get_window_size()
    num_moves = 0
    num_fails = 0
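    # Attempt up to NUM_MOUSE_MOVES + 1 moves (the first move only centers
    # the cursor); give up once NUM_MOUSE_MOVES moves land out of bounds.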
    while num_moves < NUM_MOUSE_MOVES + 1 and num_fails < NUM_MOUSE_MOVES:
        try:
            if num_moves == 0:  # move to the center of the screen
                x = int(round(window_size['width'] / 2))
                y = int(round(window_size['height'] / 2))
            else:  # move a random amount in some direction
                move_max = random.randint(0, 500)
                x = random.randint(-move_max, move_max)
                y = random.randint(-move_max, move_max)
            action = ActionChains(webdriver)
            action.move_by_offset(x, y)
            action.perform()
            num_moves += 1
        except MoveTargetOutOfBoundsException:
            num_fails += 1

    # bot mitigation 2: scroll in random intervals down page
    scroll_down(webdriver)

    # bot mitigation 3: randomly wait so page visits happen with irregularity
    time.sleep(random.randrange(RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH))


def close_other_windows(webdriver):
    """
    close all open pop-up windows and tabs other than the current one
    """
    main_handle = webdriver.current_window_handle
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to.window(window)
                webdriver.close()
        webdriver.switch_to.window(main_handle)


def tab_restart_browser(webdriver):
    """
    kills the current tab and creates a new one to stop traffic
    """
    # note: this technically uses windows, not tabs, due to problems with
    # chrome-targeted keyboard commands in Selenium 3 (intermittent
    # nonsense WebDriverExceptions are thrown). windows can be reliably
    # created, although we do have to detour into JS to do it.
    close_other_windows(webdriver)

    if webdriver.current_url.lower() == 'about:blank':
        return

    # Create a new window. Note that it is not practical to use
    # noopener here, as we would then be forced to specify a bunch of
    # other "features" that we don't know whether they are on or off.
    # Closing the old window will kill the opener anyway.
    webdriver.execute_script("window.open('')")

    # This closes the _old_ window, and does _not_ switch to the new one.
    webdriver.close()

    # The only remaining window handle will be for the new window;
    # switch to it.
    assert len(webdriver.window_handles) == 1
    webdriver.switch_to.window(webdriver.window_handles[0])


def get_website(url, sleep, visit_id, webdriver,
                browser_params, extension_socket):
    """
    goes to <url> using the given <webdriver> instance
    """

    tab_restart_browser(webdriver)

    if extension_socket is not None:
        extension_socket.send(visit_id)

    # Execute a get through selenium
    try:
        webdriver.get(url)
    except TimeoutException:
        pass

    # Sleep after get returns
    time.sleep(sleep)

    # Close the modal dialog if one is present
    try:
        WebDriverWait(webdriver, 0.5).until(EC.alert_is_present())
        alert = webdriver.switch_to.alert
        alert.dismiss()
        time.sleep(1)
    except TimeoutException:
        pass

    close_other_windows(webdriver)

    if browser_params['bot_mitigation']:
        bot_mitigation(webdriver)


def extract_links(webdriver, browser_params, manager_params):
    link_elements = webdriver.find_elements_by_tag_name('a')

    if len(link_elements) > 0:
        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
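        # Queries are sent to the DataAggregator as (statement, params)
        # tuples; parameterless statements carry an empty tuple.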
        create_table_query = ("""
            CREATE TABLE IF NOT EXISTS links_found (
                found_on TEXT,
                location TEXT,
                label TEXT,
                inner_html TEXT
            )
        """, ())
        sock.send(create_table_query)

        current_url = webdriver.current_url
        insert_query_string = """
            INSERT INTO links_found (found_on, location, label, inner_html)
            VALUES (?, ?, ?, ?)
        """
        for element in link_elements:
            sock.send((insert_query_string,
                       (current_url,
                        element.get_attribute("href"),
                        element.get_attribute('textContent'),
                        element.get_attribute('innerHTML'))))

        sock.close()


def browse_website(url, num_links, sleep, visit_id, webdriver,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver,
                browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = [x for x in get_intra_links(webdriver, url)
                 if x.is_displayed() is True]
        logger.debug("BROWSER %i: found %d visible internal links" % (
            browser_params['crawl_id'], len(links)))
        if not links:
            break
        r = random.randrange(len(links))
        logger.info("BROWSER %i: visiting internal link %s" % (
            browser_params['crawl_id'], links[r].get_attribute("href")))
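
        # Click the chosen link, wait for the page to load, then navigate
        # back to the original page before choosing the next link.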
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            pass


def dump_flash_cookies(start_time, visit_id, webdriver, browser_params,
                       manager_params):
    """ Save newly changed Flash LSOs to database

    We determine which LSOs to save by the `start_time` timestamp.
    This timestamp should be taken prior to the `get` call that
    creates these changes.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Flash cookies
    flash_cookies = get_flash_cookies(start_time)
    for cookie in flash_cookies:
        query = ("INSERT INTO flash_cookies (crawl_id, visit_id, domain, "
                 "filename, local_path, key, content) VALUES (?,?,?,?,?,?,?)",
                 (browser_params['crawl_id'], visit_id, cookie.domain,
                  cookie.filename, cookie.local_path, cookie.key,
                  cookie.content))
        sock.send(query)

    # Close connection to db
    sock.close()


def dump_profile_cookies(start_time, visit_id, webdriver,
                         browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to the `get` call that
    creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this approach is likely to miss changes still present in the sqlite
    `wal` files. This function will likely be removed in a future version.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Cookies
    rows = get_cookies(browser_params['profile_path'], start_time)
    if rows is not None:
        for row in rows:
            query = ("INSERT INTO profile_cookies (crawl_id, visit_id, "
                     "baseDomain, name, value, host, path, expiry, accessed, "
                     "creationTime, isSecure, isHttpOnly) VALUES "
                     "(?,?,?,?,?,?,?,?,?,?,?,?)",
                     (browser_params['crawl_id'], visit_id) + row)
            sock.send(query)

    # Close connection to db
    sock.close()


def save_screenshot(visit_id, crawl_id, driver, manager_params, suffix=''):
    """ Save a screenshot of the current viewport """
    if suffix != '':
        suffix = '-' + suffix

    urlhash = md5(driver.current_url.encode('utf-8')).hexdigest()
    outname = os.path.join(manager_params['screenshot_path'],
                           '%i-%s%s.png' % (visit_id, urlhash, suffix))
    driver.save_screenshot(outname)


def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
    # Read image parts and compute dimensions of output image
    total_height = -1
    max_scroll = -1
    max_width = -1
    images = dict()
    parts = list()
    for f in glob(os.path.join(manager_params['screenshot_path'],
                               'parts',
                               '%i*-part-*.png' % visit_id)):

        # Load image from disk and parse params out of filename
        img_obj = Image.open(f)
        width, height = img_obj.size
        parts.append((f, width, height))
        outname, _, index, curr_scroll = os.path.basename(f).rsplit('-', 3)
        curr_scroll = int(curr_scroll.split('.')[0])
        index = int(index)

        # Update output image size
        if curr_scroll > max_scroll:
            max_scroll = curr_scroll
            total_height = max_scroll + height

        if width > max_width:
            max_width = width

        # Save image parameters
        img = {}
        img['object'] = img_obj
        img['scroll'] = curr_scroll
        images[index] = img

    # Output filename is the same for all parts, so we can just use the last
    outname = outname + '.png'
    outname = os.path.join(manager_params['screenshot_path'], outname)
    output = Image.new('RGB', (max_width, total_height))

    # Paste each part into the output image at its scroll offset
    for i in range(max(images.keys()) + 1):
        img = images[i]
        output.paste(im=img['object'], box=(0, img['scroll']))
        img['object'].close()
    try:
        output.save(outname)
    except SystemError:
        logger.error(
            "BROWSER %i: SystemError while trying to save screenshot %s. \n"
            "Slices of image %s \n Final size %s, %s." %
            (crawl_id, outname, '\n'.join([str(x) for x in parts]),
             max_width, total_height))


def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
                         suffix=''):
    logger = loggingclient(*manager_params['logger_address'])

    outdir = os.path.join(manager_params['screenshot_path'], 'parts')
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    if suffix != '':
        suffix = '-' + suffix
    urlhash = md5(driver.current_url.encode('utf-8')).hexdigest()
    outname = os.path.join(outdir, '%i-%s%s-part-%%i-%%i.png' %
                           (visit_id, urlhash, suffix))

    try:
        part = 0
        max_height = execute_script_with_retry(
            driver, 'return document.body.scrollHeight;')
        inner_height = execute_script_with_retry(
            driver, 'return window.innerHeight;')
        curr_scrollY = execute_script_with_retry(
            driver, 'return window.scrollY;')
        prev_scrollY = -1
        driver.save_screenshot(outname % (part, curr_scrollY))
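        # Scroll one viewport at a time, saving a screenshot of each part;
        # the prev_scrollY check ends the loop if scrolling stops advancing.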
        while ((curr_scrollY + inner_height) < max_height
               and curr_scrollY != prev_scrollY):

            # Scroll down to bottom of previous viewport
            try:
                driver.execute_script('window.scrollBy(0, window.innerHeight)')
            except WebDriverException:
                logger.info(
                    "BROWSER %i: WebDriverException while scrolling, "
                    "screenshot may be misaligned!" % crawl_id)

            # Update control variables
            part += 1
            prev_scrollY = curr_scrollY
            curr_scrollY = execute_script_with_retry(
                driver, 'return window.scrollY;')

            # Save screenshot
            driver.save_screenshot(outname % (part, curr_scrollY))
    except WebDriverException:
        excp = traceback.format_exception(*sys.exc_info())
        logger.error(
            "BROWSER %i: Exception while taking full page screenshot \n %s" %
            (crawl_id, ''.join(excp)))
        return

    _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params)


def dump_page_source(visit_id, driver, manager_params, suffix=''):
    if suffix != '':
        suffix = '-' + suffix

    outname = md5(driver.current_url.encode('utf-8')).hexdigest()
    outfile = os.path.join(manager_params['source_dump_path'],
                           '%i-%s%s.html' % (visit_id, outname, suffix))

    with open(outfile, 'wb') as f:
        f.write(driver.page_source.encode('utf8'))
        f.write(b'\n')


def recursive_dump_page_source(visit_id, driver, manager_params, suffix=''):
    """Dump a compressed html tree for the current page visit"""
    if suffix != '':
        suffix = '-' + suffix

    outname = md5(driver.current_url.encode('utf-8')).hexdigest()
    outfile = os.path.join(manager_params['source_dump_path'],
                           '%i-%s%s.json.gz' % (visit_id, outname, suffix))
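
    # collect_source is called once per (i)frame by execute_in_all_frames;
    # it fills `rv` in place, nesting each child frame's source inside its
    # parent frame's 'iframes' dict.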
    def collect_source(driver, frame_stack, rv={}):
        is_top_frame = len(frame_stack) == 1

        # Gather frame information
        doc_url = driver.execute_script("return window.document.URL;")
        if is_top_frame:
            page_source = rv
        else:
            page_source = dict()
        page_source['doc_url'] = doc_url
        source = driver.page_source
        if not isinstance(source, six.text_type):
            source = six.text_type(source, 'utf-8')
        page_source['source'] = source
        page_source['iframes'] = dict()

        # Store frame info in the correct area of the return value
        if is_top_frame:
            return
        out_dict = rv['iframes']
        for frame in frame_stack[1:-1]:
            out_dict = out_dict[frame.id]['iframes']
        out_dict[frame_stack[-1].id] = page_source

    page_source = dict()
    execute_in_all_frames(driver, collect_source, {'rv': page_source})

    with gzip.GzipFile(outfile, 'wb') as f:
        f.write(json.dumps(page_source).encode('utf-8'))


def detect_cookie_banner(visit_id, webdriver, browser_params, manager_params):
    """Detect whether the fetched site shows a cookie-notice/cookie-wall banner.

    We detect banners by searching for HTML elements using a large set of CSS
    selectors. The selectors were collected through crowdsourcing by the
    browser extension "I don't care about cookies". They are read from the
    file given by `browser_params['banner_list_location']`. If this parameter
    is not specified, the latest version is downloaded and cached.

    Once we find one or more cookie banner elements, we record them in the
    table 'cookie_banners'.
    """
    # Locate the banner list
    if 'banner_list_location' not in browser_params:
        bl_loc = os.path.join(tempfile.gettempdir(), 'bannerlist.txt')
        if not os.path.isfile(bl_loc):
            fetch_banner_list(tempfile.gettempdir())
        browser_params['banner_list_location'] = bl_loc  # cache for next runs
    else:
        bl_loc = os.path.expanduser(browser_params['banner_list_location'])

    time0 = time.time()
    logger = loggingclient(*manager_params['logger_address'])  # Connect to logger

    try:
        # The main searching happens in utils.banner_utils.find_banners_by_selectors()
        # with the lxml-cssselect module, which is much faster (roughly 5x)
        # than Selenium's search. See that function for some known limitations.
        n_selectors, banners_part = find_banners_by_selectors(
            webdriver.current_url, webdriver.page_source, bl_loc,
            logger=logger)
    except Exception as err:
        n_selectors, banners_part = None, []
        logger.warning("DETECT_COOKIE_BANNER: Exception in "
                       "'find_banners_by_selectors': %s" % err)

    logger.debug("DETECT_COOKIE_BANNER: find_banners_by_selectors() returned "
                 "%d results in %.1fs" %
                 (len(banners_part), time.time() - time0))

    Banner = namedtuple('Banner', ['selector', 'tag', 'id', 'text', 'html',
                                   'x', 'y', 'h', 'w'])
    banners = []
    for b in banners_part:
        # Selenium's find_elements_by_css_selector also reports an element's
        # size and position; run it only for the selectors of the banner
        # elements already found
        matched = None
        try:
            elements = webdriver.find_elements_by_css_selector(b.selector)
            for e in elements:
                if b.tag == e.tag_name and b.id == e.get_attribute('id'):
                    matched = e
                    break
            if elements and not matched:
                logger.warning("DETECT_COOKIE_BANNER: couldn't match %d "
                               "elements found by webdriver to cssselector"
                               % len(elements))
        except Exception as err:
            logger.warning("DETECT_COOKIE_BANNER: exception in "
                           "'webdriver.find_elements': %s" % err)

        if not matched:
            banners.append(Banner(selector=b.selector, tag=b.tag, id=b.id,
                                  text=b.text, html=None,
                                  x=None, y=None, w=None, h=None))
        else:
            banners.append(Banner(selector=b.selector, tag=b.tag, id=b.id,
                                  text=b.text,
                                  html=matched.get_attribute('innerHTML'),
                                  x=matched.location['x'],
                                  y=matched.location['y'],
                                  w=matched.size['width'],
                                  h=matched.size['height']))

    logger.info("DETECT_COOKIE_BANNER: found %d banners in %.1fs "
                "(%s selectors) for '%s'" % (
                    len(banners), time.time() - time0, n_selectors,
                    webdriver.current_url))

    # Write the results to the database
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    for b in banners:
        query = ("INSERT INTO cookie_banners "
                 "(crawl_id, visit_id, url, css_selector, selected_tag, "
                 "selected_id, text, html, pos_x, pos_y, size_w, size_h) "
                 "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
                 (browser_params["crawl_id"], visit_id, webdriver.current_url,
                  b.selector, b.tag, b.id, b.text, b.html, b.x, b.y, b.w, b.h))
        sock.send(query)
    sock.close()