@rriemann · Created February 21, 2018 14:14
OpenWPM

Preparation

curl -sS https://www.i-dont-care-about-cookies.eu/abp/ > bannerlist.txt && \
curl -sS https://publicsuffix.org/list/public_suffix_list.dat > public_suffix_list.dat
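
The same files can also be fetched from Python; a minimal sketch (assumes the third-party requests package, which is not part of the gist):

import requests

# Hypothetical helper, not from the gist: download both lists into the
# current directory, next to the crawl script.
for url, fname in [
        ('https://www.i-dont-care-about-cookies.eu/abp/', 'bannerlist.txt'),
        ('https://publicsuffix.org/list/public_suffix_list.dat',
         'public_suffix_list.dat')]:
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    with open(fname, 'wb') as f:
        f.write(resp.content)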

The alternative browser_commands.py below also adds labels (the link text) to the links_found table generated by the extract_links command (a query sketch follows the file below).

from __future__ import absolute_import
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import MoveTargetOutOfBoundsException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from hashlib import md5
from glob import glob
from PIL import Image
import traceback
import random
import json
import time
import sys
import gzip
import os
import tempfile
from lxml import etree
from pprint import pprint
from collections import namedtuple
from ..SocketInterface import clientsocket
from ..MPLogger import loggingclient
from .utils.lso import get_flash_cookies
from .utils.firefox_profile import get_cookies
from .utils.webdriver_extensions import (scroll_down,
                                         wait_until_loaded,
                                         get_intra_links,
                                         execute_in_all_frames,
                                         execute_script_with_retry)
from .utils.banner_utils import fetch_banner_list, find_banners_by_selectors
from six.moves import range
import six
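# NOTE: compared to upstream OpenWPM's browser_commands.py, this variant
# additionally imports tempfile, lxml.etree, pprint and namedtuple, plus the
# banner_utils helpers used by the detect_cookie_banner() command at the
# bottom of this file.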
# Constants for bot mitigation
NUM_MOUSE_MOVES = 10 # Times to randomly move the mouse
RANDOM_SLEEP_LOW = 1 # low (in sec) for random sleep between page loads
RANDOM_SLEEP_HIGH = 7 # high (in sec) for random sleep between page loads
def bot_mitigation(webdriver):
    """ performs three optional commands for bot-detection
    mitigation when getting a site """

    # bot mitigation 1: move the mouse randomly around a number of times
    window_size = webdriver.get_window_size()
    num_moves = 0
    num_fails = 0
    while num_moves < NUM_MOUSE_MOVES + 1 and num_fails < NUM_MOUSE_MOVES:
        try:
            if num_moves == 0:  # move to the center of the screen
                x = int(round(window_size['width'] / 2))
                y = int(round(window_size['height'] / 2))
            else:  # move a random amount in some direction
                move_max = random.randint(0, 500)
                x = random.randint(-move_max, move_max)
                y = random.randint(-move_max, move_max)
            action = ActionChains(webdriver)
            action.move_by_offset(x, y)
            action.perform()
            num_moves += 1
        except MoveTargetOutOfBoundsException:
            num_fails += 1
            pass

    # bot mitigation 2: scroll in random intervals down page
    scroll_down(webdriver)

    # bot mitigation 3: randomly wait so page visits happen with irregularity
    time.sleep(random.randrange(RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH))
def close_other_windows(webdriver):
    """
    close all open pop-up windows and tabs other than the current one
    """
    main_handle = webdriver.current_window_handle
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
def tab_restart_browser(webdriver):
    """
    kills the current tab and creates a new one to stop traffic
    """
    # note: this technically uses windows, not tabs, due to problems with
    # chrome-targeted keyboard commands in Selenium 3 (intermittent
    # nonsense WebDriverExceptions are thrown). windows can be reliably
    # created, although we do have to detour into JS to do it.
    close_other_windows(webdriver)

    if webdriver.current_url.lower() == 'about:blank':
        return

    # Create a new window. Note that it is not practical to use
    # noopener here, as we would then be forced to specify a bunch of
    # other "features" that we don't know whether they are on or off.
    # Closing the old window will kill the opener anyway.
    webdriver.execute_script("window.open('')")

    # This closes the _old_ window, and does _not_ switch to the new one.
    webdriver.close()

    # The only remaining window handle will be for the new window;
    # switch to it.
    assert len(webdriver.window_handles) == 1
    webdriver.switch_to_window(webdriver.window_handles[0])
def get_website(url, sleep, visit_id, webdriver,
                browser_params, extension_socket):
    """
    goes to <url> using the given <webdriver> instance
    """
    tab_restart_browser(webdriver)

    if extension_socket is not None:
        extension_socket.send(visit_id)

    # Execute a get through selenium
    try:
        webdriver.get(url)
    except TimeoutException:
        pass

    # Sleep after get returns
    time.sleep(sleep)

    # Close modal dialog if exists
    try:
        WebDriverWait(webdriver, .5).until(EC.alert_is_present())
        alert = webdriver.switch_to_alert()
        alert.dismiss()
        time.sleep(1)
    except TimeoutException:
        pass

    close_other_windows(webdriver)

    if browser_params['bot_mitigation']:
        bot_mitigation(webdriver)
def extract_links(webdriver, browser_params, manager_params):
    link_elements = webdriver.find_elements_by_tag_name('a')
    if len(link_elements) > 0:
        sock = clientsocket()
        sock.connect(*manager_params['aggregator_address'])
        create_table_query = ("""
            CREATE TABLE IF NOT EXISTS links_found (
                found_on TEXT,
                location TEXT,
                label TEXT,
                inner_html TEXT
            )
            """, ())
        sock.send(create_table_query)

        current_url = webdriver.current_url
        insert_query_string = """
            INSERT INTO links_found (found_on, location, label, inner_html)
            VALUES (?, ?, ?, ?)
            """
        for element in link_elements:
            sock.send((insert_query_string,
                       (current_url,
                        element.get_attribute('href'),
                        element.get_attribute('textContent'),
                        element.get_attribute('innerHTML'))))
        sock.close()
def browse_website(url, num_links, sleep, visit_id, webdriver,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver,
                browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = [x for x in get_intra_links(webdriver, url)
                 if x.is_displayed() is True]
        logger.debug("BROWSER %i: %d displayed internal links found" % (
            browser_params['crawl_id'], len(links)))
        if not links:
            break
        r = int(random.random() * len(links))
        logger.info("BROWSER %i: visiting internal link %s" % (
            browser_params['crawl_id'], links[r].get_attribute("href")))
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            pass
def dump_flash_cookies(start_time, visit_id, webdriver, browser_params,
                       manager_params):
    """ Save newly changed Flash LSOs to database

    We determine which LSOs to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` command
    that creates these changes.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Flash cookies
    flash_cookies = get_flash_cookies(start_time)
    for cookie in flash_cookies:
        query = ("INSERT INTO flash_cookies (crawl_id, visit_id, domain, "
                 "filename, local_path, key, content) VALUES (?,?,?,?,?,?,?)",
                 (browser_params['crawl_id'], visit_id, cookie.domain,
                  cookie.filename, cookie.local_path, cookie.key,
                  cookie.content))
        sock.send(query)

    # Close connection to db
    sock.close()
def dump_profile_cookies(start_time, visit_id, webdriver,
                         browser_params, manager_params):
    """ Save changes to Firefox's cookies.sqlite to database

    We determine which cookies to save by the `start_time` timestamp.
    This timestamp should be taken prior to calling the `get` command
    that creates these changes.

    Note that the extension's cookieInstrument is preferred to this approach,
    as this is likely to miss changes still present in the sqlite `wal` files.
    This will likely be removed in a future version.
    """
    # Set up a connection to DataAggregator
    tab_restart_browser(webdriver)  # kills window to avoid stray requests
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])

    # Cookies
    rows = get_cookies(browser_params['profile_path'], start_time)
    if rows is not None:
        for row in rows:
            query = ("INSERT INTO profile_cookies (crawl_id, visit_id, "
                     "baseDomain, name, value, host, path, expiry, accessed, "
                     "creationTime, isSecure, isHttpOnly) VALUES "
                     "(?,?,?,?,?,?,?,?,?,?,?,?)",
                     (browser_params['crawl_id'], visit_id) + row)
            sock.send(query)

    # Close connection to db
    sock.close()
def save_screenshot(visit_id, crawl_id, driver, manager_params, suffix=''):
    """ Save a screenshot of the current viewport """
    if suffix != '':
        suffix = '-' + suffix
    urlhash = md5(driver.current_url.encode('utf-8')).hexdigest()
    outname = os.path.join(manager_params['screenshot_path'],
                           '%i-%s%s.png' % (visit_id, urlhash, suffix))
    driver.save_screenshot(outname)
def _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params):
    # Read image parts and compute dimensions of output image
    total_height = -1
    max_scroll = -1
    max_width = -1
    images = dict()
    parts = list()
    for f in glob(os.path.join(manager_params['screenshot_path'],
                               'parts',
                               '%i*-part-*.png' % visit_id)):

        # Load image from disk and parse params out of filename
        img_obj = Image.open(f)
        width, height = img_obj.size
        parts.append((f, width, height))
        outname, _, index, curr_scroll = os.path.basename(f).rsplit('-', 3)
        curr_scroll = int(curr_scroll.split('.')[0])
        index = int(index)

        # Update output image size
        if curr_scroll > max_scroll:
            max_scroll = curr_scroll
            total_height = max_scroll + height
        if width > max_width:
            max_width = width

        # Save image parameters
        img = {}
        img['object'] = img_obj
        img['scroll'] = curr_scroll
        images[index] = img

    # Output filename same for all parts, so we can just use last filename
    outname = outname + '.png'
    outname = os.path.join(manager_params['screenshot_path'], outname)
    output = Image.new('RGB', (max_width, total_height))

    # Paste each part into the output image at its scroll offset
    for i in range(max(images.keys()) + 1):
        img = images[i]
        output.paste(im=img['object'], box=(0, img['scroll']))
        img['object'].close()
    try:
        output.save(outname)
    except SystemError:
        logger.error(
            "BROWSER %i: SystemError while trying to save screenshot %s. \n"
            "Slices of image %s \n Final size %s, %s." %
            (crawl_id, outname, '\n'.join([str(x) for x in parts]),
             max_width, total_height))
        pass
def screenshot_full_page(visit_id, crawl_id, driver, manager_params,
                         suffix=''):
    logger = loggingclient(*manager_params['logger_address'])

    outdir = os.path.join(manager_params['screenshot_path'], 'parts')
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    if suffix != '':
        suffix = '-' + suffix
    urlhash = md5(driver.current_url.encode('utf-8')).hexdigest()
    outname = os.path.join(outdir, '%i-%s%s-part-%%i-%%i.png' %
                           (visit_id, urlhash, suffix))

    try:
        part = 0
        max_height = execute_script_with_retry(
            driver, 'return document.body.scrollHeight;')
        inner_height = execute_script_with_retry(
            driver, 'return window.innerHeight;')
        curr_scrollY = execute_script_with_retry(
            driver, 'return window.scrollY;')
        prev_scrollY = -1
        driver.save_screenshot(outname % (part, curr_scrollY))
        while ((curr_scrollY + inner_height) < max_height
               and curr_scrollY != prev_scrollY):

            # Scroll down to bottom of previous viewport
            try:
                driver.execute_script('window.scrollBy(0, window.innerHeight)')
            except WebDriverException:
                logger.info(
                    "BROWSER %i: WebDriverException while scrolling, "
                    "screenshot may be misaligned!" % crawl_id)
                pass

            # Update control variables
            part += 1
            prev_scrollY = curr_scrollY
            curr_scrollY = execute_script_with_retry(
                driver, 'return window.scrollY;')

            # Save screenshot
            driver.save_screenshot(outname % (part, curr_scrollY))
    except WebDriverException:
        excp = traceback.format_exception(*sys.exc_info())
        logger.error(
            "BROWSER %i: Exception while taking full page screenshot \n %s" %
            (crawl_id, ''.join(excp)))
        return

    _stitch_screenshot_parts(visit_id, crawl_id, logger, manager_params)
def dump_page_source(visit_id, driver, manager_params, suffix=''):
    if suffix != '':
        suffix = '-' + suffix

    outname = md5(driver.current_url.encode('utf-8')).hexdigest()
    outfile = os.path.join(manager_params['source_dump_path'],
                           '%i-%s%s.html' % (visit_id, outname, suffix))

    with open(outfile, 'wb') as f:
        f.write(driver.page_source.encode('utf8'))
        f.write(b'\n')
def recursive_dump_page_source(visit_id, driver, manager_params, suffix=''):
    """Dump a compressed html tree for the current page visit"""
    if suffix != '':
        suffix = '-' + suffix

    outname = md5(driver.current_url.encode('utf-8')).hexdigest()
    outfile = os.path.join(manager_params['source_dump_path'],
                           '%i-%s%s.json.gz' % (visit_id, outname, suffix))

    def collect_source(driver, frame_stack, rv={}):
        is_top_frame = len(frame_stack) == 1

        # Gather frame information
        doc_url = driver.execute_script("return window.document.URL;")
        if is_top_frame:
            page_source = rv
        else:
            page_source = dict()
        page_source['doc_url'] = doc_url
        source = driver.page_source
        if type(source) != six.text_type:
            source = six.text_type(source, 'utf-8')
        page_source['source'] = source
        page_source['iframes'] = dict()

        # Store frame info in correct area of return value
        if is_top_frame:
            return
        out_dict = rv['iframes']
        for frame in frame_stack[1:-1]:
            out_dict = out_dict[frame.id]['iframes']
        out_dict[frame_stack[-1].id] = page_source

    page_source = dict()
    execute_in_all_frames(driver, collect_source, {'rv': page_source})

    with gzip.GzipFile(outfile, 'wb') as f:
        f.write(json.dumps(page_source).encode('utf-8'))
def detect_cookie_banner(visit_id, webdriver, browser_params, manager_params):
    """Detect if the fetched site contains a cookie-notice/cookie-wall banner.

    We detect banners by searching for HTML elements using a large set of CSS
    selectors. The selectors have been collected through crowdsourcing by the
    browser extension "I don't care about cookies". They are read from the
    file given by `browser_params['banner_list_location']`. If this parameter
    is not specified, the latest version is downloaded and cached.

    Once we find one or more cookie banner elements, we record them in the
    table 'cookie_banners'.
    """
    # Locate the banner list
    if 'banner_list_location' not in browser_params:
        bl_loc = os.path.join(tempfile.gettempdir(), 'bannerlist.txt')
        if not os.path.isfile(bl_loc):
            fetch_banner_list(tempfile.gettempdir())
        browser_params['banner_list_location'] = bl_loc  # cache for next runs
    else:
        bl_loc = os.path.expanduser(browser_params['banner_list_location'])

    time0 = time.time()
    logger = loggingclient(*manager_params['logger_address'])  # Connect to logger
    try:
        # The main searching happens in utils.banner_utils.find_banners_by_selectors()
        # with the lxml-cssselect module, which is much faster (5x) than Selenium's
        # search. See that function for some known limitations.
        n_selectors, banners_part = find_banners_by_selectors(webdriver.current_url,
                                                              webdriver.page_source,
                                                              bl_loc,
                                                              logger=logger)
    except Exception as err:
        n_selectors, banners_part = None, []
        logger.warning("DETECT_COOKIE_BANNER: Exception in "
                       "'find_banners_by_selectors': %s" % err)
    logger.debug("DETECT_COOKIE_BANNER: find_banners_by_selectors() returned "
                 "%d results in %.1fs" % (len(banners_part), time.time() - time0))

    B = namedtuple('Banner', ['selector', 'tag', 'id', 'text', 'html',
                              'x', 'y', 'h', 'w'])
    banners = []
    for b in banners_part:
        # Selenium's find_elements_by_css gives also the size/pos of the element;
        # match it only for the selectors of the banner elements already found
        matched = None
        try:
            elements = webdriver.find_elements_by_css_selector(b.selector)
            for e in elements:
                if b.tag == e.tag_name and b.id == e.get_attribute('id'):
                    matched = e
                    break
            if elements and not matched:
                logger.warning("DETECT_COOKIE_BANNER: couldn't match %d elements "
                               "found by webdriver to cssselector" % len(elements))
        except Exception as err:
            logger.warning("DETECT_COOKIE_BANNER: exception in "
                           "'webdriver.find_elements': %s" % err)
        if not matched:
            banners.append(B(selector=b.selector, tag=b.tag, id=b.id, text=b.text,
                             html=None, x=None, y=None, w=None, h=None))
        else:
            banners.append(B(selector=b.selector, tag=b.tag, id=b.id, text=b.text,
                             html=matched.get_attribute('innerHTML'),
                             x=matched.location['x'], y=matched.location['y'],
                             w=matched.size['width'], h=matched.size['height']))
    logger.info("DETECT_COOKIE_BANNER: found %d banners in %.1fs (%d selectors) "
                "for '%s'" % (len(banners), time.time() - time0, n_selectors,
                              webdriver.current_url))

    # Write result to database.
    sock = clientsocket()
    sock.connect(*manager_params['aggregator_address'])
    for b in banners:
        query = ("INSERT INTO cookie_banners "
                 "(crawl_id, visit_id, url, css_selector, selected_tag, selected_id, "
                 " text, html, pos_x, pos_y, size_w, size_h) "
                 "VALUES (?,?,?, ?,?,?, ?,?, ?,?,?,?)",
                 (browser_params["crawl_id"], visit_id, webdriver.current_url,
                  b.selector, b.tag, b.id, b.text, b.html, b.x, b.y, b.w, b.h))
        sock.send(query)
    sock.close()
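
Once a crawl has run, the links_found and cookie_banners tables written above can be inspected with the standard sqlite3 module; a minimal sketch, assuming the default SQLite DataAggregator output file crawl-data.sqlite under manager_params['data_directory'] (the path is an assumption, not from the gist):

import sqlite3

conn = sqlite3.connect('crawl-data.sqlite')  # assumed default output file
# labels recorded by extract_links()
for row in conn.execute(
        "SELECT found_on, location, label FROM links_found LIMIT 10"):
    print(row)
# banners recorded by detect_cookie_banner()
for row in conn.execute(
        "SELECT url, css_selector, selected_tag FROM cookie_banners LIMIT 10"):
    print(row)
conn.close()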
The following crawl script then drives these commands:

from __future__ import absolute_import
import sys
sys.path.insert(0, '/opt/OpenWPM')
from automation import TaskManager, CommandSequence
from six.moves import range
from automation.MPLogger import loggingclient

# how-to custom methods:
# https://github.com/citp/OpenWPM/wiki/Available-Commands#run_custom_function
def check_js_frameworks(**kwargs):
    logger = loggingclient(*manager_params['logger_address'])
    driver = kwargs['driver']

    # check for AngularJS framework
    has_angular = driver.execute_script(
        "return (typeof angular == 'undefined' ? false : angular.version.full)")
    if has_angular:
        logger.warn("Angular Test for %s: %s" % (driver.current_url, has_angular))

    # check for VueJS framework
    has_vuejs = driver.execute_script(
        "return (typeof Vue == 'undefined' ? false : Vue.version)")
    if has_vuejs:
        logger.warn("VueJS Test for %s: %s" % (driver.current_url, has_vuejs))

    # check for React framework
    has_react = driver.execute_script(
        "var v; try {v = require('React2').version} catch(e) {v = false}; return v;")
    if has_react:
        logger.warn("React Test for %s: %s" % (driver.current_url, has_react))
    return
# The list of sites that we wish to crawl
NUM_BROWSERS = 1
sites = [
    'https://edps.europa.eu/',
    'http://www.example.com',
    'http://www.princeton.edu',
    'http://chirurgie-lichtenberg.de',
    'https://angularjs.org',
]
# Loads the manager preference and NUM_BROWSERS copies of the default
# browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i]['http_instrument'] = True
    # Enable flash for all browsers
    browser_params[i]['disable_flash'] = False
    browser_params[i]['headless'] = True  # Launch all browsers headless
    browser_params[i]['banner_list_location'] = '/opt/OpenWPM/bannerlist.txt'

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'
# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)
# Visits the sites with all browsers simultaneously
for site in sites:
    # use of reset=True for stateless crawl, see:
    # https://github.com/citp/OpenWPM#stateful-vs-stateless-crawls
    command_sequence = CommandSequence.CommandSequence(site, reset=True)

    # Start by visiting the page
    command_sequence.get(sleep=0, timeout=60)
    # check js frameworks
    command_sequence.run_custom_function(check_js_frameworks)
    # test cookie banner
    command_sequence.detect_cookie_banner()
    command_sequence.save_screenshot()
    # command_sequence.screenshot_full_page('test_full_%d' % i)
    command_sequence.extract_links()
    command_sequence.browse(num_links=2, sleep=1, timeout=60)
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    command_sequence.dump_profile_cookies(120)

    # index='**' synchronizes visits between the browsers
    manager.execute_command_sequence(command_sequence, index='**')
# Shuts down the browsers and waits for the data to finish logging
manager.close()
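
Assuming an OpenWPM checkout at /opt/OpenWPM with its dependencies installed and the files above saved as described, the crawl can then be started in the usual way (the script filename here is hypothetical):

python banner_crawl.py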