Skip to content

Instantly share code, notes, and snippets.

@pirate
Last active November 7, 2024 10:38
Show Gist options
  • Save pirate/853bf7ae2186ba06a0741742f453bb5f to your computer and use it in GitHub Desktop.
Save pirate/853bf7ae2186ba06a0741742f453bb5f to your computer and use it in GitHub Desktop.
Implements an EventEmitter interface that broadcasts all events across the puppeteer context, page context, and service worker contexts.
/*
Moved to: https://github.com/ArchiveBox/abx-spec-behaviors
*/
// OLD Version:
// This file contains the implementation for a 3-way EventEmitter / EventTarget-style event bus.
// It allows linking puppeteer, page, and service worker context and dispatching events from any of them to all of them.
// Events can be emitted from any context, and they are broadcast to all the other contexts.
// Handlers can listen for events in any context.
// There is no requirement to use all three contexts; you can also use a subset of them to link just two contexts.
// Usage example:
/**
 * Usage example: links the puppeteer, page (window) and extension background
 * contexts with a BehaviorBus each, registers a 'TEST' listener in every
 * context, then emits a 'TEST' event from every context.
 *
 * Illustrative only; nothing else in this file calls it.
 */
async function example() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Set up BehaviorBus and bidirectional event forwarding between puppeteer, window, and service worker contexts
  const BehaviorBus = new PuppeteerBehaviorBus(page);
  await BehaviorBus.setup();
  // NOTE(review): setup() installs the page-side bus with evaluateOnNewDocument, so the page
  // presumably has to navigate at least once before window.BehaviorBus exists -- confirm.

  // In puppeteer context
  BehaviorBus.on('TEST', (event, BehaviorBus, page) => {
    console.log('puppeteer received:', event);
  });

  // In window context (aka page context)
  await page.evaluate(() => {
    BehaviorBus.on('TEST', (event, BehaviorBus, window) => {
      console.log('window received:', event);
    });
  });

  // In service worker / extension context
  // (could also be set up by an extension in its own source code instead of evaluating in worker context using puppeteer)
  const extensionTarget = browser.targets().find((target) =>
    target.url().startsWith("chrome-extension://") && (target.type() === "background_page" || target.type() === "service_worker")
  );
  if (!extensionTarget) {
    throw new Error('No extension background page or service worker target found');
  }
  // Target.worker() is async and resolves to null for background_page targets,
  // which are reached through Target.page() instead.
  const worker = (await extensionTarget.worker()) ?? (await extensionTarget.page());
  await worker.evaluate(() => {
    // Service workers have no `window`; globalThis exists in both kinds of background context.
    // The bus is stored on globalThis so that later evaluate() calls can reach it.
    // NOTE(review): assumes ServiceWorkerBehaviorBus is already defined in the extension context.
    globalThis.BehaviorBus = new ServiceWorkerBehaviorBus(globalThis);
    globalThis.BehaviorBus.on('TEST', (event, BehaviorBus, scope) => {
      console.log('service worker received:', event);
    });
  });

  // Events can be dispatched from any context and will be received by all contexts:
  // From puppeteer:
  BehaviorBus.emit('TEST', { data: 'event sent from puppeteer context' });
  // From page:
  await page.evaluate(() => {
    BehaviorBus.emit('TEST', { data: 'event sent from page window context' });
  });
  // From service worker:
  await worker.evaluate(() => {
    BehaviorBus.emit('TEST', { data: 'event sent from service worker context' });
  });
}
/******************** Linkable BehaviorBus Implementations for each context *****************/
/**
 * Event bus for the extension background context (service worker / background page).
 *
 * Events emitted here are dispatched to local listeners and sent to every
 * connected tab with chrome.tabs.sendMessage. Events received from a tab via
 * chrome.runtime.onMessage are dispatched locally and relayed to all other
 * connected tabs. A tab counts as connected while it holds an open runtime
 * port named 'BehaviorBus'.
 *
 * Listeners are called as handler(event, bus, scope), where `scope` is the
 * object passed to the constructor.
 */
class ServiceWorkerBehaviorBus extends EventTarget {
  /**
   * @param {object} window - global scope object handed to listeners as their third argument.
   */
  constructor(window) {
    super();
    this.window = window;
    this.connectedTabs = new Set();
    this.setupMessageListeners();
  }

  /** Registers the chrome.runtime listeners that receive events and track connected tabs. */
  setupMessageListeners() {
    // Listen for messages from content scripts (aka windowToServiceWorkerBehaviorBusForwarder)
    chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
      if (!message?._is_behaviorbus_event) return;
      const event = message.event;
      // Emit locally
      this.dispatchEvent(new CustomEvent(event.type, { detail: event }));
      // Forward to all other connected tabs (not back to the sender)
      this.broadcastToTabs(event, sender.tab?.id);
      sendResponse({ received: true });
    });

    // Track connected tabs
    chrome.runtime.onConnect.addListener((port) => {
      if (port.name !== 'BehaviorBus') return;
      const tabId = port.sender?.tab?.id;
      if (!tabId) return;
      this.connectedTabs.add(tabId);
      port.onDisconnect.addListener(() => {
        this.connectedTabs.delete(tabId);
      });
    });
  }

  /**
   * Sends `event` to every connected tab except `excludedTabId`.
   * A tab whose sendMessage rejects is treated as disconnected and dropped.
   *
   * @param {object} event - bus event (must carry a `type`).
   * @param {number} [excludedTabId] - tab that should not receive the event.
   */
  broadcastToTabs(event, excludedTabId) {
    for (const tabId of this.connectedTabs) {
      if (tabId === excludedTabId) continue;
      chrome.tabs.sendMessage(tabId, {
        _is_behaviorbus_event: true,
        event,
        source: 'serviceWorker'
      }).catch(() => {
        // Remove disconnected tabs
        this.connectedTabs.delete(tabId);
      });
    }
  }

  /**
   * Registers a listener. The handler is wrapped so it is called with the
   * extra context arguments: handler(event, BehaviorBus, window).
   */
  addEventListener(type, handler, options) {
    const wrappedHandler = (e) => handler(e.detail || e, this, this.window);
    super.addEventListener(type, wrappedHandler, options);
  }

  /** Shorthand for addEventListener(type, handler). */
  on(type, handler) {
    this.addEventListener(type, handler);
  }

  /**
   * Emits an event locally and to all connected tabs.
   *
   * Accepts both call styles used in this file:
   *   emit('TYPE', { ...detail })  and  emit({ type: 'TYPE', ...detail })
   *
   * @param {string|object} type - event type, or a complete event object with a `type` field.
   * @param {object} [detail] - extra event fields (string form only).
   */
  emit(type, detail) {
    const event = typeof type === 'object' && type !== null ? { ...type } : { type, ...detail };
    // Emit locally
    this.dispatchEvent(new CustomEvent(event.type, { detail: event }));
    // Broadcast to all connected tabs
    this.broadcastToTabs(event);
  }
}
// Content script to bridge page and service worker
/**
 * Bridge between a page's window and the extension service worker.
 *
 * - Bus messages posted on the window (by the page-side bus) are sent to the
 *   service worker with chrome.runtime.sendMessage.
 * - Bus messages received from the service worker are posted into the window
 *   with source 'serviceWorker'.
 *
 * The function is self-contained because it is serialised with toString()
 * and evaluated in the page.
 */
const windowToServiceWorkerBehaviorBusForwarder = () => {
  // Connect to service worker. The worker registers tabs through ports named
  // 'BehaviorBus'; the port itself is not used for messaging.
  const port = chrome.runtime.connect({ name: 'BehaviorBus' });

  // Forward messages from page to service worker
  window.addEventListener('message', (event) => {
    const message = event.data;
    if (!message?._is_behaviorbus_event) return;
    // Messages this bridge posted into the page on behalf of the service worker
    // arrive back on this same listener. Sending them to the worker again would
    // make events bounce between tabs, so they are dropped here.
    if (message.source === 'serviceWorker') return;
    chrome.runtime.sendMessage({
      _is_behaviorbus_event: true,
      event: message.event,
      source: 'page'
    });
  });

  // Forward messages from service worker to page
  chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (!message?._is_behaviorbus_event) return;
    // window.postMessage is called directly rather than through a
    // window._emitToPage alias: the page-side bus installs its own
    // window._emitToPage(event) with a different signature, which would
    // replace the alias whenever both run in the same JS world.
    window.postMessage({
      _is_behaviorbus_event: true,
      event: message.event,
      source: 'serviceWorker'
    }, '*');
    sendResponse({ received: true });
  });
};
// Modified page-side BehaviorBus implementation
/**
 * Installs the page-side bus as window.BehaviorBus.
 *
 * The bus dispatches events to local listeners, forwards locally emitted
 * events to puppeteer (through window._emitToPuppeteer, when present) and
 * posts them on the window for the service worker bridge. It also installs
 * window._emitToPage(event), which puppeteer calls to deliver its events.
 *
 * The function is self-contained because it is serialised with toString()
 * and evaluated in the page.
 */
const windowBehaviorBusSetup = () => {
  window.BehaviorBus = new class WindowBehaviorBus extends EventTarget {
    constructor(window) {
      super();
      this.window = window;

      // Handle events from other contexts
      window.addEventListener('message', (e) => {
        const message = e.data;
        if (!message?._is_behaviorbus_event) return;
        // Messages with source 'page' or 'puppeteer' are the ones this bus posts
        // itself (in emit() and _emitToPage below) after dispatching the event
        // locally. Dispatching them again here would run every listener twice.
        if (message.source === 'page' || message.source === 'puppeteer') return;
        this.dispatchEvent(new CustomEvent(message.event.type, {
          detail: message.event
        }));
      });

      // Use alias for emitting to service worker for clarity
      window._emitToServiceWorker = window.postMessage;

      // Handle events from puppeteer: dispatch locally, then pass on to the service worker
      window._emitToPage = (event) => {
        this.dispatchEvent(new CustomEvent(event.type, {
          detail: event
        }));
        window._emitToServiceWorker({
          _is_behaviorbus_event: true,
          event,
          source: 'puppeteer'
        }, '*');
      };
    }

    /**
     * Registers a listener. The handler is wrapped so it is called with the
     * extra context arguments: handler(event, BehaviorBus, window).
     */
    addEventListener(type, handler, options) {
      const wrappedHandler = (e) => handler(e.detail || e, this, this.window);
      super.addEventListener(type, wrappedHandler, options);
    }

    /** Shorthand for addEventListener(type, handler). */
    on(type, handler) {
      this.addEventListener(type, handler);
    }

    /**
     * Emits an event locally, to puppeteer and to the service worker.
     *
     * Accepts both call styles used in this file:
     *   emit('TYPE', { ...detail })  and  emit({ type: 'TYPE', ...detail })
     *
     * NOTE(review): the event is passed to window.postMessage, so its fields
     * presumably need to be structured-cloneable -- confirm for DOM nodes.
     */
    emit(type, detail) {
      const event = typeof type === 'object' && type !== null ? { ...type } : { type, ...detail };
      // Emit locally
      this.dispatchEvent(new CustomEvent(event.type, { detail: event }));
      // Emit to puppeteer context (the function only exists when puppeteer exposed it)
      this.window._emitToPuppeteer?.(event);
      // Emit to service worker
      this.window._emitToServiceWorker({
        _is_behaviorbus_event: true,
        event,
        source: 'page'
      }, '*');
    }
  }(window);
};
// Modified puppeteer-side BehaviorBus
/**
 * Event bus for the puppeteer (node) context, linked to one page.
 *
 * Listeners are called as handler(event, bus, page). Events emitted here are
 * also forwarded into the page by calling window._emitToPage(event); events
 * emitted in the page arrive through the exposed window._emitToPuppeteer.
 */
class PuppeteerBehaviorBus extends EventEmitter {
  /**
   * @param {object} page - puppeteer Page this bus is linked to.
   */
  constructor(page) {
    super();
    this.page = page;
    // Bound because the method is handed to page.exposeFunction() as a bare callback.
    this._handlePageEvent = this._handlePageEvent.bind(this);
  }

  /** Installs the page-side bus and bridges; must be awaited before the page navigates. */
  async setup() {
    // Setup function that forwards events up from Window context to our Puppeteer context
    await this.page.exposeFunction('_emitToPuppeteer', this._handlePageEvent);
    // Setup bridge that forward events from page's window context to its peer service worker contexts
    await this.page.evaluateOnNewDocument(`(${windowToServiceWorkerBehaviorBusForwarder.toString()})()`);
    // Setup the page's window context WindowBehaviorBus
    await this.page.evaluateOnNewDocument(`(${windowBehaviorBusSetup.toString()})()`);
  }

  /**
   * Receives an event emitted in the page and dispatches it to local listeners only.
   * It is not forwarded back to the page: the page-side bus dispatches locally and
   * posts to the service worker itself, so echoing it would deliver it twice there.
   *
   * @param {object} event - event object carrying its own `type`.
   */
  _handlePageEvent(event) {
    super.emit(event.type, event, this, this.page);
  }

  /**
   * Emits an event to local listeners and forwards it to the page's bus.
   *
   * Accepts emit('TYPE'), emit('TYPE', { ...detail }) and
   * emit({ type: 'TYPE', ...detail }). The forwarded event always carries `type`.
   *
   * @param {string|object} type - event type, or a complete event object.
   * @param {object} [event] - extra event fields (string form only).
   * @returns {boolean} whether any local listener was registered for the type.
   */
  emit(type, event) {
    const payload = typeof type === 'object' && type !== null ? { ...type } : { type, ...event };
    // call any listeners on registered on our own Bus: handler(event, BehaviorBus, page)
    const hadListeners = super.emit(payload.type, payload, this, this.page);
    // then forward the event to the window context's bus
    if (this.page) {
      this.page.evaluate((event) => {
        window._emitToPage(event);
      }, payload).catch(console.error);
    }
    return hadListeners;
  }
}
/*********************** Example Behaviors ***********************/
// example: find all the <a href>s on the page and add them to the crawl queue
/**
 * Example behavior: reports links as FOUND_OUTLINK events.
 *
 * - window / PAGE_CAPTURE: emits one FOUND_OUTLINK per <a> element that has a non-empty href.
 * - window / FOUND_OUTLINK: logs the event.
 * - puppeteer / PAGE_SETUP: enables request interception and emits FOUND_OUTLINK
 *   for every requested URL ending in '.html'.
 */
const DiscoverOutlinksBehavior = {
  schema: 'BehaviorSchema@0.1.0',
  context: {
    window: {
      PAGE_CAPTURE: async (event, BehaviorBus, window) => {
        for (const elem of window.document.querySelectorAll('a')) {
          // <a> elements without an href report an empty string, which is not a crawlable URL
          if (!elem.href) continue;
          // NOTE(review): `elem` is a live DOM node; confirm the bus can forward it to other contexts.
          BehaviorBus.emit({type: 'FOUND_OUTLINK', url: elem.href, elem});
        }
      },
      FOUND_OUTLINK: async (event, BehaviorBus, window) => {
        console.log('DiscoverOutlinksBehavior found a new outlink to add to crawl!', event);
        // browsertrix driver itself would also listen for this event and use it to add add URLs to the crawl queue
      },
    },
    puppeteer: {
      // can also optionally implement handlers that run in other contexts (if driver implements that context)
      PAGE_SETUP: async (event, BehaviorBus, page) => {
        await page.setRequestInterception(true);
        page.on('request', (request) => {
          // Every intercepted request is let through; interception is only used to observe URLs.
          request.continue();
          if (request.url().endsWith('.html')) {
            BehaviorBus.emit({type: 'FOUND_OUTLINK', url: request.url()});
            // consumes/broadcasts events to all contexts using same shared BehaviorBus
            // so the FOUND_OUTLINK handler above would fire even though it's bound in a different context
          }
        });
      },
    },
  },
};
// example: behavior to extract a page's article text content
/**
 * Example behavior: on PAGE_CAPTURE in the window context, reads the body's
 * innerText once and emits it twice -- as an FS_WRITE_FILE request for
 * 'body_text.txt' and as a DISCOVERED_TEXT event for the 'body' selector.
 */
class ExtractArticleTextBehavior {
  static schema = 'BehaviorSchema@0.1.0';

  static contexts = {
    window: {
      PAGE_CAPTURE: async (event, BehaviorBus, window) => {
        const { innerText: bodyText } = window.document.body;
        const outgoing = [
          { type: 'FS_WRITE_FILE', path: 'body_text.txt', content: bodyText },
          // browsertrix could listen for this to build a full-text-search index in the WARC if it wants
          { type: 'DISCOVERED_TEXT', selector: 'body', text: bodyText },
        ];
        for (const message of outgoing) {
          BehaviorBus.emit(message);
        }
      },
    },
  };
}
// example: behavior to expand comments on reddit, facebook, and github
/**
 * Example behavior: on PAGE_LOAD, opens collapsed <details> elements on
 * facebook.com, reddit.com and github.com pages (including their subdomains).
 * Implemented for both the window context and the puppeteer context.
 */
class ExpandCommentsBehavior {
  static schema = 'BehaviorSchema@0.1.0';

  // private helper methods that behavior can use internally

  /** Opens a single <details> element. */
  static _expand = (elem) => { elem.open = true };

  /**
   * Returns true when `page_url` is hosted on one of the supported sites.
   * The hostname is compared (exact match or subdomain) rather than searching the
   * whole URL, so 'www.reddit.com' matches and a URL that merely mentions
   * '//github.com' in its query string does not. Unparseable URLs return false.
   *
   * @param {string} page_url
   * @returns {boolean}
   */
  static _shouldRun = (page_url) => {
    let hostname;
    try {
      hostname = new URL(page_url).hostname;
    } catch {
      return false;
    }
    return ['facebook.com', 'reddit.com', 'github.com'].some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    );
  };

  static contexts = {
    window: {
      PAGE_LOAD: async (event, BehaviorBus, window) => {
        if (!ExpandCommentsBehavior._shouldRun(window.location.href)) return;
        // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
        [...window.document.querySelectorAll('article details')].forEach(ExpandCommentsBehavior._expand);
        [...window.document.querySelectorAll('div.js-discussion details:not(.details-overlay)')].forEach(ExpandCommentsBehavior._expand);
        [...window.document.querySelectorAll('.markdown-body details')].forEach(ExpandCommentsBehavior._expand);
      }
    },
    puppeteer: {
      PAGE_LOAD: async (event, BehaviorBus, page) => {
        if (!ExpandCommentsBehavior._shouldRun(page.url())) return;
        // page.$$eval() hands the page function the whole ARRAY of matched elements,
        // so it has to loop itself. It is written inline and self-contained because
        // it is evaluated inside the page, where this class is not in scope.
        const expandAll = (elems) => elems.forEach((elem) => { elem.open = true; });
        // if driver offers a puppeteer context the behavior can use its extra powers to pierce nested iframes/shadow doms/etc
        await page.$$eval('pierce/article details', expandAll);
        await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', expandAll);
        await page.$$eval('pierce/.markdown-body details', expandAll);
      }
    }
  };
}
/*********************** Example Behavior Driver ***********************/
// The example behaviors defined above, collected into one list.
const behaviors = [
  DiscoverOutlinksBehavior,
  ExtractArticleTextBehavior,
  ExpandCommentsBehavior,
];
/**
 * Example driver for the page (window) context.
 *
 * Registers the driver's own FOUND_OUTLINK and FS_WRITE_FILE handlers, schedules
 * the PAGE_LOAD / PAGE_CAPTURE / PAGE_CAPTURE_COMPLETE lifecycle events for when
 * the window's 'load' event fires, and attaches every behavior's window-context
 * handlers to the bus.
 *
 * @param {Window} window - the page's window object.
 * @param {Array<object>} behaviors - behaviors exposing handlers under
 *   `contexts.window` (or the `context` / `browser` spellings also seen in this file).
 */
async function runBehaviorsInWindowContext(window, behaviors) {
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Reuse a bus that is already installed on the window, so handlers attach to the
  // instance other code already holds; otherwise create one.
  const BehaviorBus = window.BehaviorBus ?? new PageBehaviorBus(window);
  window.BehaviorBus = BehaviorBus;

  // driver implements the "standard library" of APIs it exposes to behaviors
  // by defining handler hooks for the common events types that it listens for
  // + it also automatically triggers some events based on page & capture lifecycle
  // (e.g. PAGE_LOAD when the page loads, PAGE_CAPTURE when its time to archive stuff)
  BehaviorBus.on('FOUND_OUTLINK', async ({url, elem}, BehaviorBus, window) => {
    window.addLink(url); // e.g. tell browsertrix to add the url to its crawl queue
  });

  // example: when running inside a page context like we are now,
  // behavior driver could implement fs handlers using OPFS
  BehaviorBus.on('FS_WRITE_FILE', async ({path, content}, BehaviorBus, window) => {
    const opfsRoot = await window.navigator.storage.getDirectory();
    // Write to the requested path rather than a fixed file name.
    const fileHandle = await opfsRoot.getFileHandle(path, { create: true });
    // createSyncAccessHandle() is only offered inside dedicated workers, so the
    // window context uses the async writable stream instead.
    const writable = await fileHandle.createWritable();
    await writable.write(content);
    await writable.close();
    window.behavior_output_files ??= {};
    window.behavior_output_files[path] = fileHandle;
    // browsertrix could add these ^ to warc after
  });

  // trigger main archiving events once page has loaded
  window.addEventListener('load', async () => {
    BehaviorBus.emit('PAGE_LOAD');
    await sleep(5000); // wait extra 5 seconds for animations/slow ajax
    BehaviorBus.emit('PAGE_CAPTURE');
    await sleep(5000);
    BehaviorBus.emit('PAGE_CAPTURE_COMPLETE');
  });

  // attach all the behaviors event handlers to the shared BehaviorBus
  for (const behavior of behaviors) {
    // Behaviors in this file spell the key both `contexts` and `context`.
    const contexts = behavior.contexts ?? behavior.context ?? {};
    const handlers = contexts.window ?? contexts.browser ?? {};
    for (const [eventName, handler] of Object.entries(handlers)) {
      BehaviorBus.on(eventName, handler);
    }
  }
}
/**
 * Example driver for the puppeteer (node) context.
 *
 * Launches a browser and crawls a queue that starts at https://example.com.
 * For each URL it opens a page, links a PuppeteerBehaviorBus to it, registers
 * the driver's FS_WRITE_FILE / FOUND_OUTLINK handlers and every behavior's
 * puppeteer-context handlers, navigates, and waits for PAGE_CAPTURE_COMPLETE.
 * URLs reported through FOUND_OUTLINK are appended to the queue once each.
 *
 * @param {object} page - unused; every crawled URL gets its own page. Kept so
 *   existing callers keep working.
 * @param {Array<object>} behaviors - behaviors exposing handlers under
 *   `contexts.puppeteer` (or the `context` spelling also seen in this file).
 */
async function runBehaviorsInNodePuppeteerContext(page, behaviors) {
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const crawl_queue = ['https://example.com'];
  // URLs that were ever queued, so pages linking to each other do not re-queue forever.
  const queued = new Set(crawl_queue);

  const browser = await puppeteer.launch();
  try {
    // for...of sees URLs pushed onto the array while the loop is running
    for (const url of crawl_queue) {
      const currentPage = await browser.newPage();
      try {
        const BehaviorBus = new PuppeteerBehaviorBus(currentPage);
        await BehaviorBus.setup();

        // Driver handlers are registered per page: the bus only exists once the page does.
        // puppeteer running in node can just use node's fs module for file IO
        BehaviorBus.on('FS_WRITE_FILE', async ({path, content}, BehaviorBus, page) => {
          fs.writeFileSync(path, content);
        });
        // this driver might add new URLs to the crawl queue by just adding them to an array
        BehaviorBus.on('FOUND_OUTLINK', async ({url}, BehaviorBus, page) => {
          if (queued.has(url)) return;
          queued.add(url);
          crawl_queue.push(url);
        });

        // attach all the behaviors event handlers to the shared BehaviorBus
        for (const behavior of behaviors) {
          // Behaviors in this file spell the key both `contexts` and `context`.
          const contexts = behavior.contexts ?? behavior.context ?? {};
          for (const [eventName, handler] of Object.entries(contexts.puppeteer ?? {})) {
            BehaviorBus.on(eventName, handler);
          }
        }

        // A Promise can only be resolved from inside its executor.
        const pageCaptureComplete = new Promise((resolve) => {
          BehaviorBus.on('PAGE_CAPTURE_COMPLETE', () => resolve());
        });

        // trigger main archiving events once page has loaded
        // The payload carries its own `type` so it is still well-formed when it is
        // forwarded to the page bus, which reads event.type.
        currentPage.on('load', async () => {
          BehaviorBus.emit('PAGE_LOAD', { type: 'PAGE_LOAD' });
          await sleep(5000); // wait extra 5 seconds for animations/slow ajax
          BehaviorBus.emit('PAGE_CAPTURE', { type: 'PAGE_CAPTURE' });
          await sleep(5000);
          BehaviorBus.emit('PAGE_CAPTURE_COMPLETE', { type: 'PAGE_CAPTURE_COMPLETE' });
        });

        // navigate to the url and let the page load
        await currentPage.goto(url);
        // wait for the PAGE_CAPTURE_COMPLETE event to fire after the page archiving is complete
        await pageCaptureComplete;
      } finally {
        await currentPage.close();
      }
    }
  } finally {
    await browser.close();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment