Last active
November 7, 2024 10:38
-
-
Save pirate/853bf7ae2186ba06a0741742f453bb5f to your computer and use it in GitHub Desktop.
Implements an EventEmitter interface that broadcasts all events across the puppeteer context, the page context, and service worker contexts.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Moved to: https://github.com/ArchiveBox/abx-spec-behaviors | |
*/ | |
// OLD Version: | |
// This file contains the implementation of a 3-way EventEmitter / EventTarget-style event bus.
// It links the puppeteer, page, and service worker contexts so that an event dispatched from any one of them reaches all of them.
// Events can be emitted from any context, and they are broadcast to all the other contexts.
// Handlers can listen for events in any context.
// There is no requirement to use all three contexts; you can also use a subset of them to link just two contexts.
// Usage example:
/**
 * End-to-end usage example: links the puppeteer, page (window) and extension
 * service worker contexts through a BehaviorBus, registers a 'TEST' listener
 * in each context, then emits a 'TEST' event from each context.
 *
 * Events are emitted as `emit(type, { type, ...fields })`: every bus in this
 * file takes the event type as its first argument, and repeating `type`
 * inside the payload keeps the event self-describing when it is forwarded
 * to another context.
 *
 * @returns {Promise<void>}
 * @throws {Error} when no extension background/service-worker target is found.
 */
async function example() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Set up BehaviorBus and bidirectional event forwarding between puppeteer, window, and service worker contexts
  const BehaviorBus = new PuppeteerBehaviorBus(page);
  await BehaviorBus.setup();

  // setup() registers its scripts with evaluateOnNewDocument, so they only run
  // in documents created afterwards: navigate before touching window.BehaviorBus.
  await page.goto('https://example.com');

  // In puppeteer context
  BehaviorBus.on('TEST', (event, BehaviorBus, page) => {
    console.log('puppeteer received:', event);
  });

  // In window context (aka page context)
  await page.evaluate(() => {
    BehaviorBus.on('TEST', (event, BehaviorBus, window) => {
      console.log('window received:', event);
    });
  });

  // In service worker / extension context
  // (could also be set up by an extension in its own source code instead of evaluating in worker context using puppeteer)
  const workerTarget = browser.targets().find((target) =>
    target.url().startsWith('chrome-extension://') &&
    (target.type() === 'background_page' || target.type() === 'service_worker')
  );
  if (!workerTarget) {
    throw new Error('No extension background page / service worker target found');
  }
  // Target.worker() is asynchronous and must be awaited before use.
  const worker = await workerTarget.worker();
  if (!worker) {
    throw new Error('Extension target does not expose a worker handle');
  }
  await worker.evaluate(() => {
    // `self` exists in both service workers and background pages (`window` does not
    // exist in a service worker). Storing the bus on it lets later evaluate() calls reach it.
    self.BehaviorBus = new ServiceWorkerBehaviorBus(self);
    self.BehaviorBus.on('TEST', (event, BehaviorBus, scope) => {
      console.log('service worker received:', event);
    });
  });

  // Events can be dispatched from any context and will be received by all contexts:
  // From puppeteer:
  BehaviorBus.emit('TEST', { type: 'TEST', data: 'event sent from puppeteer context' });

  // From page:
  await page.evaluate(() => {
    BehaviorBus.emit('TEST', { type: 'TEST', data: 'event sent from page window context' });
  });

  // From service worker:
  await worker.evaluate(() => {
    self.BehaviorBus.emit('TEST', { type: 'TEST', data: 'event sent from service worker context' });
  });
}
/******************** Linkable BehaviorBus Implementations for each context *****************/ | |
/**
 * Event bus for the extension service worker (or background page) context.
 *
 * Receives bus events from content scripts through `chrome.runtime.onMessage`,
 * dispatches them to local listeners, and relays them to every other tab that
 * has opened a `chrome.runtime` port named 'BehaviorBus'.
 *
 * Listeners are invoked as `handler(event, bus, window)`.
 */
class ServiceWorkerBehaviorBus extends EventTarget {
  /**
   * @param {object} window - global scope object handed to every listener as
   *   its third argument (stored as `this.window`).
   */
  constructor(window) {
    super();
    this.window = window;
    this.connectedTabs = new Set();
    this.setupMessageListeners();
  }

  /**
   * Registers the chrome.runtime listeners that receive events from content
   * scripts and keep `connectedTabs` up to date.
   */
  setupMessageListeners() {
    // Listen for messages from content scripts (aka windowToServiceWorkerBehaviorBusForwarder)
    chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
      // Unrelated extension messages (including null/undefined payloads) are ignored.
      if (!message?._is_behaviorbus_event) return;

      const event = message.event;
      // Emit locally
      this.dispatchEvent(new CustomEvent(event.type, { detail: event }));
      // Forward to all other connected tabs, but not back to the tab that sent it
      this._broadcastToTabs(event, sender.tab?.id);
      sendResponse({ received: true });
    });

    // Track connected tabs
    chrome.runtime.onConnect.addListener((port) => {
      if (port.name !== 'BehaviorBus') return;

      const tabId = port.sender?.tab?.id;
      // Explicit null check rather than truthiness, so a numeric id is never dropped.
      if (tabId == null) return;

      this.connectedTabs.add(tabId);
      port.onDisconnect.addListener(() => {
        this.connectedTabs.delete(tabId);
      });
    });
  }

  /**
   * Sends `event` to every connected tab except `exceptTabId`.
   * A tab whose sendMessage call rejects is dropped from `connectedTabs`.
   *
   * @param {object} event - bus event (`{ type, ...fields }`).
   * @param {number} [exceptTabId] - tab id to skip (the event's origin).
   */
  _broadcastToTabs(event, exceptTabId) {
    this.connectedTabs.forEach((tabId) => {
      if (tabId === exceptTabId) return;
      chrome.tabs.sendMessage(tabId, {
        _is_behaviorbus_event: true,
        event,
        source: 'serviceWorker',
      }).catch(() => {
        // Remove disconnected tabs
        this.connectedTabs.delete(tabId);
      });
    });
  }

  /**
   * Registers a listener. The handler is wrapped so that it is called as
   * `handler(event, bus, window)` instead of receiving the raw DOM event.
   * Because the wrapper (not `handler`) is what gets registered, the original
   * function cannot be passed to `removeEventListener` to undo this.
   */
  addEventListener(type, handler, options) {
    const wrappedHandler = (e) => handler(e.detail || e, this, this.window);
    super.addEventListener(type, wrappedHandler, options);
  }

  /** Shorthand for `addEventListener(type, handler)`. */
  on(type, handler) {
    this.addEventListener(type, handler);
  }

  /**
   * Emits an event locally and broadcasts it to all connected tabs.
   *
   * Accepts both `emit(type, detail)` and the single-object form
   * `emit({ type, ...detail })` that the behaviors in this file use.
   *
   * @param {string|object} type - event type, or a whole event object.
   * @param {object} [detail] - extra event fields.
   */
  emit(type, detail) {
    const event = (type !== null && typeof type === 'object')
      ? { ...type, ...detail }
      : { type, ...detail };
    // Emit locally
    this.dispatchEvent(new CustomEvent(event.type, { detail: event }));
    // Broadcast to all connected tabs
    this._broadcastToTabs(event);
  }
}
// Content script to bridge page and service worker
/**
 * Bridges bus events between the page's `window` (via postMessage) and the
 * extension service worker (via chrome.runtime messaging).
 *
 * This function is serialized with `toString()` and injected into the page,
 * so it must stay self-contained (no references to outer-scope helpers).
 */
const windowToServiceWorkerBehaviorBusForwarder = () => {
  // Connect to service worker; the named port is how the worker tracks this tab.
  chrome.runtime.connect({ name: 'BehaviorBus' });

  // Forward messages from page to service worker
  window.addEventListener('message', (event) => {
    const message = event.data;
    // postMessage payloads may be null or primitives, so guard the property access.
    if (!message?._is_behaviorbus_event) return;
    // Events delivered below are tagged 'serviceWorker'. Sending those back would
    // bounce the same event between the worker and its tabs indefinitely.
    if (message.source === 'serviceWorker') return;

    // Promise.resolve() covers both the promise-returning and callback-style API.
    Promise.resolve(chrome.runtime.sendMessage({
      _is_behaviorbus_event: true,
      event: message.event,
      source: 'page',
    })).catch(console.error);
  });

  // Forward messages from service worker to page.
  // window.postMessage is called directly instead of through a `window._emitToPage`
  // alias: the window-side bus defines its own `window._emitToPage` with a different
  // signature, and sharing the name would let either script break the other.
  chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
    if (!message?._is_behaviorbus_event) return;

    window.postMessage({
      _is_behaviorbus_event: true,
      event: message.event,
      source: 'serviceWorker',
    }, '*');
    sendResponse({ received: true });
  });
};
// Modified page-side BehaviorBus implementation
/**
 * Installs `window.BehaviorBus`, the page-context event bus.
 *
 * This function is serialized with `toString()` and injected into the page,
 * so it must stay self-contained (no references to outer-scope helpers).
 *
 * Globals defined on `window`:
 *   - `BehaviorBus`            the bus instance
 *   - `_emitToServiceWorker`   alias of `window.postMessage`
 *   - `_emitToPage(event)`     entry point puppeteer calls to deliver an event
 * Global read from `window` (optional): `_emitToPuppeteer(event)`.
 */
const windowBehaviorBusSetup = () => {
  window.BehaviorBus = new class WindowBehaviorBus extends EventTarget {
    /**
     * @param {object} window - the page's window; passed to listeners as their third argument.
     */
    constructor(window) {
      super();
      this.window = window;

      // Handle events from other contexts
      window.addEventListener('message', (e) => {
        const message = e.data;
        // postMessage payloads may be null or primitives, so guard the property access.
        if (!message?._is_behaviorbus_event) return;
        // Messages tagged 'page' or 'puppeteer' are this bus's own outbound posts
        // (see emit() and _emitToPage below). They were already dispatched locally,
        // so dispatching them again would run every listener twice.
        if (message.source === 'page' || message.source === 'puppeteer') return;

        this.dispatchEvent(new CustomEvent(message.event.type, {
          detail: message.event,
        }));
      });

      // Use alias for emitting to service worker for clarity
      window._emitToServiceWorker = window.postMessage;

      // Handle events from puppeteer
      window._emitToPage = (event) => {
        this.dispatchEvent(new CustomEvent(event.type, {
          detail: event,
        }));
        window._emitToServiceWorker({
          _is_behaviorbus_event: true,
          event,
          source: 'puppeteer',
        }, '*');
      };
    }

    /**
     * Registers a listener. The handler is wrapped so that it is called as
     * `handler(event, bus, window)` instead of receiving the raw DOM event.
     */
    addEventListener(type, handler, options) {
      const wrappedHandler = (e) => handler(e.detail || e, this, this.window);
      super.addEventListener(type, wrappedHandler, options);
    }

    /** Shorthand for `addEventListener(type, handler)`. */
    on(type, handler) {
      this.addEventListener(type, handler);
    }

    /**
     * Emits an event locally, to puppeteer (when `_emitToPuppeteer` is
     * exposed), and to the service worker bridge via postMessage.
     *
     * Accepts both `emit(type, detail)` and the single-object form
     * `emit({ type, ...detail })`.
     *
     * @param {string|object} type - event type, or a whole event object.
     * @param {object} [detail] - extra event fields.
     */
    emit(type, detail) {
      const event = (type !== null && typeof type === 'object')
        ? { ...type, ...detail }
        : { type, ...detail };

      // Emit locally
      this.dispatchEvent(new CustomEvent(event.type, { detail: event }));

      // Emit to puppeteer context. The hook is absent when the bus runs without a
      // puppeteer driver; when present its result may be a promise, so log rejections.
      Promise.resolve(this.window._emitToPuppeteer?.(event)).catch(console.error);

      // Emit to service worker
      this.window._emitToServiceWorker({
        _is_behaviorbus_event: true,
        event,
        source: 'page',
      }, '*');
    }
  }(window);
};
// Modified puppeteer-side BehaviorBus
/**
 * Event bus for the puppeteer (Node) context.
 *
 * Listeners are invoked as `handler(event, bus, page)`. Events emitted here
 * are also forwarded into the page by calling `window._emitToPage(event)`;
 * events emitted in the page arrive through the exposed `_emitToPuppeteer`
 * function.
 */
class PuppeteerBehaviorBus extends EventEmitter {
  /**
   * @param {object} page - puppeteer Page this bus is linked to.
   */
  constructor(page) {
    super();
    this.page = page;
    // Bound because it is handed to page.exposeFunction() as a bare callback.
    this._handlePageEvent = this._handlePageEvent.bind(this);
  }

  /**
   * Exposes `_emitToPuppeteer` to the page and registers the page-side
   * scripts to run in every new document.
   *
   * @returns {Promise<void>}
   */
  async setup() {
    // Setup function that forwards events up from Window context to our Puppeteer context
    await this.page.exposeFunction('_emitToPuppeteer', this._handlePageEvent);

    // Setup bridge that forwards events from page's window context to its peer service worker contexts
    await this.page.evaluateOnNewDocument(`(${windowToServiceWorkerBehaviorBusForwarder.toString()})()`);

    // Setup the page's window context WindowBehaviorBus
    await this.page.evaluateOnNewDocument(`(${windowBehaviorBusSetup.toString()})()`);
  }

  /**
   * Receives an event that originated in the page and delivers it to local
   * listeners only. It is deliberately not sent back through emit(): the page
   * has already handled and forwarded this event itself, so echoing it would
   * run the page's listeners a second time.
   *
   * @param {object} event - bus event (`{ type, ...fields }`).
   */
  _handlePageEvent(event) {
    super.emit(event.type, event, this, this.page);
  }

  /**
   * Emits an event to local listeners and forwards it to the page.
   *
   * Accepts both `emit(type, event)` and the single-object form
   * `emit({ type, ...fields })`. The forwarded event always carries `type`,
   * which the page-side `_emitToPage` reads.
   *
   * @param {string|object} type - event type, or a whole event object.
   * @param {object} [event] - event fields.
   * @returns {boolean} whether any local listener was registered for the type.
   */
  emit(type, event) {
    const isObjectForm = type !== null && typeof type === 'object';
    const payload = isObjectForm ? { ...type, ...event } : { type, ...event };
    const eventType = isObjectForm ? payload.type : type;

    // call any listeners registered on our own Bus: handler(event, BehaviorBus, page)
    const hadListeners = super.emit(eventType, payload, this, this.page);

    // then forward the event to the window context's bus
    if (this.page) {
      this.page.evaluate((event) => {
        window._emitToPage(event);
      }, payload).catch(console.error);
    }
    return hadListeners;
  }
}
/*********************** Example Behaviors ***********************/ | |
// example: find all the <a href>s on the page and add them to the crawl queue
/**
 * Behavior that reports outgoing links as FOUND_OUTLINK events.
 *
 * Hooks are grouped per execution context under `context` and are called as
 * `handler(event, BehaviorBus, contextObject)`. A `contexts` getter mirrors
 * `context`, so this object can be read the same way as the class-based
 * behaviors in this file, which use a static `contexts` field.
 */
const DiscoverOutlinksBehavior = {
  schema: 'BehaviorSchema@0.1.0',
  context: {
    window: {
      PAGE_CAPTURE: async (event, BehaviorBus, window) => {
        // 'a[href]' skips anchors without an href, whose `.href` is an empty string.
        for (const elem of window.document.querySelectorAll('a[href]')) {
          // emit(type, payload) matches the bus signature used in this file. `type` is
          // repeated inside the payload so the event stays self-describing when forwarded.
          // The DOM element itself is not included: events are forwarded with
          // postMessage, which cannot clone DOM nodes.
          BehaviorBus.emit('FOUND_OUTLINK', { type: 'FOUND_OUTLINK', url: elem.href });
        }
      },
      FOUND_OUTLINK: async (event, BehaviorBus, window) => {
        console.log('DiscoverOutlinksBehavior found a new outlink to add to crawl!', event);
        // browsertrix driver itself would also listen for this event and use it to add URLs to the crawl queue
      },
    },
    puppeteer: {
      // can also optionally implement handlers that run in other contexts (if driver implements that context)
      PAGE_SETUP: async (event, BehaviorBus, page) => {
        await page.setRequestInterception(true);
        page.on('request', (request) => {
          // continue() returns a promise; log a rejection instead of leaving it unhandled
          request.continue().catch(console.error);
          if (request.url().endsWith('.html')) {
            BehaviorBus.emit('FOUND_OUTLINK', { type: 'FOUND_OUTLINK', url: request.url() });
            // consumes/broadcasts events to all contexts using same shared BehaviorBus
            // so the FOUND_OUTLINK handler above would fire even though it's bound in a different context
          }
        });
      },
    },
  },
  /** Alias of `context`, for parity with behaviors that expose `contexts`. */
  get contexts() {
    return this.context;
  },
};
// example: behavior to extract a page's article text content
/**
 * Behavior that, on PAGE_CAPTURE in the window context, reads the page body's
 * text and emits it twice: as an FS_WRITE_FILE request for 'body_text.txt'
 * and as a DISCOVERED_TEXT event.
 */
class ExtractArticleTextBehavior {
  static schema = 'BehaviorSchema@0.1.0';

  static contexts = {
    window: {
      PAGE_CAPTURE: async (event, BehaviorBus, window) => {
        const article_text = window.document.body.innerText;
        // emit(type, payload) matches the bus signature used in this file. `type` is
        // repeated inside the payload so the event stays self-describing when forwarded.
        BehaviorBus.emit('FS_WRITE_FILE', { type: 'FS_WRITE_FILE', path: 'body_text.txt', content: article_text });
        BehaviorBus.emit('DISCOVERED_TEXT', { type: 'DISCOVERED_TEXT', selector: 'body', text: article_text });
        // browsertrix could listen for this to build a full-text-search index in the WARC if it wants
      },
    },
  };
}
// example: behavior to expand comments on reddit, facebook, and github
/**
 * Behavior that opens collapsed <details> sections on PAGE_LOAD, in either
 * the window context or the puppeteer context, when the page is hosted on
 * facebook.com, reddit.com or github.com (including their subdomains).
 */
class ExpandCommentsBehavior {
  static schema = 'BehaviorSchema@0.1.0';

  // private helper methods that behavior can use internally

  /** Opens a single <details>-like element. */
  static _expand = (elem) => { elem.open = true; };

  /**
   * Opens every element of a list. Used with page.$$eval(), which hands the
   * callback the whole array of matches and serializes the callback into the
   * page, so it must not reference anything outside its own body.
   */
  static _expandAll = (elems) => { for (const elem of elems) elem.open = true; };

  /**
   * @param {string} page_url - absolute URL of the current page.
   * @returns {boolean} true when the URL's hostname is one of the supported
   *   sites or a subdomain of one; false otherwise, including for strings
   *   that are not parseable URLs.
   */
  static _shouldRun = (page_url) => {
    let hostname;
    try {
      hostname = new URL(page_url).hostname;
    } catch {
      return false; // not an absolute URL, nothing to match against
    }
    // Compare hostnames rather than substrings, so 'www.reddit.com' matches and a
    // domain that merely appears in a path or query string does not.
    return ['facebook.com', 'reddit.com', 'github.com'].some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
    );
  };

  static contexts = {
    window: {
      PAGE_LOAD: async (event, BehaviorBus, window) => {
        if (!ExpandCommentsBehavior._shouldRun(window.location.href)) return;

        // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
        [...window.document.querySelectorAll('article details')].forEach(ExpandCommentsBehavior._expand);
        [...window.document.querySelectorAll('div.js-discussion details:not(.details-overlay)')].forEach(ExpandCommentsBehavior._expand);
        [...window.document.querySelectorAll('.markdown-body details')].forEach(ExpandCommentsBehavior._expand);
      },
    },
    puppeteer: {
      PAGE_LOAD: async (event, BehaviorBus, page) => {
        if (!ExpandCommentsBehavior._shouldRun(page.url())) return;

        // if driver offers a puppeteer context the behavior can use its extra powers to pierce nested iframes/shadow doms/etc
        await page.$$eval('pierce/article details', ExpandCommentsBehavior._expandAll);
        await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', ExpandCommentsBehavior._expandAll);
        await page.$$eval('pierce/.markdown-body details', ExpandCommentsBehavior._expandAll);
      },
    },
  };
}
/*********************** Example Behavior Driver ***********************/ | |
// The example behaviors defined above, in the order the drivers attach their hooks.
const behaviors = [
  DiscoverOutlinksBehavior,
  ExtractArticleTextBehavior,
  ExpandCommentsBehavior,
];
/**
 * Example behavior driver for the page (window) context.
 *
 * Uses the bus installed at `window.BehaviorBus` (installing it with
 * windowBehaviorBusSetup() when it is missing), registers the driver's own
 * FOUND_OUTLINK and FS_WRITE_FILE handlers, attaches every behavior's
 * window-context hooks, and emits PAGE_LOAD, PAGE_CAPTURE and
 * PAGE_CAPTURE_COMPLETE (5 seconds apart) once the page has loaded.
 *
 * @param {Window} window - the page's window object.
 * @param {Array<object|Function>} behaviors - behaviors exposing their hooks
 *   under `contexts.window` (class style) or `context.window` (object style).
 * @returns {Promise<void>}
 * @throws {Error} when no BehaviorBus can be obtained for `window`.
 */
async function runBehaviorsInWindowContext(window, behaviors) {
  // There is no standalone page-bus class in this file: the bus is created by
  // windowBehaviorBusSetup(), which installs it on the global window.
  if (!window.BehaviorBus) windowBehaviorBusSetup();
  const BehaviorBus = window.BehaviorBus;
  if (!BehaviorBus) {
    throw new Error('runBehaviorsInWindowContext: window.BehaviorBus is not available');
  }

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Registry of files written by behaviors, keyed by path.
  window.behavior_output_files ??= {};

  // driver implements the "standard library" of APIs it exposes to behaviors
  // by defining handler hooks for the common event types that it listens for
  // + it also automatically triggers some events based on page & capture lifecycle
  // (e.g. PAGE_LOAD when the page loads, PAGE_CAPTURE when it's time to archive stuff)
  BehaviorBus.on('FOUND_OUTLINK', async ({ url }, BehaviorBus, window) => {
    window.addLink(url); // e.g. tell browsertrix to add the url to its crawl queue
  });

  // example: when running inside a page context like we are now,
  // behavior driver could implement fs handlers using OPFS
  BehaviorBus.on('FS_WRITE_FILE', async ({ path, content }, BehaviorBus, window) => {
    const opfsRoot = await window.navigator.storage.getDirectory();
    // Name the file after the requested path. NOTE(review): getFileHandle() takes a
    // single file name, so paths containing '/' would need directory handling.
    const fileHandle = await opfsRoot.getFileHandle(path, { create: true });
    // createWritable() is used instead of createSyncAccessHandle() because the sync
    // access handle is only available in dedicated workers and expects binary buffers.
    const writable = await fileHandle.createWritable();
    await writable.write(content);
    await writable.close();
    window.behavior_output_files[path] = fileHandle;
    // browsertrix could add these ^ to warc after
  });

  // attach all the behaviors' event handlers to the shared BehaviorBus
  for (const behavior of behaviors) {
    const hooks = (behavior.contexts ?? behavior.context)?.window ?? {};
    for (const [eventName, handler] of Object.entries(hooks)) {
      BehaviorBus.on(eventName, handler);
    }
  }

  // trigger main archiving events once page has loaded
  const runLifecycle = async () => {
    BehaviorBus.emit('PAGE_LOAD');
    await sleep(5000); // wait extra 5 seconds for animations/slow ajax
    BehaviorBus.emit('PAGE_CAPTURE');
    await sleep(5000);
    BehaviorBus.emit('PAGE_CAPTURE_COMPLETE');
  };
  if (window.document?.readyState === 'complete') {
    // 'load' has already fired and will not fire again: start right away.
    runLifecycle().catch(console.error);
  } else {
    window.addEventListener('load', () => { runLifecycle().catch(console.error); }, { once: true });
  }
}
/**
 * Example behavior driver for the Node / puppeteer context.
 *
 * Launches a browser and crawls a queue seeded with 'https://example.com'.
 * For each URL it opens a page, links a PuppeteerBehaviorBus to it, attaches
 * the driver's FS_WRITE_FILE / FOUND_OUTLINK handlers and every behavior's
 * puppeteer-context hooks, navigates, and waits for PAGE_CAPTURE_COMPLETE.
 * Each page is closed after its capture and the browser is closed when the
 * crawl ends, including when an error is thrown.
 *
 * @param {object} page - unused: the driver opens its own page per URL.
 *   Kept so existing callers do not have to change.
 * @param {Array<object|Function>} behaviors - behaviors exposing their hooks
 *   under `contexts.puppeteer` (class style) or `context.puppeteer` (object style).
 * @returns {Promise<void>}
 */
async function runBehaviorsInNodePuppeteerContext(page, behaviors) {
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const crawl_queue = ['https://example.com'];
  // URLs already queued, so pages that link to each other do not loop the crawl forever.
  const queued = new Set(crawl_queue);

  const browser = await puppeteer.launch();
  try {
    // for...of over an array also visits items pushed during iteration,
    // which is how outlinks discovered below get crawled.
    for (const url of crawl_queue) {
      const crawlPage = await browser.newPage();
      try {
        const BehaviorBus = new PuppeteerBehaviorBus(crawlPage);
        await BehaviorBus.setup();

        // puppeteer running in node can just use node's fs module for file IO
        // NOTE(security): `path` can originate from page-context events, i.e. from
        // untrusted content. A real driver should confine writes to an output directory.
        BehaviorBus.on('FS_WRITE_FILE', async ({ path, content }, BehaviorBus, page) => {
          fs.writeFileSync(path, content);
        });

        // this driver adds new URLs to the crawl queue by just adding them to an array
        BehaviorBus.on('FOUND_OUTLINK', async ({ url: outlink }, BehaviorBus, page) => {
          if (queued.has(outlink)) return;
          queued.add(outlink);
          crawl_queue.push(outlink);
        });

        // attach all the behaviors' event handlers to the shared BehaviorBus
        for (const behavior of behaviors) {
          const hooks = (behavior.contexts ?? behavior.context)?.puppeteer ?? {};
          for (const [eventName, handler] of Object.entries(hooks)) {
            BehaviorBus.on(eventName, handler);
          }
        }

        // Promise that settles when PAGE_CAPTURE_COMPLETE is emitted on the bus
        const pageCaptureComplete = new Promise((resolve) => {
          BehaviorBus.on('PAGE_CAPTURE_COMPLETE', () => resolve());
        });

        // Lifecycle events carry their own `type` so the copy forwarded to the page is well-formed.
        const emitLifecycle = (type) => BehaviorBus.emit(type, { type });

        // trigger main archiving events once page has loaded
        // (once: a later reload or redirect must not restart the lifecycle)
        crawlPage.once('load', () => {
          (async () => {
            emitLifecycle('PAGE_LOAD');
            await sleep(5000); // wait extra 5 seconds for animations/slow ajax
            emitLifecycle('PAGE_CAPTURE');
            await sleep(5000);
            emitLifecycle('PAGE_CAPTURE_COMPLETE');
          })().catch(console.error);
        });

        // navigate to the url and let the page load
        await crawlPage.goto(url);

        // wait for PAGE_CAPTURE_COMPLETE to fire after the page archiving is complete
        await pageCaptureComplete;
      } finally {
        await crawlPage.close();
      }
    }
  } finally {
    await browser.close();
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment