Skip to content

Instantly share code, notes, and snippets.

@xrip
Forked from prescience-data/strip-page.ts
Created July 21, 2023 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xrip/b826b9eadc3de5ed6a47a4ad4815ac2d to your computer and use it in GitHub Desktop.
Save xrip/b826b9eadc3de5ed6a47a4ad4815ac2d to your computer and use it in GitHub Desktop.
Strip Page With Puppeteer
import { Buffer } from "buffer"
import { createHash } from "crypto"
import { HTTPRequest, Protocol } from "puppeteer-core"
import { isPuppeteerPage, Page } from "../types"
// Shorthand for the CDP `DOMSnapshot.captureSnapshot` response payload.
type CaptureSnapshotResponse = Protocol.DOMSnapshot.CaptureSnapshotResponse
// Async callback that receives a puppeteer request; used in this file as a "requestfinished" listener.
type RequestHook = (request: HTTPRequest) => Promise<void>
// A single image captured from a finished network request.
interface ImageSnapshot {
// md5 hex digest of `url`; also the key under which the image is stored in the image map.
hash: string
// Full request url the image was loaded from.
url: string
// Response body encoded as a base64 string.
data: string
}
// Everything collected for one page load: the requested url, the DOM snapshot and the captured images.
interface PageSnapshot {
// The url that was passed to `stripPage`.
url: string
// Raw CDP DOM snapshot as returned by `captureSnapshot`.
dom: CaptureSnapshotResponse
// Captured images keyed by the md5 hash of their url.
images: Map<string, ImageSnapshot>
}
// File extensions (lowercase, with leading dot) that are treated as images.
const IMAGE_EXTENSIONS: readonly string[] = [".jpg", ".jpeg", ".png", ".gif"]
// Captures the first 3-4 character extension that is directly followed by
// the end of the string or by the start of a query string.
const EXTENSION_PATTERN = /(\.\w{3,4})(?:$|\?)/
/**
 * Determines if a value is an image url.
 *
 * The extension is taken from the capture group rather than the whole match,
 * so a trailing `?` from a query string is not part of the compared value
 * (`a.jpg?width=100` is an image). The comparison is case-insensitive, so
 * `A.PNG` is an image as well.
 *
 * @param {string} value - Url (or path) to inspect.
 * @return {boolean} `true` when the extension is one of jpg, jpeg, png or gif.
 */
const isImage = (value: string): boolean => {
  const extension = EXTENSION_PATTERN.exec(value)?.[1]?.toLowerCase()
  return extension !== undefined && IMAGE_EXTENSIONS.includes(extension)
}
/**
 * Computes the md5 digest of a string (used here to key images by their url).
 *
 * @param {string} value - Text to hash.
 * @return {string} The digest as a lowercase hex string.
 */
const md5 = (value: string): string => {
  const hasher = createHash("md5")
  hasher.update(value)
  return hasher.digest("hex")
}
/**
 * Takes a complete snapshot of the page's DOM through the Chrome DevTools Protocol.
 * Note: the protocol returns a `DOMSnapshot` as a table-style structure,
 * so it may need to be reconstituted before being persisted.
 *
 * @see https://chromedevtools.github.io/devtools-protocol/tot/DOMSnapshot/#method-captureSnapshot
 * @param {Page} page - Page whose CDP client is used to issue the commands.
 * @return {Promise<CaptureSnapshotResponse>} The raw snapshot response.
 * @throws {Error} When the capture command resolves without a snapshot.
 */
const captureSnapshot = async (
  page: Page
): Promise<CaptureSnapshotResponse> => {
  // Issue both raw CDP commands up front, then wait for them together.
  const enabling = page.client().send(`DOMSnapshot.enable`)
  const capturing = page.client().send(`DOMSnapshot.captureSnapshot`, {
    computedStyles: [], // List computed style names here to have them captured.
    includeDOMRects: false,
    includePaintOrder: false
  })
  const results = await Promise.all([enabling, capturing])
  // Only the capture result matters; the enable command's result is discarded.
  const snapshot = results[1]
  if (snapshot) {
    return snapshot
  }
  throw new Error(`Failed to capture snapshot.`)
}
/**
 * Builds a request listener that stores every image response in the provided
 * map as a base64 string, keyed by the md5 hash of the request url.
 *
 * Capturing is best-effort: the returned hook is meant to run as an event
 * listener, where a rejected promise has no caller to handle it. A response
 * whose body cannot be read is therefore skipped instead of rejecting.
 *
 * @param {Map<string, ImageSnapshot>} images - Map that receives the captured images.
 * @return {RequestHook} Listener to attach to the page's "requestfinished" event.
 */
const captureImages =
  (images: Map<string, ImageSnapshot>): RequestHook =>
  async (request: HTTPRequest): Promise<void> => {
    const url: string = request.url()
    // Cheap extension check first so non-image requests are never hashed.
    if (!isImage(url)) {
      return
    }
    const hash: string = md5(url)
    if (images.has(hash)) {
      return
    }
    let buffer: Buffer | undefined
    try {
      buffer = await (await request.response())?.buffer()
    } catch {
      // The body could not be read; skip this image rather than let the
      // listener produce an unhandled promise rejection.
      return
    }
    // Re-check after the await: a concurrent request for the same url may
    // have stored the image in the meantime.
    if (buffer && !images.has(hash)) {
      images.set(hash, {
        hash,
        url,
        data: buffer.toString("base64")
      })
    }
  }
/**
 * Captures a snapshot of specified url including DOM and images (as base64 strings).
 *
 * The image listener is attached only for the duration of this call and is
 * always detached again, even when navigation or the DOM capture fails, so
 * repeated calls on the same page do not accumulate listeners. Image captures
 * that are still reading their response body are awaited before returning,
 * so the returned map is not missing in-flight images.
 *
 * @param {Page} page - Page used to load the url.
 * @param {string} url - Url to navigate to.
 * @return {Promise<PageSnapshot>} The url, its DOM snapshot and the captured images.
 */
export const stripPage = async (
  page: Page,
  url: string
): Promise<PageSnapshot> => {
  // Create a map of all images received by the page.
  const images: Map<string, ImageSnapshot> = new Map<string, ImageSnapshot>()
  const capture: RequestHook = captureImages(images)
  // Track every capture so it can be awaited before the map is returned.
  const pending: Promise<unknown>[] = []
  const onRequestFinished = (request: HTTPRequest): void => {
    // Image capture is best-effort: a failed capture must neither fail the
    // page snapshot nor surface as an unhandled promise rejection.
    pending.push(capture(request).catch(() => undefined))
  }
  // Listen to all finished requests and capture images to the image map.
  // A similar approach can be implemented for any other asset type, however the DOMSnapshot has the ability to inline styles.
  page.on("requestfinished", onRequestFinished)
  let dom: CaptureSnapshotResponse
  try {
    // Load the intended url.
    await page.goto(url, {
      waitUntil: ["domcontentloaded", "networkidle2"]
    })
    // Capture a DOM Snapshot lookup table (for later recomposition via async queue worker).
    dom = await captureSnapshot(page)
  } finally {
    // Detach the listener so later navigations on this page do not keep
    // writing into this call's image map.
    page.off("requestfinished", onRequestFinished)
  }
  // Wait for image captures that were still reading their response body.
  await Promise.all(pending)
  // Return a complete object to save in database.
  return {
    url,
    dom,
    images
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment