// Skip to content
//
// Instantly share code, notes, and snippets.
//
// @arelra
// Created July 12, 2023 15:54
// Show Gist options
//   • Save arelra/81c687762366888d196ae58e6a86b8dc to your computer and use it in GitHub Desktop.
// Save arelra/81c687762366888d196ae58e6a86b8dc to your computer and use it in GitHub Desktop.
// dcr-puppeteer.js
const puppeteer = require("puppeteer");
const fs = require("fs");
// Named viewport breakpoints. Every entry is a single number except `mobile`,
// which is a two-element array.
// NOTE(review): `mobile` looks like a [width, height] pair (the same 320 / 768
// values appear in writeTemplate) and the rest look like pixel widths — confirm.
const breakpoints = {
  mobile: [320, 768],
  mobileMedium: 375,
  mobileLandscape: 480,
  phablet: 660,
  tablet: 740,
  desktop: 980,
  leftCol: 1140,
  wide: 1300,
};

// Captured once at start-up (milliseconds since the epoch) so that everything
// written during this run can share the same filename suffix.
const timestamp = Date.now();
/**
 * Renders `./index.html` into `./index-<timestamp>.html`, substituting the
 * `%%width%%` and `%%height%%` placeholders.
 *
 * Both paths are relative to the current working directory, and the output
 * name uses the module-level `timestamp`.
 *
 * @param {number} [width=320] - Value written in place of `%%width%%`.
 * @param {number} [height=768] - Value written in place of `%%height%%`.
 * @returns {void}
 * @throws {Error} If the template cannot be read or the output cannot be written.
 */
const writeTemplate = (width = 320, height = 768) => {
  const source = fs.readFileSync("./index.html", "utf8");
  // split/join replaces EVERY occurrence of a placeholder and inserts the
  // value literally; `String.prototype.replace` with a string pattern only
  // touches the first match, leaving later placeholders in the output.
  const rendered = source
    .split("%%width%%")
    .join(String(width))
    .split("%%height%%")
    .join(String(height));
  fs.writeFileSync(`./index-${timestamp}.html`, rendered, "utf8");
};
/**
 * Jumps the window to the bottom of the document and then straight back to
 * the top, as two separate `page.evaluate` calls.
 *
 * @param {object} page - Object exposing an async `evaluate(fn)` method
 *   (a Puppeteer page in this script).
 * @returns {Promise<void>}
 */
const scrollPage = async (page) => {
  // Both callbacks are handed to page.evaluate and use the page's `window`,
  // so they must not close over any variable from this file.
  const jumpToBottom = () => {
    const { scrollHeight } = window.document.body;
    window.scrollTo(0, scrollHeight);
  };
  const jumpToTop = () => {
    window.scrollTo(0, 0);
  };

  await page.evaluate(jumpToBottom);
  await page.evaluate(jumpToTop);
};
/**
 * Walks every `<gu-island>` element on the page, scrolls it into view and
 * waits for it to report `data-gu-ready="true"`.
 *
 * Islands whose `deferuntil` attribute is absent, `"idle"` or `"visible"` are
 * processed; any other value is logged as skipped. The page is scrolled
 * bottom-to-top before and after the walk, followed by a final 5 s pause.
 *
 * @param {object} page - Puppeteer page (uses `$$`, `waitForTimeout` and
 *   `waitForSelector`).
 * @returns {Promise<void>}
 * @throws If an island does not become ready within 30 s (`waitForSelector`
 *   rejects).
 */
const hydrate = async (page) => {
  await scrollPage(page);
  // individually wait for all islands to hydrate
  const islands = await page.$$("gu-island");
  for (const island of islands) {
    // Read both attributes in a single round trip to the browser.
    const { deferuntil, name } = await island.evaluate((el) => ({
      deferuntil: el.getAttribute("deferuntil"),
      name: el.getAttribute("name"),
    }));

    // getAttribute() yields null (never undefined) for a missing attribute,
    // so `== null` is what actually matches islands without `deferuntil`.
    const hydratesOnScroll =
      deferuntil == null || deferuntil === "idle" || deferuntil === "visible";
    if (!hydratesOnScroll) {
      console.log(`Skipping ${name}`);
      continue;
    }

    console.log(`\nScrolling to ${name}`);
    await island.evaluate((el) =>
      el.scrollIntoView({
        behavior: "smooth",
        block: "start",
        inline: "start",
      })
    );

    console.log(`Waiting for ready ${name}`);
    // An island without a name cannot be targeted by the selector below, so
    // its readiness wait is skipped instead of crashing on `null.startsWith`.
    // NOTE(review): MostViewedRight* / DiscussionMeta* are excluded from the
    // readiness wait by the original code; presumably they never become ready
    // in this run — confirm.
    const skipsReadyWait =
      name == null ||
      name.startsWith("MostViewedRight") ||
      name.startsWith("DiscussionMeta");
    if (!skipsReadyWait) {
      await page.waitForTimeout(1000);
      await page.waitForSelector(
        `gu-island[name="${name}"][data-gu-ready="true"]`,
        { timeout: 30000 }
      );
      console.log(`Waiting for a bit ${name}`);
    }
    // Fixed pause after each processed island, ready-checked or not.
    await page.waitForTimeout(2000);
  }
  await scrollPage(page);
  await page.waitForTimeout(5000);
};
/**
 * Visits every `.ad-slot` element in turn: scrolls it into view, pauses for
 * one second, then waits (up to 30 s) for an element matching
 * `.ad-slot--rendered` with the same `data-name` to exist.
 *
 * The page is scrolled bottom-to-top before and after the walk, followed by a
 * final 5 s pause.
 *
 * @param {object} page - Puppeteer page (uses `$$`, `waitForTimeout` and
 *   `waitForSelector`).
 * @returns {Promise<void>}
 */
const hydrateAds = async (page) => {
  await scrollPage(page);

  const adSlots = await page.$$(".ad-slot");
  for (const slot of adSlots) {
    const slotName = await slot.evaluate((node) =>
      node.getAttribute("data-name")
    );

    console.log(`\nScrolling to ${slotName}`);
    await slot.evaluate((node) =>
      node.scrollIntoView({
        behavior: "smooth",
        block: "start",
        inline: "start",
      })
    );

    console.log(`Waiting for ready ${slotName}`);
    await page.waitForTimeout(1000);
    const renderedSelector = `.ad-slot--rendered[data-name="${slotName}"]`;
    await page.waitForSelector(renderedSelector, { timeout: 30000 });
    console.log(`Waiting for a bit ${slotName}`);
  }

  await scrollPage(page);
  await page.waitForTimeout(5000);
};
/**
 * Logs the bounding box and href of every `<a>` on the page whose href starts
 * with `https://www.theguardian.com` and which has a bounding box.
 *
 * Nothing is returned; the results only go to `console.log`.
 *
 * @param {object} page - Puppeteer page (uses `$$`).
 * @returns {Promise<void>}
 */
const getLinkCoords = async (page) => {
  const anchors = await page.$$("a");
  for (const anchor of anchors) {
    const target = await anchor.evaluate((node) => node.getAttribute("href"));
    // Only absolute http(s) links are measured at all.
    if (!target || !target.startsWith("http")) {
      continue;
    }

    const box = await anchor.boundingBox();
    if (!box) {
      continue;
    }

    if (target.startsWith("https://www.theguardian.com")) {
      console.log(box, target);
    }
  }
};
/**
 * Script entry point.
 *
 * Launches a visible Chromium window at a 375x812 viewport, loads the article,
 * clicks the button in the Sourcepoint frame when one is present, waits for
 * islands and ad slots, takes a full-page screenshot, logs Guardian link
 * coordinates and writes the HTML template.
 *
 * The browser is always closed, even when a step fails.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: [
      "--disable-gpu",
      "--disable-dev-shm-usage",
      "--disable-setuid-sandbox",
      "--no-first-run",
      "--no-sandbox",
      "--no-zygote",
      "--deterministic-fetch",
      "--disable-features=IsolateOrigins",
      "--disable-site-isolation-trials",
    ],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 375, height: 812 });
    // Alternative article for manual runs:
    // "https://www.theguardian.com/culture/2023/apr/28/eva-green-wins-high-court-battle-over-collapse-of-sci-fi-film"
    // NOTE(review): `adtest=fixed-puppies` presumably pins test ad creatives — confirm.
    const url =
      "https://www.theguardian.com/food/2023/may/13/breakfasts-from-around-the-world-recipes-udon-strata-chilaquiles-yotam-ottolenghi?adtest=fixed-puppies";
    await page.goto(url, { waitUntil: "networkidle0" });

    // NOTE(review): the Sourcepoint frame is presumably the consent banner and
    // `sp_choice_type_11` its accept button — confirm. The frame is not always
    // there, so a missing frame must not abort the run.
    const consentFrame = page
      .frames()
      .find((candidate) =>
        candidate.url().startsWith("https://sourcepoint.theguardian.com")
      );
    if (consentFrame) {
      await consentFrame.click("button.sp_choice_type_11");
    } else {
      console.log("No Sourcepoint frame found; continuing without clicking it");
    }

    await hydrate(page);
    await hydrateAds(page);
    // waitForNetworkIdle takes an options object; a bare number is silently
    // ignored and the defaults are used instead.
    await page.waitForNetworkIdle({ idleTime: 500 });
    await page.screenshot({
      path: `screenshot-${Date.now()}.jpg`,
      fullPage: true,
    });
    await getLinkCoords(page);
    writeTemplate();
  } finally {
    // Always release the browser process, including on failure.
    await browser.close();
  }
};

main().catch((err) => {
  // Report the failure and exit non-zero instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment