Skip to content

Instantly share code, notes, and snippets.

@pirate
Created March 22, 2024 13:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pirate/9553004f99c0862689386d6c054f5cc7 to your computer and use it in GitHub Desktop.
Save pirate/9553004f99c0862689386d6c054f5cc7 to your computer and use it in GitHub Desktop.
const fs = require('fs');
const path = require('path');
const pathTo2captchaExtension = path.join(__dirname, '2captcha-solver');
const pathToPuppeteerStreamExtension = path.join(__dirname, 'puppeteer-stream-ext');
const { Cluster } = require('puppeteer-cluster');
const puppeteer = require("puppeteer-extra");
// add recaptcha plugin to solve captchas automatically
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha')
// add stealth plugin and use defaults (all evasion techniques)
const stealthPlugin = require("puppeteer-extra-plugin-stealth");
/**
 * Compute a non-negative integer hash of a string using the `31 * hash + code`
 * recurrence (in the style of Java's String#hashCode).
 *
 * The result is made non-negative with Math.abs, and only the shifted term is
 * truncated to 32 bits, so values may differ from Java's for long inputs.
 *
 * @param {string} str - text to hash (iterated by UTF-16 code unit)
 * @returns {number} non-negative hash; 0 for the empty string
 */
function hashCode(str) {
    let accumulator = 0;
    for (let position = 0; position < str.length; position += 1) {
        const codeUnit = str.charCodeAt(position);
        // (x << 5) - x is x * 31 without a multiplication
        const timesThirtyOne = (accumulator << 5) - accumulator;
        accumulator = codeUnit + timesThirtyOne;
    }
    return Math.abs(accumulator);
}
// Output-location helpers. Every artifact for a URL lives in a directory next
// to this script whose name is the numeric hashCode() of that URL.
/** Directory used for all artifacts of `url`. */
const URL_PATH = (url) => path.join(__dirname, `${hashCode(url)}`);
/** Directory used for all artifacts of the URL `page` is currently showing. */
const PAGE_PATH = (page) => URL_PATH(page.url());
// Fixed: the filename used to end in a stray single quote ("screenrecording.webm'").
const SCREENRECORDING_PATH = (page) => `${PAGE_PATH(page)}/screenrecording.webm`;
const SCREENSHOT_PATH = (page) => `${PAGE_PATH(page)}/screenshot.png`;
const PDF_PATH = (page) => `${PAGE_PATH(page)}/output.pdf`;
const HEADERS_PATH = (page) => `${PAGE_PATH(page)}/headers.json`;
// File (relative to the working directory) where cookies and web storage are
// persisted between runs.
const AUTH_JSON_PATH = 'auth.json';

// Token handed to the 2captcha provider of the recaptcha plugin.
const API_KEY_2CAPTCHA = '<your 2Captcha API key here>';

// Default timeout, in milliseconds, applied to pages via setDefaultTimeout().
const DEFAULT_TIMEOUT = 20_000;

// Viewport applied to every page; dimensions match the --window-size Chrome flag.
const DEFAULT_VIEWPORT = {
    width: 1920,
    height: 1080,
    deviceScaleFactor: 1,
    isMobile: false,
    hasTouch: false,
    isLandscape: false,
};
// Command-line switches passed verbatim to Chrome (the cluster is launched with
// ignoreDefaultArgs, so this list is the complete set of arguments).
//
// Notes for maintainers:
//  - Arguments are handed to the browser process as an argv array, not through a
//    shell, so values must NOT be wrapped in quotes (the quotes would become part
//    of the value).
//  - Chrome keeps only the last occurrence of a repeated switch, so all features
//    to disable must live in ONE --disable-features list.
//  - NOTE(review): --remote-debugging-address=0.0.0.0, --disable-web-security and
//    --ignore-certificate-errors weaken security considerably; only run this on a
//    trusted, isolated machine.
const CHROME_ARGS = [
    '--test-type',
    '--remote-debugging-port=9222',
    '--remote-debugging-address=0.0.0.0',
    '--disable-session-crashed-bubble',
    '--hide-crash-restore-bubble',
    '--install-autogenerated-theme=169,32,85',
    '--window-size=1920,1080',
    '--window-position=0,0',
    '--virtual-time-budget=60000',
    '--force-color-profile=srgb',
    '--hide-scrollbars',
    '--deterministic-mode',
    '--allow-pre-commit-input',
    '--js-flags=--random-seed=1157259157',
    // '--use-fake-device-for-media-stream',
    // '--use-fake-ui-for-media-stream',
    // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
    '--disable-speech-synthesis-api',
    '--disable-speech-api',
    '--deny-permission-prompts',
    '--disable-notifications',
    '--disable-desktop-notifications',
    '--noerrdialogs',
    '--disable-popup-blocking',
    '--disable-prompt-on-repost',
    '--silent-debugger-extension-api',
    '--autoplay-policy=no-user-gesture-required',
    '--disable-gesture-requirement-for-media-playback',
    '--block-new-web-contents',
    '--no-first-run',
    '--no-default-browser-check',
    '--disable-default-apps',
    '--ash-no-nudges',
    '--disable-search-engine-choice-screen',
    '--simulate-outdated-no-au=Tue, 31 Dec 2099 23:59:59 GMT',
    '--suppress-message-center-popups',
    '--disable-client-side-phishing-detection',
    '--disable-domain-reliability',
    '--disable-component-update',
    '--disable-datasaver-prompt',
    '--disable-hang-monitor',
    '--no-pings',
    '--safebrowsing-disable-auto-update',
    '--disable-renderer-backgrounding',
    '--disable-software-rasterizer',
    '--disable-partial-raster',
    '--disable-skia-runtime-opts',
    '--disable-breakpad',
    '--disable-background-networking',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-component-extensions-with-background-pages',
    // Single merged list: previously split across two --disable-features switches,
    // of which only the last one took effect.
    `--disable-features=${[
        'Translate',
        'AcceptCHFrame',
        'OptimizationHints',
        'ProcessPerSiteUpToMainFrameThreshold',
        'InterestFeedContentSuggestions',
        'CalculateNativeWinOcclusion',
        'BackForwardCache',
        'HeavyAdPrivacyMitigations',
        'LazyFrameLoading',
        'ImprovedCookieControls',
        'PrivacySandboxSettings4',
        'AutofillServerCommunication',
        'CertificateTransparencyComponentUpdater',
        'DestroyProfileOnBrowserClose',
        'CrashReporting',
        'OverscrollHistoryNavigation',
        'InfiniteSessionRestore',
        'IsolateOrigins',
        'site-per-process',
    ].join(',')}`,
    '--disable-field-trial-config',
    '--disable-ipc-flooding-protection',
    '--disable-extensions-http-throttling',
    '--disable-lazy-loading',
    '--disable-back-forward-cache',
    '--disable-external-intent-requests',
    '--metrics-recording-only',
    '--disable-web-security',
    '--allow-running-insecure-content',
    '--ignore-certificate-errors',
    '--ignore-ssl-errors',
    '--ignore-certificate-errors-spki-list',
    '--export-tagged-pdf',
    '--generate-pdf-document-outline',
    '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    '--user-data-dir=/tmp/chromeprofile',
    '--profile-directory=Default',
    '--disable-cookie-encryption',
    '--disable-sync',
    '--use-mock-keychain',
    '--password-store=basic',
    '--enable-logging=stderr',
    '--v=2',
    '--screenshot',
    `--load-extension=${pathTo2captchaExtension},${pathToPuppeteerStreamExtension}`,
    '--allowlisted-extension-id=gedlohppgooipgobimfihdafnbdhpagn',
    '--allowlisted-extension-id=jjndjgheafjngoipoacpjgeicjeomjli',
    // '--headless=new',
    // problematic: slows down chrome launching or cause other issues
    // '--run-all-compositor-stages-before-draw',
    // '--in-process-gpu',
    // '--enable-automation',
    // '--disable-gpu'
];
// Module-level state (shared by every task running in this process).
let response;
let idx = 0;

// Register the puppeteer-extra plugins: stealth evasions first, then the
// recaptcha plugin configured to use 2captcha as its solving provider.
// https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
const recaptchaPluginOptions = {
    provider: {
        id: '2captcha',
        token: API_KEY_2CAPTCHA,
    },
    visualFeedback: true,
};
puppeteer.use(stealthPlugin());
puppeteer.use(RecaptchaPlugin(recaptchaPluginOptions));
/**
 * Cluster task: open `url`, solve its CAPTCHA, verify the solve by submitting the
 * form, then archive the page (auth state, headers, PDF, screenshot, recording).
 *
 * All per-task state (navigation response, recording streams) is kept in local
 * variables so that tasks running concurrently cannot overwrite each other's
 * values.
 *
 * Errors are logged rather than rethrown. If setup/navigation fails the task
 * returns early; otherwise the recording is always finalized and the page is
 * always navigated to about:blank, whether or not the solve succeeded.
 *
 * @param {object} task
 * @param {object} task.page - page provided by the cluster for this task
 * @param {string} task.data - URL to visit
 * @returns {Promise<void>}
 */
async function solveCaptcha({ page, data: url }) {
    let response = null;
    let recording = {};

    try {
        console.log('[➕] Starting task...', url, '>', URL_PATH(url));
        fs.mkdirSync(URL_PATH(url), {recursive: true});
        const { wss, getStream } = require("puppeteer-stream");
        await setupScreenrecording(page, wss);
        await setupNewPage(page);
        await autoCloseModals(page);
        await loadSessionAuth(page);
        console.log('[🌐] Visiting website URL...', url);
        response = await page.goto(url);
        // Artifacts are stored under the hash of page.url(), which is not the same
        // directory as URL_PATH(url) if the navigation redirected.
        fs.mkdirSync(PAGE_PATH(page), {recursive: true});
        recording = await startScreenrecording(page, getStream);
    } catch (err) {
        console.error(err);
        return;
    }

    let failed = false;
    try {
        await solveCAPTCHAS(page);
        console.log('[☑️] Submitting form to check if CAPTCHA solve worked...', url);
        await page.click('button[type=submit]');
        await page.waitForFunction(
            'document.querySelectorAll("code")[0].innerText.includes(\'"success": true\')',
            {timeout: 30_000},
        );
        console.log('[🧬] CAPTCHA check succeeded, site thinks we are human.', url);
        await saveSessionAuth(page);
        await saveHeaders(page, response);
        // wait/scroll to finish loading dynamic/lazy/slow content
        await wait(10000);
        await scrollDown(page);
        await savePDF(page);
        await saveScreenshot(page);
    } catch (err) {
        failed = true;
        console.error(err);
    }

    // Finalize the recording on both the success and the failure path; a failure
    // here must not prevent the page from being reset below.
    try {
        await saveScreenrecording(recording);
    } catch (err) {
        failed = true;
        console.error(err);
    }

    if (failed) {
        console.log('[❌] Failed. Closing page...', url);
    } else {
        console.log('[✅] Finished. Closing page...', url);
    }
    await page.goto('about:blank');
}
/**
 * Prepare a page before navigation: clear any device-metrics override through a
 * CDP session, then apply the default viewport, a fixed geolocation and the
 * default timeout.
 *
 * @param {object} page - Puppeteer page to configure
 * @returns {Promise<object>} the same page, for chaining
 */
async function setupNewPage(page) {
    const target = page.target();
    const cdpSession = await target.createCDPSession();
    await cdpSession.send('Emulation.clearDeviceMetricsOverride');

    await page.setViewport(DEFAULT_VIEWPORT);

    const fixedLocation = {latitude: 59.95, longitude: 30.31667};
    await page.setGeolocation(fixedLocation);

    page.setDefaultTimeout(DEFAULT_TIMEOUT);
    return page;
}
/**
 * Trigger the CAPTCHA solver widget on the page and wait until it reports success.
 *
 * Waits for the `.captcha-solver` element, clicks it, then waits (up to 180s) for
 * the element to carry `data-state="solved"`.
 *
 * @param {object} page - Puppeteer page currently showing the CAPTCHA
 * @returns {Promise<void>} rejects if either wait times out
 */
async function solveCAPTCHAS(page) {
    const solverButton = '.captcha-solver';
    const solvedButton = `.captcha-solver[data-state="solved"]`;
    const solveTimeout = {timeout: 180_000};

    console.log('[🕑] Waiting for page to finish loading and CAPTCHA to appear...');
    // await page.solveRecaptchas()
    await page.bringToFront();
    await page.waitForSelector(solverButton);

    console.log('[🤖] CAPTCHA finished loading, submitting to 2Captcha for solving...');
    await page.click(solverButton);

    console.log('[🕑] Waiting up to 180s for CAPTCHA to be solved...');
    await page.waitForSelector(solvedButton, solveTimeout);

    console.log('[🧮] CAPTCHA solution retrieved from 2captcha.');
}
/**
 * Register a `dialog` listener on the page that accepts every dialog shortly
 * after it appears.
 *
 * A failure to accept (for example because the dialog was already handled or the
 * page went away during the delay) is logged instead of surfacing as an unhandled
 * promise rejection.
 *
 * @param {object} page - Puppeteer page to watch for dialogs
 * @param {number} [delayMs=1250] - how long to leave the dialog visible before accepting it
 * @returns {Promise<void>}
 */
async function autoCloseModals(page, delayMs = 1250) {
    page.on('dialog', (dialog) => {
        console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`);
        setTimeout(() => {
            // Wrapped so that both synchronous throws and rejections are caught.
            Promise.resolve()
                .then(() => dialog.accept())
                .catch((err) => {
                    console.error('[👆] Failed to auto-close modal:', err);
                });
        }, delayMs);
    });
}
/**
 * Point the page at the screen-recording extension's options page, passing the
 * port of the streaming websocket server in the URL fragment.
 *
 * @param {object} page - Puppeteer page to navigate
 * @param {Promise<object>|object} wss - websocket server (or a promise for it) whose `options.port` is read
 * @returns {Promise<void>}
 */
async function setupScreenrecording(page, wss) {
    console.log('[🎬] Setting up screen recording plugin...');
    const server = await wss;
    const { port } = server.options;
    // streamPage = await (page.browser()).newPage()
    const extensionOptionsUrl = `chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${port}`;
    await page.goto(extensionOptionsUrl);
}
/**
 * Start capturing the page to SCREENRECORDING_PATH(page).
 *
 * The stream and file handles are local to this call and are only handed back
 * through the return value (they used to leak into global scope, where concurrent
 * tasks overwrote each other's handles).
 *
 * @param {object} page - Puppeteer page to record
 * @param {Function} getStream - stream factory, called as `getStream(page, options)`
 * @returns {Promise<{stream: object, streamFile: fs.WriteStream}>} the capture stream and the file it is piped into
 * @throws whatever `getStream` rejects with (the output file is closed first)
 */
async function startScreenrecording(page, getStream) {
    const outputPath = SCREENRECORDING_PATH(page);
    console.log(`[🎬] Starting screen recording stream to ${outputPath}...`);

    const streamFile = fs.createWriteStream(outputPath);
    // Without a listener, a write/open failure would be thrown as an uncaught
    // 'error' event and take the whole process down.
    streamFile.on('error', (err) => {
        console.error(`[🎬] Screen recording file error (${outputPath}):`, err);
    });

    let stream;
    try {
        stream = await getStream(page, {
            audio: true,
            video: true,
            bitsPerSecond: 8000000, // 1080p video
        });
    } catch (err) {
        // Do not leave the file handle open when the capture could not start.
        streamFile.close();
        throw err;
    }

    stream.pipe(streamFile);
    return {stream, streamFile};
}
/**
 * Stop a screen recording and wait for its output file to be closed.
 *
 * Does nothing when either handle is missing (e.g. the recording never started).
 * The destination shown in the log is read from `streamFile.path`; this function
 * has no access to the page, and referencing it here used to throw a
 * ReferenceError every time a recording existed.
 *
 * @param {object} [recording]
 * @param {object} [recording.stream] - capture stream returned by startScreenrecording
 * @param {object} [recording.streamFile] - file stream the capture is piped into
 * @returns {Promise<void>} resolves once the file stream reports it is closed
 */
async function saveScreenrecording({stream, streamFile} = {}) {
    if (!stream || !streamFile) {
        return;
    }
    console.log(`[💾] Saving screen recording video to ${streamFile.path}...`);
    await stream.destroy();
    // close() takes a callback that runs once the file is closed; waiting for it
    // keeps a later process exit from cutting off buffered writes.
    await new Promise((resolve) => {
        streamFile.close(() => resolve());
    });
    // await streamPage.close();
}
/**
 * Capture a full-page screenshot of the page into SCREENSHOT_PATH(page).
 *
 * @param {object} page - Puppeteer page to capture
 * @returns {Promise<void>}
 */
async function saveScreenshot(page) {
    const destination = SCREENSHOT_PATH(page);
    console.log(`[📸] Saving full-page screenshot to ${destination}...`);
    const screenshotOptions = { path: destination, fullPage: true };
    await page.screenshot(screenshotOptions);
}
/**
 * Render the page to a PDF (with outline and tagging requested) at PDF_PATH(page).
 *
 * @param {object} page - Puppeteer page to print
 * @returns {Promise<void>}
 */
async function savePDF(page) {
    const destination = PDF_PATH(page);
    console.log(`[📜] Saving PDF snapshot to ${destination}...`);
    const pdfOptions = { path: destination, outline: true, tagged: true };
    await page.pdf(pdfOptions);
}
/**
 * Persist the browser's cookies plus the current document's sessionStorage and
 * localStorage to AUTH_JSON_PATH as pretty-printed JSON.
 *
 * Cookies are read over CDP (`Network.getAllCookies`) rather than page.cookies().
 * The temporary CDP session is detached afterwards so it does not accumulate.
 *
 * NOTE(review): the resulting file contains session secrets in plain text.
 *
 * @param {object} page - Puppeteer page whose session state is saved
 * @returns {Promise<void>}
 */
async function saveSessionAuth(page) {
    // const cookies = JSON.stringify(await page.cookies()); // doesnt include httponly cookies
    const client = await page.target().createCDPSession();
    let cookies;
    try {
        ({ cookies } = await client.send('Network.getAllCookies'));
    } finally {
        await client.detach().catch((err) => {
            console.error('[🍪] Failed to detach CDP session:', err);
        });
    }

    // Copy the Storage objects into plain objects inside the page so that the
    // value crossing the evaluate() boundary is ordinary serializable data.
    const sessionStorage = await page.evaluate(() => ({ ...window.sessionStorage }));
    const localStorage = await page.evaluate(() => ({ ...window.localStorage }));

    const authBlob = {
        cookies,
        sessionStorage,
        localStorage,
    };
    console.log(`[🍪] Saving cookies/localStorage/sessionStorage to ${AUTH_JSON_PATH}...`, cookies.length);
    fs.writeFileSync(AUTH_JSON_PATH, JSON.stringify(authBlob, null, 4), 'utf-8');
}
/**
 * Restore cookies, sessionStorage and localStorage from AUTH_JSON_PATH into the page.
 *
 * When the file does not exist yet (first run, before anything was saved) this is
 * a no-op instead of an ENOENT error. Sections missing from the file are treated
 * as empty.
 *
 * NOTE(review): the storage entries are written into whatever document the page
 * is showing when this is called — confirm that this is the intended origin.
 *
 * @param {object} page - Puppeteer page to restore the session into
 * @returns {Promise<void>}
 * @throws {SyntaxError} if the file exists but is not valid JSON
 */
async function loadSessionAuth(page) {
    if (!fs.existsSync(AUTH_JSON_PATH)) {
        console.log(`[🍪] No saved session found at ${AUTH_JSON_PATH}, starting with a fresh session.`);
        return;
    }

    const {
        cookies = [],
        sessionStorage = {},
        localStorage = {},
    } = JSON.parse(fs.readFileSync(AUTH_JSON_PATH, 'utf-8'));
    console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${AUTH_JSON_PATH}...`, cookies.length);

    if (cookies.length > 0) {
        await page.setCookie(...cookies);
    }

    // setItem() is used instead of property assignment so that stored keys can
    // never collide with members of the Storage object itself (e.g. "length").
    await page.evaluate((data) => {
        for (const [key, value] of Object.entries(data)) {
            window.sessionStorage.setItem(key, value);
        }
    }, sessionStorage);
    await page.evaluate((data) => {
        for (const [key, value] of Object.entries(data)) {
            window.localStorage.setItem(key, value);
        }
    }, localStorage);
}
/**
 * Smooth-scroll the page downwards in fixed steps so lazy/infinite content gets a
 * chance to load, then scroll back to the top.
 *
 * With the defaults this scrolls at most 15 * 1000px, pausing 3s after each step
 * (about 45s), and then waits 4s at the bottom and 4s again at the top.
 *
 * @param {object} page - Puppeteer page to scroll
 * @param {object} [options]
 * @param {number} [options.maxScrolls=15] - maximum number of scroll steps
 * @param {number} [options.scrollPx=1000] - pixels added to the scroll offset per step
 * @param {number} [options.scrollDelayMs=3000] - pause after each step
 * @param {number} [options.settleDelayMs=4000] - pause at the bottom and again at the top
 * @returns {Promise<number>} the last document.body.scrollHeight recorded by the loop
 */
async function scrollDown(page, {
    maxScrolls = 15,
    scrollPx = 1000,
    scrollDelayMs = 3000,
    settleDelayMs = 4000,
} = {}) {
    const readHeight = () => page.evaluate('document.body.scrollHeight');

    let lastHeight = await readHeight();
    let scrollCount = 0;
    let scrollPosition = 0;

    while (scrollCount < maxScrolls) {
        console.log(`[⬇️] Scrolling down ${scrollCount}x ${scrollPx}px... (${scrollPosition}/${lastHeight})`);
        // smooth-scroll to the current offset, pause, then advance the offset by one step
        await page.evaluate((yOffset) => {
            window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
        }, scrollPosition);
        await wait(scrollDelayMs);
        scrollCount += 1;
        scrollPosition = scrollCount * scrollPx;

        // check if any new content was added / if we are infiniscrolling
        const newHeight = await readHeight();
        const addedPx = newHeight - lastHeight;
        if (addedPx > 0) {
            console.log('[✚] Detected infini-scrolling...', `${lastHeight}+${addedPx} => ${newHeight}`);
        } else if (scrollPosition >= newHeight + scrollPx) {
            // we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
            break;
        }
        lastHeight = newHeight;
    }

    // Always pause at the end for scroll animations / loading / rendering to settle down
    console.log('[📉] Reached bottom of the page.', `(${scrollPosition}/${lastHeight})`);
    await wait(settleDelayMs);
    console.log(`[🔝] Scrolling back up to top. (0000/${lastHeight})`);
    await page.evaluate(() => {
        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
    });
    await wait(settleDelayMs);
    return lastHeight;
}
/**
 * Write a JSON summary of the main navigation to HEADERS_PATH(page): request and
 * response URLs, HTTP method, status, both header sets, page title, favicon URL
 * and page metrics.
 *
 * `response` may be null/undefined (nothing was captured for the navigation); the
 * response-derived fields are then written as null / empty objects instead of
 * throwing.
 *
 * @param {object} page - Puppeteer page that was navigated
 * @param {object|null} response - response object returned by the navigation
 * @returns {Promise<void>}
 */
async function saveHeaders(page, response) {
    const outputPath = HEADERS_PATH(page);
    console.log(`[👾] Saving request/response headers, title, url, favicon to ${outputPath}...`);

    const title = await page.title();
    const favicon = await page.evaluate(() => document.querySelector('link[rel*="icon"]')?.href);
    const request = response?.request();

    const headers = {
        url: request?.url() ?? null,
        // method is a function on the request object: it has to be called, otherwise
        // JSON.stringify silently drops the field.
        method: request?.method() ?? null,
        title,
        response_url: response?.url() ?? null,
        browser_url: page.url(),
        auth: AUTH_JSON_PATH,
        favicon,
        status: response?.status() ?? null,
        statusText: response?.statusText() ?? null,
        request: request?.headers() ?? {},
        response: response?.headers() ?? {},
        metrics: await page.metrics(),
    };
    fs.writeFileSync(outputPath, JSON.stringify(headers, null, 4), 'utf-8');
}
/**
 * Sleep helper: returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
const wait = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});
/**
 * Entry point: launch the puppeteer cluster, queue the demo CAPTCHA pages for
 * solveCaptcha, wait until all tasks are done, then shut down and exit.
 *
 * The cluster is closed even when queueing or waiting fails, and task-level
 * errors reported by the cluster (the `taskerror` event) are logged.
 *
 * @returns {Promise<void>} does not normally return: the process exits with status 0
 */
async function main() {
    console.log('[🎭] Starting puppeteer cluster...');
    const cluster = await Cluster.launch({
        puppeteer,
        maxConcurrency: 3,
        timeout: 240_000,
        monitor: true,
        concurrency: Cluster.CONCURRENCY_PAGE,
        puppeteerOptions: {
            // dumpio: true,
            ignoreDefaultArgs: true,
            args: CHROME_ARGS,
        },
    });

    // Errors thrown (or timeouts hit) inside a task are reported through this event.
    cluster.on('taskerror', (err, data) => {
        console.error('[❌] Cluster task failed:', data, err);
    });

    // console.log('[🎭] Connecting puppeteer to Chromium headless...')
    // const browser = await puppeteer.connect({browserURL: 'http://localhost:9222'})
    const demoUrls = [
        'https://2captcha.com/demo/recaptcha-v2',
        'https://2captcha.com/demo/rotatecaptcha',
        'https://2captcha.com/demo/cloudflare-turnstile',
    ];

    try {
        for (const demoUrl of demoUrls) {
            cluster.queue(demoUrl, solveCaptcha);
        }
        await cluster.idle();
        console.log('[✅] Finished all tasks. Shutting down cluster...');
    } finally {
        // Always release the browser, even if something above threw.
        await cluster.close();
    }
    process.exit(0);
}

main().catch((err) => {
    console.error(err);
    // Report the failure to the caller instead of exiting with status 0.
    process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment