Created
March 22, 2024 13:43
-
-
Save pirate/9553004f99c0862689386d6c054f5cc7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const path = require('path'); | |
const pathTo2captchaExtension = path.join(__dirname, '2captcha-solver'); | |
const pathToPuppeteerStreamExtension = path.join(__dirname, 'puppeteer-stream-ext'); | |
const { Cluster } = require('puppeteer-cluster'); | |
const puppeteer = require("puppeteer-extra"); | |
// add recaptcha plugin to solve captchas automatically | |
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha') | |
// add stealth plugin and use defaults (all evasion techniques) | |
const stealthPlugin = require("puppeteer-extra-plugin-stealth"); | |
/**
 * Compute a non-negative numeric hash of a string, using the
 * "multiply by 31 and add the next UTF-16 code unit" scheme.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Absolute value of the accumulated hash (0 for an empty string).
 */
function hashCode(str) {
    let acc = 0;
    let pos = 0;
    while (pos < str.length) {
        // (acc << 5) - acc is acc * 31, with the shift result truncated to 32 bits.
        acc = ((acc << 5) - acc) + str.charCodeAt(pos);
        pos += 1;
    }
    return Math.abs(acc);
}
// Output locations. Every URL gets its own directory next to this script,
// named after the numeric hash of the URL; the per-page helpers derive the
// directory from the page's *current* URL (page.url()).

/** Directory that holds all artifacts for `url`. */
const URL_PATH = (url) => path.join(__dirname, `${hashCode(url)}`);

/** Directory that holds all artifacts for the URL `page` is currently on. */
const PAGE_PATH = (page) => URL_PATH(page.url());

/** Screen recording output file (previously had a stray `'` appended to the extension). */
const SCREENRECORDING_PATH = (page) => path.join(PAGE_PATH(page), 'screenrecording.webm');

/** Full-page screenshot output file. */
const SCREENSHOT_PATH = (page) => path.join(PAGE_PATH(page), 'screenshot.png');

/** PDF snapshot output file. */
const PDF_PATH = (page) => path.join(PAGE_PATH(page), 'output.pdf');

/** JSON file with request/response headers and page metadata. */
const HEADERS_PATH = (page) => path.join(PAGE_PATH(page), 'headers.json');
// File (relative to the working directory) where cookies and web storage
// are persisted between runs.
const AUTH_JSON_PATH = 'auth.json';

// Token handed to the recaptcha plugin's 2captcha provider.
const API_KEY_2CAPTCHA = '<your 2Captcha API key here>';

// Default puppeteer timeout, in milliseconds.
const DEFAULT_TIMEOUT = 20_000;

// Viewport applied to every new page: a 1920x1080 non-mobile, non-touch display.
const DEFAULT_VIEWPORT = {
    width: 1920,
    height: 1080,
    deviceScaleFactor: 1,
    isMobile: false,
    hasTouch: false,
    isLandscape: false,
};
// Chrome features to turn off. They are kept in ONE list because Chrome only
// honors the last occurrence of a repeated switch: the original passed
// `--disable-features` twice, so the second (IsolateOrigins,site-per-process)
// silently replaced this whole first list.
const CHROME_DISABLED_FEATURES = [
    'Translate',
    'AcceptCHFrame',
    'OptimizationHints',
    'ProcessPerSiteUpToMainFrameThreshold',
    'InterestFeedContentSuggestions',
    'CalculateNativeWinOcclusion',
    'BackForwardCache',
    'HeavyAdPrivacyMitigations',
    'LazyFrameLoading',
    'ImprovedCookieControls',
    'PrivacySandboxSettings4',
    'AutofillServerCommunication',
    'CertificateTransparencyComponentUpdater',
    'DestroyProfileOnBrowserClose',
    'CrashReporting',
    'OverscrollHistoryNavigation',
    'InfiniteSessionRestore',
    'IsolateOrigins',
    'site-per-process',
];

// Full Chrome command line. It is passed with `ignoreDefaultArgs: true`, so
// this list is everything Chrome receives from puppeteer's launcher.
// The args are handed to the browser process directly (no shell), so values
// must NOT be wrapped in quotes: the quotes would become part of the value.
const CHROME_ARGS = [
    '--test-type',
    '--remote-debugging-port=9222',
    // NOTE(review): binds the DevTools port on all interfaces; anyone who can
    // reach this host on :9222 can drive the browser. Confirm this is intended.
    '--remote-debugging-address=0.0.0.0',
    '--disable-session-crashed-bubble',
    '--hide-crash-restore-bubble',
    '--install-autogenerated-theme=169,32,85',
    '--window-size=1920,1080',
    '--window-position=0,0',
    '--virtual-time-budget=60000',
    '--force-color-profile=srgb',
    '--hide-scrollbars',
    '--deterministic-mode',
    '--allow-pre-commit-input',
    '--js-flags=--random-seed=1157259157',
    // '--use-fake-device-for-media-stream',
    // '--use-fake-ui-for-media-stream',
    // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
    '--disable-speech-synthesis-api',
    '--disable-speech-api',
    '--deny-permission-prompts',
    '--disable-notifications',
    '--disable-desktop-notifications',
    '--noerrdialogs',
    '--disable-popup-blocking',
    '--disable-prompt-on-repost',
    '--silent-debugger-extension-api',
    '--autoplay-policy=no-user-gesture-required',
    '--disable-gesture-requirement-for-media-playback',
    '--block-new-web-contents',
    '--no-first-run',
    '--no-default-browser-check',
    '--disable-default-apps',
    '--ash-no-nudges',
    '--disable-search-engine-choice-screen',
    '--simulate-outdated-no-au=Tue, 31 Dec 2099 23:59:59 GMT',
    '--suppress-message-center-popups',
    '--disable-client-side-phishing-detection',
    '--disable-domain-reliability',
    '--disable-component-update',
    '--disable-datasaver-prompt',
    '--disable-hang-monitor',
    '--no-pings',
    '--safebrowsing-disable-auto-update',
    '--disable-renderer-backgrounding',
    '--disable-software-rasterizer',
    '--disable-partial-raster',
    '--disable-skia-runtime-opts',
    '--disable-breakpad',
    '--disable-background-networking',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-component-extensions-with-background-pages',
    `--disable-features=${CHROME_DISABLED_FEATURES.join(',')}`,
    '--disable-field-trial-config',
    '--disable-ipc-flooding-protection',
    '--disable-extensions-http-throttling',
    '--disable-lazy-loading',
    '--disable-back-forward-cache',
    '--disable-external-intent-requests',
    '--metrics-recording-only',
    '--disable-web-security',
    '--allow-running-insecure-content',
    '--ignore-certificate-errors',
    '--ignore-ssl-errors',
    '--ignore-certificate-errors-spki-list',
    '--export-tagged-pdf',
    '--generate-pdf-document-outline',
    '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    '--user-data-dir=/tmp/chromeprofile',
    '--profile-directory=Default',
    '--disable-cookie-encryption',
    '--disable-sync',
    '--use-mock-keychain',
    '--password-store=basic',
    '--enable-logging=stderr',
    '--v=2',
    '--screenshot',
    `--load-extension=${pathTo2captchaExtension},${pathToPuppeteerStreamExtension}`,
    // NOTE(review): this switch is also repeated; presumably only the last ID
    // takes effect. Order is kept as in the original - confirm which extension
    // actually needs to be allowlisted.
    `--allowlisted-extension-id=gedlohppgooipgobimfihdafnbdhpagn`,
    `--allowlisted-extension-id=jjndjgheafjngoipoacpjgeicjeomjli`,
    // '--headless=new',
    // problematic: slows down chrome launching or cause other issues
    // '--run-all-compositor-stages-before-draw',
    // '--in-process-gpu',
    // '--enable-automation',
    // '--disable-gpu'
];
// Module-level scratch state.
// `response` is assigned by solveCaptcha with the result of page.goto().
let response;
// `idx` is not referenced anywhere else in the visible code.
let idx = 0;

// Register the puppeteer-extra plugins before any browser is launched.
// https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
puppeteer.use(stealthPlugin());
puppeteer.use(
    RecaptchaPlugin({
        provider: {
            id: '2captcha',
            token: API_KEY_2CAPTCHA,
        },
        visualFeedback: true,
    }),
);
/**
 * puppeteer-cluster task: visit `url`, solve the CAPTCHA on it, verify the
 * solve by submitting the page's form, then archive the page (session auth,
 * headers, PDF, screenshot, screen recording).
 *
 * Errors are logged rather than rethrown; the function always resolves.
 *
 * @param {object} task
 * @param {object} task.page - Puppeteer page supplied by the cluster.
 * @param {string} task.data - URL to visit.
 * @returns {Promise<void>}
 */
async function solveCaptcha({ page, data: url }) {
    // Per-task state. These used to live in module/implicit globals, which
    // concurrent tasks would overwrite for each other.
    let response;
    let recording = {};

    // Phase 1: prepare the page and navigate. Nothing to clean up on failure.
    try {
        console.log('[➕] Starting task...', url, '>', URL_PATH(url));
        fs.mkdirSync(URL_PATH(url), {recursive: true});
        const { wss, getStream } = require("puppeteer-stream");
        await setupScreenrecording(page, wss);
        await setupNewPage(page);
        await autoCloseModals(page);
        // NOTE(review): at this point the page is still on the extension
        // options page opened by setupScreenrecording, so the storage writes
        // done by loadSessionAuth land on that origin - confirm this is intended.
        await loadSessionAuth(page);
        console.log('[🌐] Visiting website URL...', url);
        response = await page.goto(url);
        recording = await startScreenrecording(page, getStream);
    } catch (err) {
        console.error(err);
        return;
    }

    // Phase 2: solve, verify and archive. The recording is always finalized
    // and the page always reset, whether or not this phase succeeds.
    let succeeded = false;
    try {
        await solveCAPTCHAS(page);
        console.log('[☑️] Submitting form to check if CAPTCHA solve worked...', url);
        await page.click('button[type=submit]');
        await page.waitForFunction(
            'document.querySelectorAll("code")[0].innerText.includes(\'"success": true\')',
            {timeout: 30_000},
        );
        console.log('[🧬] CAPTCHA check succeeded, site thinks we are human.', url);
        await saveSessionAuth(page);
        await saveHeaders(page, response);
        // wait/scroll to finish loading dynamic/lazy/slow content
        await wait(10000);
        await scrollDown(page);
        await savePDF(page);
        await saveScreenshot(page);
        succeeded = true;
    } catch (err) {
        console.error(err);
    } finally {
        try {
            await saveScreenrecording(recording);
        } catch (err) {
            // A failure to finalize the recording must not skip the page reset below.
            console.error(err);
        }
        console.log(succeeded ? '[✅] Finished. Closing page...' : '[❌] Failed. Closing page...', url);
        await page.goto('about:blank');
    }
}
/**
 * Apply the default page configuration: clear any device-metrics override
 * over CDP, then set the viewport, a fixed geolocation and the default timeout.
 *
 * @param {object} page - Puppeteer page to configure.
 * @returns {Promise<object>} The same page, for chaining.
 */
async function setupNewPage(page) {
    const cdp = await page.target().createCDPSession();
    await cdp.send('Emulation.clearDeviceMetricsOverride');

    await page.setViewport(DEFAULT_VIEWPORT);

    const fixedLocation = {latitude: 59.95, longitude: 30.31667};
    await page.setGeolocation(fixedLocation);

    page.setDefaultTimeout(DEFAULT_TIMEOUT);
    return page;
}
/**
 * Trigger the in-page `.captcha-solver` control and wait until it reports
 * that the CAPTCHA has been solved.
 *
 * @param {object} page - Puppeteer page showing the CAPTCHA.
 * @returns {Promise<void>} Resolves once the control has data-state="solved";
 *   rejects if either selector wait times out.
 */
async function solveCAPTCHAS(page) {
    const solverControl = '.captcha-solver';
    const solvedControl = `${solverControl}[data-state="solved"]`;

    console.log('[🕑] Waiting for page to finish loading and CAPTCHA to appear...');
    // await page.solveRecaptchas()
    await page.bringToFront();
    await page.waitForSelector(solverControl);

    console.log('[🤖] CAPTCHA finished loading, submitting to 2Captcha for solving...');
    await page.click(solverControl);

    console.log('[🕑] Waiting up to 180s for CAPTCHA to be solved...');
    await page.waitForSelector(solvedControl, {timeout: 180_000});

    console.log('[🧮] CAPTCHA solution retrieved from 2captcha.');
}
/**
 * Register a handler that automatically accepts any JavaScript dialog
 * (alert/confirm/prompt/beforeunload) the page opens, after a short delay.
 *
 * @param {object} page - Puppeteer page to watch.
 * @param {number} [delayMs=1250] - How long to leave the dialog open before accepting it.
 * @returns {Promise<void>}
 */
async function autoCloseModals(page, delayMs = 1250) {
    page.on('dialog', (dialog) => {
        console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`);
        setTimeout(() => {
            // accept() can fail (e.g. the dialog was already handled). Nothing
            // awaits this timer callback, so the failure has to be handled here
            // or it becomes an unhandled rejection.
            Promise.resolve()
                .then(() => dialog.accept())
                .catch((err) => {
                    console.error('[👆] Failed to auto-close modal:', err);
                });
        }, delayMs);
    });
}
/**
 * Point the page at the recording extension's options page, passing the
 * websocket server port in the URL fragment.
 *
 * @param {object} page - Puppeteer page to navigate.
 * @param {Promise<object>|object} wss - Websocket server (or a promise for it)
 *   exposing `options.port`.
 * @returns {Promise<void>}
 */
async function setupScreenrecording(page, wss) {
    console.log('[🎬] Setting up screen recording plugin...');
    const server = await wss;
    const { port } = server.options;
    // streamPage = await (page.browser()).newPage()
    const extensionOptionsUrl = `chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${port}`;
    await page.goto(extensionOptionsUrl);
}
/**
 * Start capturing the page's audio/video and pipe it into the page's
 * screen recording file.
 *
 * @param {object} page - Puppeteer page to record.
 * @param {Function} getStream - Stream factory; called as `getStream(page, options)`.
 * @returns {Promise<{stream: object, streamFile: object}>} The capture stream
 *   and the file write stream it is piped into.
 */
async function startScreenrecording(page, getStream) {
    const outputPath = SCREENRECORDING_PATH(page);
    console.log(`[🎬] Starting screen recording stream to ${outputPath}...`);

    // The path is derived from the page's current URL, which is not necessarily
    // the URL a directory was created for earlier, so make sure it exists.
    fs.mkdirSync(path.dirname(outputPath), {recursive: true});

    const streamFile = fs.createWriteStream(outputPath);
    let stream;
    try {
        stream = await getStream(page, {
            audio: true,
            video: true,
            bitsPerSecond: 8000000, // 1080p video
        });
    } catch (err) {
        // Don't leak the open file handle when the capture can't start.
        streamFile.destroy();
        throw err;
    }
    stream.pipe(streamFile);

    // Backward compatibility: this function used to assign to the undeclared
    // names `stream` and `streamFile`, creating globals, and solveCaptcha's
    // cleanup path read them that way. Keep publishing them (explicitly) so
    // that caller still works; prefer the return value, since these globals
    // are shared by all concurrently running tasks.
    globalThis.stream = stream;
    globalThis.streamFile = streamFile;

    return {stream, streamFile};
}
/**
 * Stop a running screen recording and close its output file.
 *
 * Does nothing when either handle is missing (e.g. recording never started).
 *
 * @param {object} [recording]
 * @param {object} [recording.stream] - Capture stream to destroy.
 * @param {object} [recording.streamFile] - File write stream to close.
 * @returns {Promise<void>} Resolves after the file stream has closed.
 */
async function saveScreenrecording({stream, streamFile} = {}) {
    if (!stream || !streamFile) {
        return;
    }
    // Log the path from the write stream itself; this function has no `page`
    // in scope (the previous version referenced one and threw a ReferenceError).
    console.log(`[💾] Saving screen recording video to ${streamFile.path}...`);
    await stream.destroy();
    // Wait for the file to actually close so buffered data is on disk before
    // the caller moves on (the script ends with process.exit()).
    await new Promise((resolve) => streamFile.close(resolve));
    // await streamPage.close();
}
/**
 * Write a full-page screenshot of the page to its screenshot path.
 *
 * @param {object} page - Puppeteer page to capture.
 * @returns {Promise<void>}
 */
async function saveScreenshot(page) {
    const destination = SCREENSHOT_PATH(page);
    console.log(`[📸] Saving full-page screenshot to ${destination}...`);
    const captureOptions = { path: destination, fullPage: true };
    await page.screenshot(captureOptions);
}
/**
 * Write a PDF snapshot of the page (with outline and tagging requested)
 * to its PDF path.
 *
 * @param {object} page - Puppeteer page to print.
 * @returns {Promise<void>}
 */
async function savePDF(page) {
    const destination = PDF_PATH(page);
    console.log(`[📜] Saving PDF snapshot to ${destination}...`);
    const printOptions = { path: destination, outline: true, tagged: true };
    await page.pdf(printOptions);
}
/**
 * Persist the browser's cookies plus the page's sessionStorage and
 * localStorage to AUTH_JSON_PATH as pretty-printed JSON.
 *
 * @param {object} page - Puppeteer page whose session should be saved.
 * @returns {Promise<void>}
 */
async function saveSessionAuth(page) {
    // Cookies are fetched over CDP instead of page.cookies():
    // const cookies = JSON.stringify(await page.cookies()); // doesnt include httponly cookies
    const cdp = await page.target().createCDPSession();
    const { cookies } = await cdp.send('Network.getAllCookies');

    // These callbacks run inside the page, where the names refer to the
    // browser's storage objects.
    const sessionEntries = await page.evaluate(() => sessionStorage);
    const localEntries = await page.evaluate(() => localStorage);

    console.log(`[🍪] Saving cookies/localStorage/sessionStorage to ${AUTH_JSON_PATH}...`, Object.keys(cookies).length);
    const serialized = JSON.stringify(
        {
            cookies,
            sessionStorage: sessionEntries,
            localStorage: localEntries,
        },
        null,
        4,
    );
    fs.writeFileSync(AUTH_JSON_PATH, serialized, 'utf-8');
}
/**
 * Restore cookies, sessionStorage and localStorage from AUTH_JSON_PATH.
 *
 * When the file does not exist yet (it is only written after a successful
 * run) this logs a message and returns without touching the page, instead of
 * aborting the task. Any other read/parse error is still thrown.
 *
 * @param {object} page - Puppeteer page to restore the session into.
 * @returns {Promise<void>}
 * @throws If the file exists but cannot be read or is not valid JSON.
 */
async function loadSessionAuth(page) {
    let saved;
    try {
        saved = JSON.parse(fs.readFileSync(AUTH_JSON_PATH, 'utf-8'));
    } catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
        console.log(`[🍪] No saved session found at ${AUTH_JSON_PATH}, starting with a fresh session.`);
        return;
    }

    // Tolerate a partially filled file: missing sections restore nothing.
    const {
        cookies = [],
        sessionStorage: savedSessionStorage = {},
        localStorage: savedLocalStorage = {},
    } = saved ?? {};

    console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${AUTH_JSON_PATH}...`, Object.keys(cookies).length);
    if (cookies.length > 0) {
        await page.setCookie(...cookies);
    }

    // The callbacks below run inside the page; `sessionStorage`/`localStorage`
    // there are the browser's storage objects for whatever origin the page is
    // on at the time of the call.
    await page.evaluate((data) => {
        for (const [key, value] of Object.entries(data)) {
            sessionStorage[key] = value;
        }
    }, savedSessionStorage);
    await page.evaluate((data) => {
        for (const [key, value] of Object.entries(data)) {
            localStorage[key] = value;
        }
    }, savedLocalStorage);
}
/**
 * Smooth-scroll the page downwards in fixed steps so lazy content gets a
 * chance to load, then scroll back to the top.
 *
 * Scrolls at most 15 times, 1000px per step with a 3000ms pause after each;
 * stops early once the target offset is a full step past the page height and
 * the page did not grow during that step.
 *
 * @param {object} page - Puppeteer page to scroll.
 * @returns {Promise<number>} The last recorded `document.body.scrollHeight`.
 */
async function scrollDown(page) {
    const MAX_STEPS = 15; // 15 * (1000px every 3s) = 15,000px scroll maximum over 45sec
    const STEP_PX = 1000;
    const STEP_DELAY_MS = 3000;
    const readHeight = () => page.evaluate('document.body.scrollHeight');

    let knownHeight = await readHeight();
    let targetOffset = 0;

    for (let step = 1; step <= MAX_STEPS; step += 1) {
        console.log(`[⬇️] Scrolling down ${step - 1}x 1000px... (${targetOffset}/${knownHeight})`);
        // smooth-scroll to the current offset, pause, then advance the offset by one step
        await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, targetOffset);
        await wait(STEP_DELAY_MS);
        targetOffset = step * STEP_PX;

        // check if any new content was added / if we are infiniscrolling
        const currentHeight = await readHeight();
        const grownBy = currentHeight - knownHeight;
        if (grownBy > 0) {
            console.log('[✚] Detected infini-scrolling...', `${knownHeight}+${grownBy} => ${currentHeight}`);
        } else if (targetOffset >= currentHeight + STEP_PX) {
            // we've reached the bottom; only true once we've tried to go a full step past the end
            break;
        }
        knownHeight = currentHeight;
    }

    // Pause at the bottom so scroll animations / loading / rendering can settle
    console.log('[📉] Reached bottom of the page.', `(${targetOffset}/${knownHeight})`);
    await wait(4000);
    console.log(`[🔝] Scrolling back up to top. (0000/${knownHeight})`);
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); });
    await wait(4000);
    return knownHeight;
}
/**
 * Write request/response headers and basic page metadata (title, URLs,
 * favicon, status, metrics) for the page to its headers JSON file.
 *
 * @param {object} page - Puppeteer page that was loaded.
 * @param {object} response - Puppeteer HTTP response returned by the navigation.
 * @returns {Promise<void>}
 */
async function saveHeaders(page, response) {
    const outputPath = HEADERS_PATH(page);
    console.log(`[👾] Saving request/response headers, title, url, favicon to ${outputPath}...`);

    const request = response.request();
    const title = await page.title();
    const favicon = await page.evaluate(() => document.querySelector('link[rel*="icon"]')?.href);

    const headers = {
        url: request.url(),
        // method() must be called: storing the function itself made
        // JSON.stringify drop the field entirely.
        method: request.method(),
        title,
        response_url: response.url(),
        browser_url: page.url(),
        auth: AUTH_JSON_PATH,
        favicon,
        status: response.status(),
        statusText: response.statusText(),
        request: request.headers(),
        response: response.headers(),
        metrics: await page.metrics(),
    };
    fs.writeFileSync(outputPath, JSON.stringify(headers, null, 4), 'utf-8');
}
/**
 * Sleep helper: returns a promise that resolves (with no value) after `ms`
 * milliseconds.
 */
const wait = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});
/**
 * Entry point: launch a puppeteer cluster, queue the 2captcha demo pages for
 * solveCaptcha, wait for all tasks to finish, then shut down and exit.
 *
 * @returns {Promise<void>} Does not normally return: the process exits with
 *   code 0 once the cluster has been closed.
 */
async function main() {
    console.log('[🎭] Starting puppeteer cluster...');
    const cluster = await Cluster.launch({
        puppeteer,
        maxConcurrency: 3,
        timeout: 240_000,
        monitor: true,
        concurrency: Cluster.CONCURRENCY_PAGE,
        puppeteerOptions: {
            // dumpio: true,
            ignoreDefaultArgs: true,
            args: CHROME_ARGS,
        }
    });

    // Without a listener, a task that throws or hits the cluster timeout
    // fails without any output.
    cluster.on('taskerror', (err, data) => {
        console.error('[❌] Task failed:', data, err);
    });

    // console.log('[🎭] Connecting puppeteer to Chromium headless...')
    // const browser = await puppeteer.connect({browserURL: 'http://localhost:9222'})
    try {
        cluster.queue('https://2captcha.com/demo/recaptcha-v2', solveCaptcha);
        cluster.queue('https://2captcha.com/demo/rotatecaptcha', solveCaptcha);
        cluster.queue('https://2captcha.com/demo/cloudflare-turnstile', solveCaptcha);
        await cluster.idle();
        console.log('[✅] Finished all tasks. Shutting down cluster...');
    } finally {
        // Always release the browser, even if queueing/idling failed.
        await cluster.close();
    }
    process.exit(0);
}

main().catch((err) => {
    console.error(err);
    // Report the failure through the exit code instead of leaving it at 0.
    process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment