Created
July 12, 2021 14:56
-
-
Save kyletaylored/f9bc5f0d82942687038040b1a5cdacdd to your computer and use it in GitHub Desktop.
Traffic simulator in a Node.js script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "traffic-simulator",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "puppeteer": "^10.1.0",
    "sitemapper": "^3.1.12",
    "user-agents": "^1.0.710"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies
const puppeteer = require('puppeteer')
const UserAgent = require('user-agents');
const Sitemapper = require('sitemapper');

// Pre-generate a pool of 500 random user agents to draw from later.
// Calling a UserAgent instance produces a fresh random agent each time.
const userAgent = new UserAgent();
const userAgents = Array.from({ length: 500 }, () => userAgent());
/**
 * Assign a random identity to the page: 50/50 between a native Puppeteer
 * device descriptor (emulated fully) and a generated random user agent
 * (UA string + viewport set manually).
 * @param {Object} page - Puppeteer Page.
 * @returns {Promise<void>}
 */
let assignRandomAgent = async (page) => {
  // Math.random gives a genuine 50/50 split. The original used
  // Date.now() % 2, whose parity is biased when calls align to a fixed
  // timer interval (this script crawls every 5000 ms).
  if (Math.random() < 0.5) {
    const agent = randomArrayItem(Object.keys(puppeteer.devices));
    console.log("user agent: ", agent);
    const device = puppeteer.devices[agent];
    await page.emulate(device);
  } else {
    const agent = randomArrayItem(userAgents);
    console.log("user agent: ", agent.data.userAgent);
    await page.setUserAgent(agent.data.userAgent, {
      "platform": agent.data.platform,
      "mobile": (agent.data.deviceCategory === 'mobile')
    });
    await page.setViewport({
      "width": agent.data.viewportWidth,
      "height": agent.data.viewportHeight,
      "isMobile": (agent.data.deviceCategory === 'mobile'),
      "isLandscape": (agent.data.screenHeight < agent.data.screenWidth)
    });
  }
}
/**
 * Pick a uniformly random element from an array.
 * @param {Array} arr - Source array (assumed non-empty; an empty array
 *   yields undefined).
 * @returns {*} A randomly selected element.
 */
const randomArrayItem = (arr) => {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
/**
 * Fetch the list of pages to crawl from a sitemap index.
 * @param {string} [url] - Sitemap index URL; defaults to the original
 *   hard-coded target, so existing callers are unaffected.
 * @returns {Promise<Object>} Sitemapper result (contains a `sites` array).
 */
const getPosts = async (url = 'https://www.thepaperpeople.work/sitemap_index.xml') => {
  const sitemap = new Sitemapper();
  // The original wrapped the fetch in a redundant `.then((sites) => sites)`;
  // returning the promise directly is equivalent.
  return sitemap.fetch(url);
}
/**
 * Clear all browser cookies for the given page via the DevTools protocol.
 * @param {Object} page - Puppeteer Page.
 * @returns {Promise<void>}
 */
const clearBrowser = async (page) => {
  const client = await page.target().createCDPSession();
  // The original had a redundant double `await await` here.
  await client.send('Network.clearBrowserCookies');
}
/**
 * Open a fresh page, apply a random user agent, visit each path in order,
 * then clear cookies and close the page.
 * @param {Object} browser - Puppeteer Browser.
 * @param {string[]} paths - URLs to visit sequentially.
 * @returns {Promise<void>}
 */
const visitPage = async (browser, paths) => {
  const page = await browser.newPage();
  try {
    await assignRandomAgent(page);
    // for...of iterates the URL values directly; the original used
    // for...in, which iterates string indices and needed paths[path].
    for (const path of paths) {
      await page.goto(path, { waitUntil: 'networkidle2' }).catch((err) => {
        console.log("err: ", err);
      });
      console.log(await page.title());
    }
    // Await the cleanup; the original left this as a floating promise,
    // so any failure here was silently lost.
    await clearBrowser(page);
  } finally {
    // Close the page — the original never did, leaking one page per
    // crawl cycle (a new crawl starts every 5 seconds).
    await page.close();
  }
  // await page.screenshot({
  //   path: 'full.png',
  //   fullPage: true
  // });
}
/**
 * Generate a random subset of posts: between 1 and 6 entries, chosen
 * independently (so repeats are possible).
 * @param {array} postList - Candidate post URLs.
 * @returns {array} The randomly selected posts.
 */
const getRandomPosts = (postList) => {
  // Crawl between 1 and 6 pages per cycle.
  const limit = Math.floor(Math.random() * 6) + 1;
  const selection = [];
  while (selection.length < limit) {
    selection.push(randomArrayItem(postList));
  }
  return selection;
}
/**
 * Kick off one crawl cycle: visit a random subset of posts and record
 * the in-flight promise.
 * @param object browser - Puppeteer Browser.
 * @param array posts - Full list of post URLs.
 * @param array postPromises - Accumulator for in-flight crawl promises.
 */
const crawlPage = (browser, posts, postPromises) => {
  const selection = getRandomPosts(posts);
  const crawl = visitPage(browser, selection);
  postPromises.push(crawl);
}
// Main: fetch the sitemap, launch a browser, then start one crawl every
// 5 seconds. Runs until the process is killed.
(async () => {
  const postsSitemap = await getPosts();
  const posts = postsSitemap.sites;
  const browser = await puppeteer.launch();
  let postPromises = [];
  let crawlCount = 0;
  setInterval(() => {
    console.log("Crawl Count: ", crawlCount);
    crawlCount++;
    crawlPage(browser, posts, postPromises);
  }, 5000);
  // NOTE(review): the interval is never cleared and the browser is never
  // closed — presumably intentional for a long-running simulator; confirm.
  // await browser.close();
})().catch((err) => {
  // The original IIFE's promise was floating: a failed sitemap fetch or
  // browser launch died as an unhandled rejection. Fail loudly instead.
  console.error("fatal: ", err);
  process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment