import Apify from 'apify'
import Bluebird from 'bluebird'
import _ from 'lodash'

// site1/site2/site3 below are site scraper modules shaped like the
// { init, list, detail } export at the bottom of this gist (import paths omitted).

class MultiSiteManager {
  sites: any = {}
  isReady = false

  constructor(configs) {
    // Index the site configs by name, each with its own request buffer
    // and a counter of pages currently in flight for that site.
    this.sites = configs.reduce((all, config) => ({
      ...all,
      [config.name]: {
        name: config.name,
        scraper: config.scraper,
        maxConcurrency: config.maxConcurrency || 1,
        requests: [],
        pendingCount: 0
      }
    }), {})
  }

  async init() {
    // Let each site scraper enqueue its entry-point request(s).
    await Bluebird.each(_.values(this.sites), (config) => config.scraper.init({
      manager: this
    }))
  }

  async addRequest(name, request) {
    request.userData.siteName = name
    const globalQueue = await Apify.openRequestQueue()
    if (this.isReady) {
      return globalQueue.addRequest(request)
    } else {
      // Buffer requests until every site has enqueued at least one,
      // so no single site fills the queue before the others start.
      this.sites[name].requests.push(request)
      this.isReady = _.reduce(this.sites, (result, site) => result && site.requests.length > 0, true)
      if (this.isReady) {
        const sitesList = _.map(this.sites, (site, name) => ({
          ...site,
          name
        }))
        await Bluebird.each(sitesList, async (site) => {
          await Bluebird.each(site.requests, (request) => globalQueue.addRequest(request))
        })
      }
    }
  }

  siteIsAtMaxConcurrency(name) {
    return this.sites[name].pendingCount === this.sites[name].maxConcurrency
  }

  gotoFunction({ page, request }) {
    // Per-site throttle: refuse to navigate while this site already has
    // maxConcurrency pages in flight; the crawler will retry the request.
    const siteIsAtMaxConcurrency = this.siteIsAtMaxConcurrency(request.userData.siteName)
    if (siteIsAtMaxConcurrency) {
      // Pre-decrement so the crawler's retry bump for this throw nets out to
      // zero and the rejection doesn't consume the request's retry budget.
      request.retryCount--
      throw new Error(`site ${request.userData.siteName} is at max concurrency`)
    }
    // When do we decrement? (see the commented sketch after the crawler setup below)
    this.sites[request.userData.siteName].pendingCount++
    return page.goto(request.url, { timeout: 60000 })
  }
}
Apify.main(async () => {
  const manager = new MultiSiteManager([{
    name: 'site1',
    scraper: site1
  }, {
    name: 'site2',
    scraper: site2
  }, {
    name: 'site3',
    scraper: site3
  }])
  await manager.init()
  const queue = await Apify.openRequestQueue()
  const crawler = new Apify.PuppeteerCrawler({
    maxConcurrency: 3,
    requestQueue: queue,
    gotoFunction: (options) => manager.gotoFunction(options),
    handlePageFunction: async ({ page, request }) => {
      // Dispatch to the handler registered for this request's page type
      // on the scraper that owns the request's site.
      const siteScraper = manager.sites[request.userData.siteName].scraper
      await siteScraper[request.userData.pageType]({
        manager,
        page,
        request
      })
    },
    handleFailedRequestFunction: async ({ request }) => {
      console.log('error', request.url, request.errorMessages)
    }
  })
  await crawler.run()
})
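
// One possible answer to the "When do we decrement?" question above -- an
// assumption for illustration, not part of the original gist: release the
// site's slot once its page handler finishes (or the request permanently
// fails), e.g. via a hypothetical releaseRequest() method on the manager.
//
//   // on MultiSiteManager
//   releaseRequest(name) {
//     this.sites[name].pendingCount--
//   }
//
//   // in handlePageFunction (and handleFailedRequestFunction)
//   try {
//     await siteScraper[request.userData.pageType]({ manager, page, request })
//   } finally {
//     manager.releaseRequest(request.userData.siteName)
//   }
//
// Note: a navigation error after gotoFunction has incremented pendingCount but
// before the page handler runs would still leak a slot and need separate handling.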
// Example site scraper module (e.g. site1): one handler per page type,
// matching the pageType values used in userData above.
const handleInit = async ({ manager }) => {
  await manager.addRequest('site1', {
    url: 'http://site1.com/entrypoint',
    userData: {
      pageType: 'list'
    }
  })
}

const handleListPage = async ({ manager, page, request }) => {
  console.log('handleListPage', request.url)
  const urls = await scrapeUrlsFromListPage() // placeholder; see the sketch below
  await Promise.all(urls.map((url) => manager.addRequest('site1', {
    url,
    userData: {
      pageType: 'detail'
    }
  })))
}

const handleDetailPage = async ({ page, request }) => {
  console.log('handleDetailPage', request.url)
}

export default {
  init: handleInit,
  list: handleListPage,
  detail: handleDetailPage
}
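
// scrapeUrlsFromListPage is referenced above but never defined in the gist.
// A minimal sketch using Puppeteer's page.$$eval -- the selector is hypothetical,
// and the call site would need the Puppeteer `page` passed in:
//
//   const scrapeUrlsFromListPage = (page) =>
//     page.$$eval('a.detail-link', (links) => links.map((link) => link.href))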