Skip to content

Instantly share code, notes, and snippets.

@ejfox
Last active July 26, 2020 04:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ejfox/15c8dacc4be31e5012290fd9124c3aec to your computer and use it in GitHub Desktop.
Save ejfox/15c8dacc4be31e5012290fd9124c3aec to your computer and use it in GitHub Desktop.
// Modified from https://github.com/siegfriedgrimbeek/cheerio-pagination-tutorial
// Thanks to Siegfried for open sourcing this
// I only modified it a tiny bit to work for qmap
// External dependencies
const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const chalk = require('chalk')
// Root URL of the site to scrape; also the first page that gets requested
const baseUrl = 'https://qmap.pub'
// File that exportResults() writes the collected results to
const outputFile = 'data.json'
// Accumulates one metadata object per scraped card, across all pages
const parsedResults = []
// Scraping stops once pageCounter reaches this value
const pageLimit = 120
// Number of pages processed so far
let pageCounter = 0
// Running index handed to each scraped card as its `count` field
let resultCount = 0
// Startup banner
console.log(chalk.yellow.bgBlue(`\n Scraping of ${chalk.underline.bold(baseUrl)} initiated...\n`))
/**
 * Scrape the card listing starting at `url`, then keep following the
 * `${baseUrl}/?pg=N` pages until `pageLimit` pages have been processed.
 *
 * Every `.container .card` element found is converted to a metadata object
 * and appended to the shared `parsedResults` array. When the page limit is
 * reached, or when any request/parse step throws, whatever has been
 * collected so far is handed to `exportResults`.
 *
 * Pages are fetched strictly one after another, and the returned promise
 * settles only once the whole crawl has finished.
 *
 * @param {string} url - First page to fetch.
 * @returns {Promise<false|undefined>} `false` when the page limit was
 *   reached; `undefined` when the crawl was cut short by an error (the
 *   error is logged, not rethrown).
 */
const getWebsiteContent = async (url) => {
  let currentUrl = url
  try {
    while (true) {
      const response = await axios.get(currentUrl)
      const $ = cheerio.load(response.data)

      // `.each`, not `.map`: the callback is run purely for its side effect
      // of pushing into parsedResults.
      $('.container .card').each((i, el) => {
        const card = $(el)
        // Tripcode and timestamp both live in the second sibling after the
        // header's flex row.
        const headerMeta = card.find('.card-header .d-flex').next().next()
        parsedResults.push({
          count: resultCount++,
          title: card.find('h5').text(),
          text: card.find('.card-text').text(),
          url: card.find('h5 a').attr('href'),
          tripcode: headerMeta.find('span').text(),
          dateTime: headerMeta.find('time').text()
        })
      })

      pageCounter++
      // `>=` so a counter that is already past the limit still terminates.
      if (pageCounter >= pageLimit) {
        exportResults(parsedResults)
        return false
      }

      // Announce the next page only when it is actually going to be fetched.
      currentUrl = `${baseUrl}/?pg=${pageCounter}`
      console.log(chalk.cyan(` Scraping: ${currentUrl}`))
    }
  } catch (error) {
    // Best effort: keep whatever was scraped before the failure.
    exportResults(parsedResults)
    console.error(error)
  }
}
/**
 * Write the scraped results to `outputFile` as 4-space-indented JSON.
 *
 * The write is asynchronous; the outcome is reported on the console once it
 * completes. A failed write is reported as an error only, with no success
 * message.
 *
 * @param {Array<Object>} parsedResults - Result objects to serialise.
 * @returns {void}
 */
const exportResults = (parsedResults) => {
  const payload = JSON.stringify(parsedResults, null, 4)
  fs.writeFile(outputFile, payload, (err) => {
    if (err) {
      console.error(err)
      // Bail out so a failed write is never announced as a success.
      return
    }
    console.log(chalk.yellow.bgBlue(`\n ${chalk.underline.bold(parsedResults.length)} Results exported successfully to ${chalk.underline.bold(outputFile)}\n`))
  })
}
// Entry point: start the crawl at the site root. getWebsiteContent catches
// and logs its own errors, so the returned promise is not awaited here.
getWebsiteContent(baseUrl)
{
"name": "cheerio-pagination-tutorial",
"version": "1.0.0",
"description": "A simple tutorial demonstrating Cheerio's ability to scrape multiple webpages.",
"author": "Siegfried <siegfried.grimbeek@gmail.com>",
"main": "index.js",
"scripts": {
"start": "./node_modules/nodemon/bin/nodemon.js ./src/index.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [
"cheerio",
"axios",
"nodejs",
"scrape"
],
"license": "ISC",
"dependencies": {
"axios": "^0.18.0",
"chalk": "^2.4.1",
"cheerio": "^1.0.0-rc.2",
"nodemon": "^1.15.1"
},
"nodemonConfig": {
"ignore": [
"*.json"
]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment