@kami4ka · Created November 23, 2021
Reddit scraping with ScrapingAnt
/**
 * Get posts data from Reddit
 *
 * ScrapingAnt is a web scraping API that renders pages in a headless browser
 * behind rotating proxies; the free plan is enough for this script.
 *
 * Install dependencies:
 *   npm install @scrapingant/scrapingant-client
 *   npm install cheerio
 **/
const cheerio = require('cheerio');
const ScrapingAnt = require('@scrapingant/scrapingant-client');

const API_KEY = '<SCRAPINGANT_API_KEY>';
const URL_TO_SCRAPE = 'https://www.reddit.com/r/webscraping/';
const BASE_URL = 'https://www.reddit.com';
const POSTS_NUMBER = 100; // ScrapingAnt caps request execution at 30 seconds, so ~100 posts is a practical maximum

const client = new ScrapingAnt({ apiKey: API_KEY });
main()
    .then(console.log)
    .catch(console.error);
async function main() {
    // Build a JS snippet that scrolls the page enough times to load POSTS_NUMBER posts
    const customJS = getCustomJS(POSTS_NUMBER);
    const data = [];

    // ScrapingAnt renders the page in a browser and runs the snippet before returning the HTML
    const responseResult = await client.scrape(URL_TO_SCRAPE, { js_snippet: customJS });

    // Parse the rendered HTML and extract URL, title, and timestamp from each post container
    const $ = cheerio.load(responseResult.content);
    const posts = $('div[data-testid="post-container"]');
    posts.each((i, el) => {
        const url = $(el).find('a[data-click-id="body"]').attr('href');
        const title = $(el).find('h3').text();
        const timestamp = $(el).find('a[data-click-id="timestamp"]').text();
        data.push({
            url: BASE_URL + url,
            title: title,
            timestamp: timestamp
        });
    });

    return data;
}
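
// Example of the generated snippet: with POSTS_NUMBER = 100, getCustomJS below
// computes floor(100 / 25) = 4 and, because the loop uses <=, emits five
// repetitions of this scroll-and-wait pair (Reddit's infinite scroll needs
// both the scroll and the pause to load the next batch of posts):
//
//   window.scrollTo(0,document.body.scrollHeight);
//   await new Promise(r => setTimeout(r, 2000));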
function getCustomJS(postNumber) {
    let customJS = '';
    // Reddit loads roughly 25 posts per batch, so scroll postNumber / 25 times (rounded down)
    const numberOfScrollIterations = Math.floor(postNumber / 25); // Preliminary value, can be tweaked
    let iterator = 0;
    // <= runs one scroll more than the computed count; each 2-second pause lets new posts render
    while (iterator <= numberOfScrollIterations) {
        customJS += 'window.scrollTo(0,document.body.scrollHeight);\n' +
            'await new Promise(r => setTimeout(r, 2000));\n';
        iterator++;
    }
    return customJS;
}
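
For reference, a successful run prints an array of post objects. The values below are illustrative placeholders, not real posts:

    [
      {
        url: 'https://www.reddit.com/r/webscraping/comments/<post-id>/<post-slug>/',
        title: '<post title>',
        timestamp: '<relative time, e.g. "5 hr. ago">'
      },
      ...
    ]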