@kami4ka · Created November 23, 2021
Reddit scraping with ScrapingAnt
/**
 * Get posts data from Reddit
 *
 * ScrapingAnt is a web scraping API that renders pages in a headless browser
 * behind rotating proxies; the free plan is enough for this script.
 *
 * Install dependencies:
 *   npm install @scrapingant/scrapingant-client
 *   npm install cheerio
 **/
const cheerio = require('cheerio');
const ScrapingAnt = require('@scrapingant/scrapingant-client');

const API_KEY = '<SCRAPINGANT_API_KEY>';
const URL_TO_SCRAPE = 'https://www.reddit.com/r/webscraping/';
const BASE_URL = 'https://www.reddit.com';
const POSTS_NUMBER = 100; // ScrapingAnt caps request execution at 30 seconds, so ~100 posts is a practical maximum

const client = new ScrapingAnt({ apiKey: API_KEY });
main()
    .then(console.log)
    .catch(console.error);
async function main() {
    // Build a JS snippet that scrolls the page enough times to load POSTS_NUMBER posts
    const customJS = getCustomJS(POSTS_NUMBER);
    const data = [];

    // ScrapingAnt renders the page in a browser and runs the snippet before returning the HTML
    const responseResult = await client.scrape(URL_TO_SCRAPE, { js_snippet: customJS });

    // Parse the rendered HTML and extract URL, title, and timestamp from each post container
    const $ = cheerio.load(responseResult.content);
    const posts = $('div[data-testid="post-container"]');
    posts.each((i, el) => {
        const url = $(el).find('a[data-click-id="body"]').attr('href');
        const title = $(el).find('h3').text();
        const timestamp = $(el).find('a[data-click-id="timestamp"]').text();
        data.push({
            url: BASE_URL + url,
            title: title,
            timestamp: timestamp
        });
    });

    return data;
}
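
// Example of the generated snippet: with POSTS_NUMBER = 100, getCustomJS below
// computes floor(100 / 25) = 4 and, because the loop uses <=, emits five
// repetitions of this scroll-and-wait pair (Reddit's infinite scroll needs
// both the scroll and the pause to load the next batch of posts):
//
//   window.scrollTo(0,document.body.scrollHeight);
//   await new Promise(r => setTimeout(r, 2000));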
function getCustomJS(postNumber) {
    let customJS = '';
    // Reddit loads roughly 25 posts per batch, so scroll postNumber / 25 times (rounded down)
    const numberOfScrollIterations = Math.floor(postNumber / 25); // Preliminary value, can be tweaked
    let iterator = 0;
    // <= runs one scroll more than the computed count; each 2-second pause lets new posts render
    while (iterator <= numberOfScrollIterations) {
        customJS += 'window.scrollTo(0,document.body.scrollHeight);\n' +
            'await new Promise(r => setTimeout(r, 2000));\n';
        iterator++;
    }
    return customJS;
}
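
For reference, a successful run prints an array of post objects. The values below are illustrative placeholders, not real posts:

    [
      {
        url: 'https://www.reddit.com/r/webscraping/comments/<post-id>/<post-slug>/',
        title: '<post title>',
        timestamp: '<relative time, e.g. "5 hr. ago">'
      },
      ...
    ]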