Skip to content

Instantly share code, notes, and snippets.

@kami4ka
Created September 13, 2021 08:28
Show Gist options
  • Save kami4ka/00eff02b3f124daa9598c8d5ac76b0f4 to your computer and use it in GitHub Desktop.
Save kami4ka/00eff02b3f124daa9598c8d5ac76b0f4 to your computer and use it in GitHub Desktop.
Most popular Medium writers scraper
/**
* Get the most popular Medium writers by tags
*
* ScrapingAnt allows you to scrape for free using proxy servers
*
* npm install @scrapingant/scrapingant-client
* npm install cheerio
**/
const ScrapingAntClient = require('@scrapingant/scrapingant-client');
const cheerio = require('cheerio');
const client = new ScrapingAntClient({ apiKey: "<SCRAPINGANT_API_KEY>" });
const tags = ['leadership', 'productivity', 'startup', 'technology', 'creativity', 'entrepreneurship'];
const followersRegex = />([0-9KM.]* Followers)<\//;
const openTopWritersJS = `document.evaluate("//p[text()='See More']", document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.click()`;
(async () => {
const writers = [];
for (const tag of tags) {
const response = await client.scrape(getTagURL(tag), { proxy_country: 'US', js_snippet: openTopWritersJS });
writers.push(...await getDataFromTagPage(response.content));
}
for (const writer of writers) {
const writerResponse = await client.scrape(writer.url, { proxy_country: 'US' });
writer.followers = extractFollowersCount(writerResponse.content);
}
// scraped results
console.log(writers);
})();
function getTagURL(tag) {
return `https://medium.com/tag/${tag}`;
}
function getDataFromTagPage(html) {
const pageResults = [];
const $ = cheerio.load(html);
const dialog = $('div[role=dialog]');
const links = $(dialog).find('a');
links.each((i, link) => {
if (i % 2 === 1) {
const result = {};
result.url = getFullURL($(link).attr('href'));
result.name = $(link).find('h2').text();
pageResults.push(result);
}
});
return pageResults;
}
function getFullURL(href) {
if (href.startsWith('/')) {
return `https://medium.com${href}`
}
return href;
}
function extractFollowersCount(writerContent) {
return followersRegex.exec(writerContent)[1];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment