Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scraping de Human Coders avec Apify
{
"startUrls": [
{
"url": "https://www.humancoders.com/formations",
"method": "GET",
"userData": {
"label": "START"
}
}
],
"useRequestQueue": true,
"keepUrlFragments": false,
"pseudoUrls": [
{
"purl": "https://www.humancoders.com/formations/[[^/]+]",
"method": "GET",
"userData": {
"label": "FORMATION"
}
}
],
"linkSelector": "a:not([rel=nofollow])",
"pageFunction": "async function pageFunction(context) {\n const { request, log, skipLinks, $ } = context; // $ is Cheerio\n if (request.userData.label === 'START') {\n log.info('start!');\n\n }\n if (request.userData.label === 'FORMATION') {\n await skipLinks();\n // The \"$\" property contains the Cheerio object which is useful\n // for querying DOM elements and extracting data from them.\n const pageTitle = $('title').first().text();\n\n // The \"request\" property contains various information about the web page loaded. \n const url = request.url;\n log.info('Page scraped', { url, pageTitle });\n\n\n // Return an object with the data extracted from the page.\n // It will be stored to the resulting dataset.\n return {\n url,\n ref: url,\n title: $('h1').first().text().trim(),\n meta_canonical: $('link[rel=canonical]').attr('href'),\n meta_keywords: $('meta[name=keywords]').attr('content'),\n meta_description: $('meta[name=description]').attr('content'),\n meta_title: pageTitle,\n category: $('.breadcrumb-item:nth-child(2)').eq(0).text().trim(),\n price: $('.price .value').text().match(/\\d+/g)[0],\n duration_days: $('.duration .value').text().match(/\\d+/g)[0],\n content: $('#description, #objectives, #prerequisites, #outline, #trainer').text().trim(),\n subtitle: $('.training-head .lead').text().trim(),\n ratings_count: $('.stars').length > 0 ? $('.stars span').eq(-1).text().match(/\\d+/g)[0] : 0,\n reviews_count: $('#quotes').length > 0 ? $('#quotes h3').text().match(/\\d+/g)[0] : 0,\n };\n }\n}",
"proxyConfiguration": {
"useApifyProxy": false
},
"proxyRotation": "RECOMMENDED",
"prepareRequestFunction": "async function prepareRequest({ request, Apify }) {\n /* add your logic here, if needed */\n}",
"forceResponseEncoding": false,
"ignoreSslErrors": false,
"maxRequestRetries": 0,
"maxPagesPerCrawl": 0,
"maxResultsPerCrawl": 0,
"debugLog": false
}
/**
 * Page handler for the scraper: extracts one record per training ("FORMATION") page.
 *
 * Requests labelled 'START' are only logged. Requests labelled 'FORMATION' stop
 * further link enqueueing from that page and return the extracted record. Any
 * other label resolves to `undefined`, so nothing is stored.
 *
 * Numeric fields are returned as the first run of digits found in the matching
 * element's text (a string, e.g. '1500'). When the element is missing or holds
 * no digits, `price` and `duration_days` are `null` and the two counters are `0`,
 * instead of the whole page failing with a TypeError.
 *
 * @param {object} context - Scraper context; reads `request`, `log`, `skipLinks` and `$` (Cheerio).
 * @returns {Promise<object|undefined>} The record to store, or `undefined` when the page is not a training page.
 */
async function pageFunction(context) {
    const { request, log, skipLinks, $ } = context; // $ is Cheerio

    // Kept nested: the scraper config carries this function as a single source
    // string, so it cannot rely on helpers declared outside of it.
    // String.prototype.match returns null when nothing matches, hence the guard.
    const firstDigits = (text, fallback) => {
        const found = text.match(/\d+/);
        return found ? found[0] : fallback;
    };

    if (request.userData.label === 'START') {
        log.info('start!');
    }

    if (request.userData.label === 'FORMATION') {
        // A training page is a leaf: do not enqueue the links it contains.
        await skipLinks();

        const pageTitle = $('title').first().text();
        const url = request.url;
        log.info('Page scraped', { url, pageTitle });

        // The returned object is stored to the resulting dataset.
        return {
            url,
            ref: url,
            title: $('h1').first().text().trim(),
            meta_canonical: $('link[rel=canonical]').attr('href'),
            meta_keywords: $('meta[name=keywords]').attr('content'),
            meta_description: $('meta[name=description]').attr('content'),
            meta_title: pageTitle,
            category: $('.breadcrumb-item:nth-child(2)').eq(0).text().trim(),
            price: firstDigits($('.price .value').text(), null),
            duration_days: firstDigits($('.duration .value').text(), null),
            content: $('#description, #objectives, #prerequisites, #outline, #trainer').text().trim(),
            subtitle: $('.training-head .lead').text().trim(),
            // The count is read from the last <span> inside the stars widget.
            ratings_count: $('.stars').length > 0 ? firstDigits($('.stars span').eq(-1).text(), 0) : 0,
            reviews_count: $('#quotes').length > 0 ? firstDigits($('#quotes h3').text(), 0) : 0,
        };
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.