scraper-expired.js
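// Crawls outward from a set of seed URLs and logs domains that no longer
// resolve (candidates for expired domains) to expired_domains.txt.
// Only the first URL seen on each domain is actually fetched.
//
// Assumed setup (not specified in the original gist):
//   npm install htmlparser2
//   node scraper-expired.js
// Seed URLs are read from urls.txt (one per line) unless START_URL is set
// to a specific URL below.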
const https = require('https');
const http = require('http');
const fs = require('fs');
const htmlparser = require('htmlparser2');
const urlModule = require('url');

const MAX_CONCURRENT_REQUESTS = 5;
const MAX_RETRIES = 1;
const RETRY_DELAY = 5000; // Delay in milliseconds (5 seconds)
const EXPIRED_DOMAINS_FILE = 'expired_domains.txt';
const MAX_DEPTH = 10; // Maximum link depth to follow
const START_URL = false; // Set to a URL string to crawl from a single start page
// Hostname of the start URL (used when RESTRICT_TO_SITE is true)
const INITIAL_HOSTNAME = START_URL ? new URL(START_URL).hostname : null;
const RESTRICT_TO_SITE = false; // Set to false to crawl the entire web

// If START_URL is false, load urls.txt and parse it into urlsToVisit.
let urlsToVisit = [];
if (START_URL === false) {
  // Check whether urls.txt exists
  if (fs.existsSync('urls.txt')) {
    const urls = fs.readFileSync('urls.txt', 'utf8');
    const urlsArray = urls.split('\n').map(url => url.trim()); // Trim each URL
    urlsArray.forEach(url => {
      // Only add valid URLs to urlsToVisit
      if (isValidUrl(url)) {
        urlsToVisit.push({ url: url, depth: 0 });
      }
    });
  } else {
    console.error('The file urls.txt does not exist.');
  }
} else {
  urlsToVisit = [{ url: START_URL, depth: 0 }];
}
let visitedUrls = new Set();
let checkedDomains = new Set();
// Blacklist of domain patterns
const BLACKLISTED_DOMAINS = [
  /google\./,
  /apple\./,
  /adobe\./,
  /youtube\./,
  /facebook\./,
  /twitter\./,
  /linkedin\./,
  /pinterest\./,
  /bing\./,
  /yahoo\./,
  /instagram\./,
  /amazon\./,
  /tiktok\./,
  /gouv\./,
  // Add more as needed
];
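
// Returns true when the URL's hostname matches one of the blacklisted patterns.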
function isBlacklisted(url) {
  const hostname = urlModule.parse(url).hostname || '';
  return BLACKLISTED_DOMAINS.some(pattern => pattern.test(hostname));
}
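
// Appends a domain name to the expired-domains log file.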
function logExpiredDomain(domain) {
  fs.appendFileSync(EXPIRED_DOMAINS_FILE, domain + '\n');
}
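
// Fetches a page over HTTP or HTTPS and resolves with the response body.
// A DNS failure (ENOTFOUND) is treated as a sign the domain may be
// expired; other errors are retried, then ignored, so the crawl never stops.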
async function fetchPage(url, retries = MAX_RETRIES) {
  return new Promise((resolve, reject) => {
    const requester = url.startsWith('https:') ? https : http;
    requester.get(url, {
      rejectUnauthorized: false // Ignore SSL certificate errors
    }, (res) => {
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        resolve(data);
      });
    }).on('error', (err) => {
      // Node reports an unresolvable hostname as ENOTFOUND
      if (err.code === 'ENOTFOUND') {
        console.log(`Domain expired or inaccessible: ${url}`);
        logExpiredDomain(new URL(url).hostname);
        resolve('');
      } else if (retries > 0) {
        // Retry other errors after a delay
        setTimeout(() => {
          fetchPage(url, retries - 1).then(resolve, reject);
        }, RETRY_DELAY);
      } else {
        // Give up and ignore all other errors
        resolve('');
      }
    });
  });
}
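
// A URL is valid if it parses as a URL and starts with a single
// http:// or https:// prefix (no nested protocol).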
function isValidUrl(url) {
  try {
    // Try to construct a URL object; this throws if the URL is malformed
    new URL(url);
    // Make sure the URL starts with http:// or https://
    // and has no nested protocol.
    const pattern = /^https?:\/\/(?!https?:\/\/)/;
    return pattern.test(url);
  } catch (err) {
    // Constructing the URL object failed, so the URL is invalid
    return false;
  }
}
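
// When RESTRICT_TO_SITE is true, only URLs on the starting hostname are crawled.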
function shouldCrawlUrl(url) {
  const hostname = new URL(url).hostname;
  return !RESTRICT_TO_SITE || hostname === INITIAL_HOSTNAME;
}
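
// Queues a URL one level deeper than the current page, skipping URLs that
// are invalid, out of scope, already visited, or beyond MAX_DEPTH.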
function addUrlToQueue(url, currentDepth) {
  if (isValidUrl(url) && shouldCrawlUrl(url) && !visitedUrls.has(url) && currentDepth <= MAX_DEPTH) {
    urlsToVisit.push({ url: url, depth: currentDepth + 1 });
  }
}
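
// Parses an HTML document and returns absolute URLs for every <a href>
// link that is valid and not blacklisted.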
function extractLinks(html, baseUrl) {
  const links = [];
  const parser = new htmlparser.Parser({
    onopentag: (name, attribs) => {
      if (name === "a" && attribs.href) {
        // Convert relative URLs to absolute URLs
        let absoluteUrl = urlModule.resolve(baseUrl, attribs.href);
        // Skip malformed URLs with nested protocols
        if (absoluteUrl.includes("https://https/") || absoluteUrl.includes("http://http/")) {
          return;
        }
        // Check that the URL is valid and not blacklisted
        if (isValidUrl(absoluteUrl) && !isBlacklisted(absoluteUrl)) {
          links.push(absoluteUrl);
        }
      }
    }
  }, { decodeEntities: true });
  parser.write(html);
  parser.end();
  return links;
}
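
// Main loop: fetches the next queued URL, then up to
// MAX_CONCURRENT_REQUESTS - 1 more in parallel, queueing the links found
// on each page. Each domain is fetched at most once, since a single
// request is enough to tell whether the domain still resolves.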
async function crawl() {
  while (urlsToVisit.length > 0) {
    const currentUrlObj = urlsToVisit.shift();
    const currentUrl = currentUrlObj.url;
    const currentDepth = currentUrlObj.depth;
    console.log(`Visiting ${currentUrl}`);
    const domain = new URL(currentUrl).hostname;
    if (!visitedUrls.has(currentUrl)) {
      if (!checkedDomains.has(domain)) {
        checkedDomains.add(domain);
        visitedUrls.add(currentUrl);
        const html = await fetchPage(currentUrl);
        const newUrls = extractLinks(html, currentUrl);
        for (const newUrl of newUrls) {
          if (!visitedUrls.has(newUrl)) {
            addUrlToQueue(newUrl, currentDepth);
          }
        }
      }
    }
    // Fetch the next batch of URLs in parallel; compute the batch size
    // up front because shift() shrinks the queue as we go
    const promises = [];
    const batchSize = Math.min(MAX_CONCURRENT_REQUESTS - 1, urlsToVisit.length);
    for (let i = 0; i < batchSize; i++) {
      const nextUrlObj = urlsToVisit.shift();
      const nextUrl = nextUrlObj.url;
      console.log(`Visiting ${nextUrl}`);
      const nextDomain = new URL(nextUrl).hostname;
      if (!visitedUrls.has(nextUrl)) {
        if (!checkedDomains.has(nextDomain)) {
          checkedDomains.add(nextDomain);
          visitedUrls.add(nextUrl);
          promises.push(fetchPage(nextUrl).then(html => extractLinks(html, nextUrl)));
        }
      }
    }
    const newUrlsArrays = await Promise.all(promises);
    for (const newUrls of newUrlsArrays) {
      for (const newUrl of newUrls) {
        if (!visitedUrls.has(newUrl)) {
          // addUrlToQueue adds one level of depth itself
          addUrlToQueue(newUrl, currentDepth);
        }
      }
    }
  }
}
crawl().then(() => {
  console.log('Crawling completed.');
}).catch((err) => {
  console.error('An error occurred:', err);
});
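
// Hypothetical urls.txt seed file, one URL per line:
//   https://example.com
//   https://example.org/blog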