// scraper-expired.js — crawls the web starting from urls.txt (or START_URL)
// and logs every domain that fails DNS resolution (likely expired).
const https = require('https');
const http = require('http');
const fs = require('fs');
const htmlparser = require('htmlparser2');
const urlModule = require('url');

// --- Configuration ---
const MAX_CONCURRENT_REQUESTS = 5;
const MAX_RETRIES = 1;
const RETRY_DELAY = 5000; // Delay in milliseconds (5 seconds)
const EXPIRED_DOMAINS_FILE = 'expired_domains.txt';
const MAX_DEPTH = 10; // Maximum link depth to follow from a seed URL
const START_URL = false; // Set to a URL string to seed the crawl from one page
// BUG FIX: the original declared INITIAL_HOSTNAME with `const` inside an
// `if` block, so the binding was scoped to that block and shouldCrawlUrl()
// threw a ReferenceError whenever RESTRICT_TO_SITE was true. Declare it at
// module scope instead (null when there is no single starting site).
const INITIAL_HOSTNAME = START_URL ? new URL(START_URL).hostname : null;
const RESTRICT_TO_SITE = false; // Set to false to crawl the whole web

// Seed queue: when START_URL is false, load urls.txt and enqueue each
// valid URL at depth 0. (isValidUrl is a hoisted function declaration,
// so calling it here is safe.)
let urlsToVisit = [];
if (START_URL === false) {
  if (fs.existsSync('urls.txt')) {
    const urls = fs.readFileSync('urls.txt', 'utf8');
    urls
      .split('\n')
      .map((url) => url.trim()) // strip whitespace / CR from each line
      .filter((url) => isValidUrl(url))
      .forEach((url) => urlsToVisit.push({ url: url, depth: 0 }));
  } else {
    console.error("Le fichier urls.txt n'existe pas.");
  }
} else {
  urlsToVisit = [{ url: START_URL, depth: 0 }];
}

let visitedUrls = new Set(); // exact URLs already fetched (or being fetched)
let checkedDomains = new Set(); // hostnames already sampled (one page per domain)
// Blacklist of domain patterns: any URL whose hostname matches one of
// these regexes is skipped — large platforms whose domains will never be
// expired, plus government (gouv.*) sites.
const BLACKLISTED_DOMAINS = [
  /google\./,
  /apple\./,
  /adobe\./,
  /youtube\./,
  /facebook\./,
  /twitter\./,
  /linkedin\./,
  /pinterest\./,
  /bing\./,
  /yahoo\./,
  /instagram\./,
  /amazon\./,
  /tiktok\./,
  /gouv\./,
  // Add more as needed
];
/**
 * Return true when the URL's hostname matches any BLACKLISTED_DOMAINS
 * pattern. Unparseable URLs are treated as not blacklisted, matching the
 * old `url.parse(...).hostname || ''` fallback behaviour.
 * @param {string} url - Absolute URL to check.
 * @returns {boolean}
 */
function isBlacklisted(url) {
  // The legacy url.parse() API is deprecated; use the WHATWG URL class.
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    return false; // no hostname -> no pattern can match
  }
  return BLACKLISTED_DOMAINS.some((pattern) => pattern.test(hostname));
}
/**
 * Append a hostname (one per line) to the expired-domains log file.
 * Synchronous on purpose: entries are flushed immediately, so nothing is
 * lost if the crawler is interrupted.
 * @param {string} domain - Hostname to record.
 */
function logExpiredDomain(domain) {
  const line = `${domain}\n`;
  fs.appendFileSync(EXPIRED_DOMAINS_FILE, line);
}
/**
 * Fetch the body of `url` over http/https, with best-effort semantics:
 * on any error the promise resolves to '' so the crawl keeps going.
 *
 * Fixes over the original:
 * - No double-settle: the original called resolve("") and then
 *   unconditionally reject(err) inside the same error handler.
 * - A DNS failure resolved to '' (after logging the expired domain)
 *   instead of rejecting and killing the whole crawl.
 * - `retries` / RETRY_DELAY were declared but never used; transient
 *   errors are now retried up to `retries` times.
 * - Node reports DNS failures via err.code === 'ENOTFOUND', not a
 *   Chrome-style 'ERR_NAME_NOT_RESOLVED' message, so the expired-domain
 *   log never actually fired; both are now checked.
 *
 * @param {string} url - Absolute http(s) URL.
 * @param {number} [retries=MAX_RETRIES] - Extra attempts on transient errors.
 * @returns {Promise<string>} Response body, or '' on failure.
 */
async function fetchPage(url, retries = MAX_RETRIES) {
  const attemptFetch = () =>
    new Promise((resolve, reject) => {
      const requester = url.startsWith('https:') ? https : http;
      const options = {
        rejectUnauthorized: false // ignore SSL certificate errors
      };
      requester
        .get(url, options, (res) => {
          let data = '';
          res.on('data', (chunk) => {
            data += chunk;
          });
          res.on('end', () => resolve(data));
        })
        .on('error', reject);
    });

  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      return await attemptFetch();
    } catch (err) {
      const dnsFailure =
        err.code === 'ENOTFOUND' ||
        err.message.includes('ERR_NAME_NOT_RESOLVED');
      if (dnsFailure) {
        // Unresolvable hostname: the domain is likely expired.
        console.log(`Domain expired or inaccessible: ${url}`);
        logExpiredDomain(new URL(url).hostname);
        return '';
      }
      if (attempt < retries) {
        // Transient error: wait before retrying.
        await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY));
        continue;
      }
      // All attempts exhausted: ignore other errors (best-effort).
      return '';
    }
  }
  return '';
}
/**
 * Return true when `url` is a syntactically valid absolute URL that
 * starts with a single http:// or https:// scheme.
 * @param {string} url - Candidate URL.
 * @returns {boolean}
 */
function isValidUrl(url) {
  try {
    new URL(url); // throws on anything that is not a parseable URL
  } catch {
    return false;
  }
  // Require a leading http:// or https://; the negative lookahead rejects
  // nested protocols such as "https://https://example.com".
  const pattern = /^https?:\/\/(?!https?:\/\/)/;
  return pattern.test(url);
}
/**
 * Decide whether a URL is in scope for the crawl.
 * When RESTRICT_TO_SITE is enabled, only links on the starting page's
 * hostname (INITIAL_HOSTNAME) are followed; otherwise everything is.
 * @param {string} url - Absolute URL.
 * @returns {boolean}
 */
function shouldCrawlUrl(url) {
  if (!RESTRICT_TO_SITE) {
    return true;
  }
  return new URL(url).hostname === INITIAL_HOSTNAME;
}
/**
 * Enqueue `url` for crawling at depth `currentDepth + 1`.
 * Skips URLs that are invalid, out of scope (shouldCrawlUrl), already
 * visited, or whose depth would exceed MAX_DEPTH.
 * BUG FIX: the original tested `currentDepth <= MAX_DEPTH` while pushing
 * the child at `currentDepth + 1`, so queued entries could reach depth
 * MAX_DEPTH + 1; the child's depth is now the value that is bounded.
 * @param {string} url - Candidate absolute URL.
 * @param {number} currentDepth - Depth of the page the link came from.
 */
function addUrlToQueue(url, currentDepth) {
  const nextDepth = currentDepth + 1;
  if (!isValidUrl(url) || !shouldCrawlUrl(url)) {
    return;
  }
  if (visitedUrls.has(url) || nextDepth > MAX_DEPTH) {
    return;
  }
  urlsToVisit.push({ url: url, depth: nextDepth });
}
/**
 * Parse an HTML document and collect the absolute URLs of all <a href>
 * links that are valid, well-formed, and not blacklisted.
 * @param {string} html - Page markup.
 * @param {string} baseUrl - URL of the page, used to resolve relative hrefs.
 * @returns {string[]} Absolute link URLs, in document order.
 */
function extractLinks(html, baseUrl) {
  const links = [];
  const handleOpenTag = (tagName, attribs) => {
    if (tagName !== 'a' || !attribs.href) {
      return;
    }
    // Resolve relative hrefs against the page's own URL.
    const absoluteUrl = urlModule.resolve(baseUrl, attribs.href);
    // Skip malformed URLs with nested protocols.
    const hasNestedProtocol =
      absoluteUrl.includes('https://https/') ||
      absoluteUrl.includes('http://http/');
    if (hasNestedProtocol) {
      return;
    }
    if (isValidUrl(absoluteUrl) && !isBlacklisted(absoluteUrl)) {
      links.push(absoluteUrl);
    }
  };
  const parser = new htmlparser.Parser(
    { onopentag: handleOpenTag },
    { decodeEntities: true }
  );
  parser.write(html);
  parser.end();
  return links;
}
/**
 * Main crawl loop. Drains the queue in batches of up to
 * MAX_CONCURRENT_REQUESTS, fetches at most one page per domain
 * (checkedDomains), and enqueues each page's links one level deeper.
 *
 * Fixes over the original:
 * - Depth tracking was inconsistent: children of the first queue entry
 *   were enqueued with the parent's depth, while children of the
 *   concurrent batch were enqueued at `currentDepth + 1` of the WRONG
 *   parent (the batch entries' own depth was ignored). Depth is now
 *   carried per fetched page.
 * - A rejected fetchPage() on the first path was not caught, so one bad
 *   URL aborted the entire crawl; failures now just yield no links.
 *
 * @returns {Promise<void>} Resolves when the queue is empty.
 */
async function crawl() {
  while (urlsToVisit.length > 0) {
    // Pull the next batch of not-yet-visited URLs, one per domain.
    const batch = [];
    while (batch.length < MAX_CONCURRENT_REQUESTS && urlsToVisit.length > 0) {
      const entry = urlsToVisit.shift();
      const domain = new URL(entry.url).hostname;
      if (visitedUrls.has(entry.url) || checkedDomains.has(domain)) {
        continue; // this URL, or this domain, has already been sampled
      }
      checkedDomains.add(domain);
      visitedUrls.add(entry.url);
      batch.push(entry);
    }
    // Fetch the batch concurrently; a failed fetch contributes no links
    // instead of aborting the crawl.
    const results = await Promise.all(
      batch.map(async ({ url, depth }) => {
        console.log(`Visiting ${url}`);
        try {
          const html = await fetchPage(url);
          return { links: extractLinks(html, url), depth };
        } catch (err) {
          console.error(`Failed to fetch ${url}: ${err.message}`);
          return { links: [], depth };
        }
      })
    );
    for (const { links, depth } of results) {
      for (const link of links) {
        if (!visitedUrls.has(link)) {
          addUrlToQueue(link, depth); // child is enqueued at depth + 1
        }
      }
    }
  }
}
// Entry point: run the crawler and report completion or failure.
(async () => {
  try {
    await crawl();
    console.log('Crawling completed.');
  } catch (err) {
    console.error('An error occurred:', err);
  }
})();