scraper-expired.js
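// Crawls outward from a set of seed URLs and logs domains that no longer
// resolve (candidates for expired domains) to expired_domains.txt.
// Only the first URL seen on each domain is actually fetched.
//
// Assumed setup (not specified in the original gist):
//   npm install htmlparser2
//   node scraper-expired.js
// Seed URLs are read from urls.txt (one per line) unless START_URL is set
// to a specific URL below.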
const https = require('https');
const http = require('http');
const fs = require('fs');
const htmlparser = require('htmlparser2');
const urlModule = require('url');

const MAX_CONCURRENT_REQUESTS = 5;
const MAX_RETRIES = 1;
const RETRY_DELAY = 5000; // Delay in milliseconds (5 seconds)
const EXPIRED_DOMAINS_FILE = 'expired_domains.txt';
const MAX_DEPTH = 10; // Maximum link depth to follow
const START_URL = false; // Set to a URL string to crawl from a single start page
// Hostname of the start URL (used when RESTRICT_TO_SITE is true)
const INITIAL_HOSTNAME = START_URL ? new URL(START_URL).hostname : null;
const RESTRICT_TO_SITE = false; // Set to false to crawl the entire web

// If START_URL is false, load urls.txt and parse it into urlsToVisit.
let urlsToVisit = [];
if (START_URL === false) {
  // Check whether urls.txt exists
  if (fs.existsSync('urls.txt')) {
    const urls = fs.readFileSync('urls.txt', 'utf8');
    const urlsArray = urls.split('\n').map(url => url.trim()); // Trim each URL
    urlsArray.forEach(url => {
      // Only add valid URLs to urlsToVisit
      if (isValidUrl(url)) {
        urlsToVisit.push({ url: url, depth: 0 });
      }
    });
  } else {
    console.error('The file urls.txt does not exist.');
  }
} else {
  urlsToVisit = [{ url: START_URL, depth: 0 }];
}
let visitedUrls = new Set();
let checkedDomains = new Set();
// Blacklist of domain patterns
const BLACKLISTED_DOMAINS = [
  /google\./,
  /apple\./,
  /adobe\./,
  /youtube\./,
  /facebook\./,
  /twitter\./,
  /linkedin\./,
  /pinterest\./,
  /bing\./,
  /yahoo\./,
  /instagram\./,
  /amazon\./,
  /tiktok\./,
  /gouv\./,
  // Add more as needed
];
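
// Returns true when the URL's hostname matches one of the blacklisted patterns.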
function isBlacklisted(url) {
  const hostname = urlModule.parse(url).hostname || '';
  return BLACKLISTED_DOMAINS.some(pattern => pattern.test(hostname));
}
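
// Appends a domain name to the expired-domains log file.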
function logExpiredDomain(domain) {
  fs.appendFileSync(EXPIRED_DOMAINS_FILE, domain + '\n');
}
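
// Fetches a page over HTTP or HTTPS and resolves with the response body.
// A DNS failure (ENOTFOUND) is treated as a sign the domain may be
// expired; other errors are retried, then ignored, so the crawl never stops.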
async function fetchPage(url, retries = MAX_RETRIES) {
  return new Promise((resolve, reject) => {
    const requester = url.startsWith('https:') ? https : http;
    requester.get(url, {
      rejectUnauthorized: false // Ignore SSL certificate errors
    }, (res) => {
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        resolve(data);
      });
    }).on('error', (err) => {
      // Node reports an unresolvable hostname as ENOTFOUND
      if (err.code === 'ENOTFOUND') {
        console.log(`Domain expired or inaccessible: ${url}`);
        logExpiredDomain(new URL(url).hostname);
        resolve('');
      } else if (retries > 0) {
        // Retry other errors after a delay
        setTimeout(() => {
          fetchPage(url, retries - 1).then(resolve, reject);
        }, RETRY_DELAY);
      } else {
        // Give up and ignore all other errors
        resolve('');
      }
    });
  });
}
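
// A URL is valid if it parses as a URL and starts with a single
// http:// or https:// prefix (no nested protocol).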
function isValidUrl(url) {
  try {
    // Try to construct a URL object; this throws if the URL is malformed
    new URL(url);
    // Make sure the URL starts with http:// or https://
    // and has no nested protocol.
    const pattern = /^https?:\/\/(?!https?:\/\/)/;
    return pattern.test(url);
  } catch (err) {
    // Constructing the URL object failed, so the URL is invalid
    return false;
  }
}
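
// When RESTRICT_TO_SITE is true, only URLs on the starting hostname are crawled.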
function shouldCrawlUrl(url) {
  const hostname = new URL(url).hostname;
  return !RESTRICT_TO_SITE || hostname === INITIAL_HOSTNAME;
}
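
// Queues a URL one level deeper than the current page, skipping URLs that
// are invalid, out of scope, already visited, or beyond MAX_DEPTH.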
function addUrlToQueue(url, currentDepth) {
  if (isValidUrl(url) && shouldCrawlUrl(url) && !visitedUrls.has(url) && currentDepth <= MAX_DEPTH) {
    urlsToVisit.push({ url: url, depth: currentDepth + 1 });
  }
}
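
// Parses an HTML document and returns absolute URLs for every <a href>
// link that is valid and not blacklisted.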
function extractLinks(html, baseUrl) {
  const links = [];
  const parser = new htmlparser.Parser({
    onopentag: (name, attribs) => {
      if (name === "a" && attribs.href) {
        // Convert relative URLs to absolute URLs
        let absoluteUrl = urlModule.resolve(baseUrl, attribs.href);
        // Skip malformed URLs with nested protocols
        if (absoluteUrl.includes("https://https/") || absoluteUrl.includes("http://http/")) {
          return;
        }
        // Check that the URL is valid and not blacklisted
        if (isValidUrl(absoluteUrl) && !isBlacklisted(absoluteUrl)) {
          links.push(absoluteUrl);
        }
      }
    }
  }, { decodeEntities: true });
  parser.write(html);
  parser.end();
  return links;
}
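
// Main loop: fetches the next queued URL, then up to
// MAX_CONCURRENT_REQUESTS - 1 more in parallel, queueing the links found
// on each page. Each domain is fetched at most once, since a single
// request is enough to tell whether the domain still resolves.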
async function crawl() {
  while (urlsToVisit.length > 0) {
    const currentUrlObj = urlsToVisit.shift();
    const currentUrl = currentUrlObj.url;
    const currentDepth = currentUrlObj.depth;
    console.log(`Visiting ${currentUrl}`);
    const domain = new URL(currentUrl).hostname;
    if (!visitedUrls.has(currentUrl)) {
      if (!checkedDomains.has(domain)) {
        checkedDomains.add(domain);
        visitedUrls.add(currentUrl);
        const html = await fetchPage(currentUrl);
        const newUrls = extractLinks(html, currentUrl);
        for (const newUrl of newUrls) {
          if (!visitedUrls.has(newUrl)) {
            addUrlToQueue(newUrl, currentDepth);
          }
        }
      }
    }
    // Fetch the next batch of URLs in parallel; compute the batch size
    // up front because shift() shrinks the queue as we go
    const promises = [];
    const batchSize = Math.min(MAX_CONCURRENT_REQUESTS - 1, urlsToVisit.length);
    for (let i = 0; i < batchSize; i++) {
      const nextUrlObj = urlsToVisit.shift();
      const nextUrl = nextUrlObj.url;
      console.log(`Visiting ${nextUrl}`);
      const nextDomain = new URL(nextUrl).hostname;
      if (!visitedUrls.has(nextUrl)) {
        if (!checkedDomains.has(nextDomain)) {
          checkedDomains.add(nextDomain);
          visitedUrls.add(nextUrl);
          promises.push(fetchPage(nextUrl).then(html => extractLinks(html, nextUrl)));
        }
      }
    }
    const newUrlsArrays = await Promise.all(promises);
    for (const newUrls of newUrlsArrays) {
      for (const newUrl of newUrls) {
        if (!visitedUrls.has(newUrl)) {
          // addUrlToQueue adds one level of depth itself
          addUrlToQueue(newUrl, currentDepth);
        }
      }
    }
  }
}
crawl().then(() => {
  console.log('Crawling completed.');
}).catch((err) => {
  console.error('An error occurred:', err);
});
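
// Hypothetical urls.txt seed file, one URL per line:
//   https://example.com
//   https://example.org/blog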