@iigmir
Created April 17, 2024 06:36
crawl.js
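A small recursive crawler for Node.js: it fetches a page with a custom User-Agent, parses the HTML with jsdom, writes the page body and its links to a JSON file under result/<project>/, and then follows every absolute http(s) link until the depth counter reaches 0. It requires Node 18+ (for the built-in fetch) and the jsdom package.
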
import { JSDOM } from "jsdom";
import { writeFile, existsSync, mkdirSync } from "fs";
/**
* Get HTML content.
* @param {String} url
* @returns {Promise<String>} The HTML source code
*/
const CrawlPage = async (url = "https://example.com") => {
    const r = await fetch(url, {
        headers: {
            "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; Takami/1.0)"
        }
    });
    return await r.text();
};
/**
* Parse an HTML string into a DOM document.
* @param {String} source_code
* @returns {Document}
*/
const ParseDocument = (source_code = "") => {
    const result = new JSDOM(source_code);
    return result.window.document;
};
// Subdirectory of result/ where crawl output is written.
const project = "example";
/**
* Write the crawled page to result/<project>/<hostname>-<timestamp>.json.
* @param {String} url
* @param {String} content
* @param {Array} links
*/
function ExportContent(url = "", content = "", links = []) {
    const dir = `result/${project}`;
    const file_name = `${String((new URL(url)).hostname)}-${String(Date.now())}`;
    const file_path = `${dir}/${file_name}.json`;
    const result_json = JSON.stringify({ url, content, links });
    if( !existsSync(dir) ) {
        // recursive, so the parent result/ directory is created as well
        mkdirSync(dir, { recursive: true });
    }
    writeFile(file_path, result_json, (error) => {
        if (error) throw error;
    });
}
/**
* Crawl a page, export it, and follow its links.
* @param {String} url Given URL
* @param {Number} depth Stop recursing when this reaches 0
*/
const crawl = (url = "https://example.com", depth = 1) => {
    if( depth === 0 ) {
        return;
    }
    CrawlPage( url ).then( (response) => {
        // Callbacks
        const filter_urls = ({ href }) => {
            // Keep only absolute http(s) links.
            return href.startsWith("http");
            // return href.startsWith("/");
        };
        // const remove_duplicates = (value, index, array) => array.indexOf(value) === index;
        const get_another_page = (link) => {
            const href = link.href;
            // Schedule the recursive crawl 900 ms later as a crude rate limit.
            setTimeout(() => {
                crawl(href, depth - 1);
            }, 900);
        };
        // Vars
        const document = ParseDocument(response);
        const links = [...document.querySelectorAll("a")].filter( filter_urls );
        const exported_links = links.map( ({ href, textContent }) => ({
            href, textContent
        }));
        // Actions
        ExportContent( url, document.body.innerHTML, exported_links );
        links.forEach( get_another_page );
    }).catch( (error) => {
        console.error( "Error: ", error );
    });
};
crawl( "https://example.com", 2 );
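
To try this out, jsdom has to be installed and the file has to run as an ES module; a minimal setup, assuming Node 18 or newer so the global fetch is available, could look like:

npm install jsdom
node crawl.mjs   # or keep crawl.js and set "type": "module" in package.json

Each page is saved as result/example/<hostname>-<timestamp>.json. The commented-out remove_duplicates callback hints at deduplication; one hypothetical way (not part of the gist) to keep the crawler from revisiting the same page is a module-level Set checked before each recursive call:

// Hypothetical addition: remember URLs seen across recursive calls.
const visited = new Set();
const crawl = (url = "https://example.com", depth = 1) => {
    if( depth === 0 || visited.has(url) ) {
        return;
    }
    visited.add(url);
    // ...rest of the function unchanged
};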