@iigmir
Created April 17, 2024 06:36
crawl.js
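A small recursive crawler for Node.js: it fetches a page with a custom User-Agent, parses the HTML with jsdom, writes the page body and its links to a JSON file under result/<project>/, and then follows every absolute http(s) link until the depth counter reaches 0. It requires Node 18+ (for the built-in fetch) and the jsdom package.
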
import { JSDOM } from "jsdom";
import { writeFile, existsSync, mkdirSync } from "fs";
/**
* Get HTML content.
* @param {String} url
* @returns {Promise<String>} The HTML source code
*/
const CrawlPage = async (url = "https://example.com") => {
    const r = await fetch(url, {
        headers: {
            "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; Takami/1.0)"
        }
    });
    return await r.text();
};
/**
* Parse an HTML string into a DOM document.
* @param {String} source_code
* @returns {Document}
*/
const ParseDocument = (source_code = "") => {
    const result = new JSDOM(source_code);
    return result.window.document;
};
// Subdirectory of result/ where crawl output is written.
const project = "example";
/**
* Write the crawled page to result/<project>/<hostname>-<timestamp>.json.
* @param {String} url
* @param {String} content
* @param {Array} links
*/
function ExportContent(url = "", content = "", links = []) {
    const dir = `result/${project}`;
    const file_name = `${String((new URL(url)).hostname)}-${String(Date.now())}`;
    const file_path = `${dir}/${file_name}.json`;
    const result_json = JSON.stringify({ url, content, links });
    if( !existsSync(dir) ) {
        // recursive, so the parent result/ directory is created as well
        mkdirSync(dir, { recursive: true });
    }
    writeFile(file_path, result_json, (error) => {
        if (error) throw error;
    });
}
/**
* Crawl a page, export it, and follow its links.
* @param {String} url Given URL
* @param {Number} depth Stop recursing when this reaches 0
*/
const crawl = (url = "https://example.com", depth = 1) => {
    if( depth === 0 ) {
        return;
    }
    CrawlPage( url ).then( (response) => {
        // Callbacks
        const filter_urls = ({ href }) => {
            // Keep only absolute http(s) links.
            return href.startsWith("http");
            // return href.startsWith("/");
        };
        // const remove_duplicates = (value, index, array) => array.indexOf(value) === index;
        const get_another_page = (link) => {
            const href = link.href;
            // Schedule the recursive crawl 900 ms later as a crude rate limit.
            setTimeout(() => {
                crawl(href, depth - 1);
            }, 900);
        };
        // Vars
        const document = ParseDocument(response);
        const links = [...document.querySelectorAll("a")].filter( filter_urls );
        const exported_links = links.map( ({ href, textContent }) => ({
            href, textContent
        }));
        // Actions
        ExportContent( url, document.body.innerHTML, exported_links );
        links.forEach( get_another_page );
    }).catch( (error) => {
        console.error( "Error: ", error );
    });
};
crawl( "https://example.com", 2 );
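
To try this out, jsdom has to be installed and the file has to run as an ES module; a minimal setup, assuming Node 18 or newer so the global fetch is available, could look like:

npm install jsdom
node crawl.mjs   # or keep crawl.js and set "type": "module" in package.json

Each page is saved as result/example/<hostname>-<timestamp>.json. The commented-out remove_duplicates callback hints at deduplication; one hypothetical way (not part of the gist) to keep the crawler from revisiting the same page is a module-level Set checked before each recursive call:

// Hypothetical addition: remember URLs seen across recursive calls.
const visited = new Set();
const crawl = (url = "https://example.com", depth = 1) => {
    if( depth === 0 || visited.has(url) ) {
        return;
    }
    visited.add(url);
    // ...rest of the function unchanged
};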