Created
April 17, 2024 06:36
-
-
Save iigmir/d29e1941f9b6234b303e00f50324f344 to your computer and use it in GitHub Desktop.
crawl.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { JSDOM } from "jsdom"; | |
import { writeFile, existsSync, mkdirSync } from "fs"; | |
/**
 * Fetch the raw HTML content of a page.
 * @param {String} url Page to fetch.
 * @returns {Promise<String>} The HTML source code.
 * @throws {Error} When the response has a non-2xx HTTP status.
 */
const CrawlPage = async (url = "https://example.com") => {
    const r = await fetch(url, {
        headers: {
            "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; Takami/1.0)"
        }
    });
    // Fail loudly on HTTP errors instead of treating an error page as content;
    // the rejection is handled by the caller's .catch().
    if (!r.ok) {
        throw new Error(`Request to ${url} failed with status ${r.status}`);
    }
    return await r.text();
};
/**
 * Turn an HTML source string into a DOM Document via jsdom.
 * @param {String} source_code Raw HTML markup.
 * @returns {Document} The parsed document object.
 */
const ParseDocument = (source_code = "") => {
    const { window } = new JSDOM(source_code);
    return window.document;
};
// Sub-directory (under "result/") that this crawl's output files are grouped into.
const project = "example";
/**
 * Write one crawled page to disk as a JSON file named
 * "result/<project>/<hostname>-<timestamp>.json".
 * @param {String} url The page's URL (used for the hostname in the file name).
 * @param {String} content The page's body HTML.
 * @param {Array<{href: String, textContent: String}>} links Outgoing links found on the page.
 * @returns {void} The write itself is asynchronous (fire-and-forget).
 */
function ExportContent(url = "", content = "", links = []) {
    const dir = `result/${project}`;
    const file_name = `${String((new URL(url)).hostname)}-${String(Date.now())}`;
    const file_path = `${dir}/${file_name}.json`;
    const result_json = JSON.stringify({ url, content, links });
    if (!existsSync(dir)) {
        // recursive: true also creates the missing "result" parent directory;
        // the original single-level mkdirSync threw ENOENT when it was absent.
        mkdirSync(dir, { recursive: true });
    }
    writeFile(file_path, result_json, (err) => {
        if (err) throw err;
    });
}
/**
 * Recursively crawl pages starting from a URL, exporting each page's
 * content and links, then following every absolute link found.
 * @param {String} url Given URL.
 * @param {Number} depth Remaining recursion depth; the crawl stops when this is 0.
 * @returns {void}
 */
const crawl = (url = "https://example.com", depth = 1) => {
    if (depth === 0) {
        return;
    }
    CrawlPage(url)
        .then((html) => {
            const document = ParseDocument(html);
            // Keep only anchors whose href is an absolute http(s) URL.
            const anchors = [...document.querySelectorAll("a")].filter(
                ({ href }) => href.startsWith("http")
            );
            const exported_links = anchors.map(({ href, textContent }) => ({
                href, textContent
            }));
            ExportContent(url, document.body.innerHTML, exported_links);
            // Schedule each child page with a short delay between requests.
            for (const { href } of anchors) {
                setTimeout(() => {
                    crawl(href, depth - 1);
                }, 900);
            }
        })
        .catch((error) => {
            console.error("Error: ", error);
        });
};
crawl("https://example.com", 2);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment