Web crawler that downloads HTML content (for analysis) from a list of websites, exporting content as JSON lines
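How to run it: the help text in the script expects a compiled `./dist/websites.js` and a text file listing one website URL per line. A hedged invocation sketch (the file names `websites.txt` and `pages.jsonl` are hypothetical; the flags come from the script's own option parsing):

    ./dist/websites.js websites.txt --parallelism 4 --timeout 10000 --maxDepth 1 --verbose > pages.jsonl 2> crawl.log

Crawled pages are written to STDOUT as JSON lines, while progress messages and errors go to STDERR.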
#!/usr/bin/env node
import { IO, Try, Success, Failure, Either, Left, Right, Cancelable, Duration } from "funfix"
import { RequestResponse } from "request"
import * as fs from "fs"
import * as request from "request"
import * as Url from "url"
import * as cheerio from "cheerio"
import * as minimist from "minimist"

type Record<K extends string, T> = {
  [P in K]: T
}

type Options = {
  parallelism: number,
  maxDepth: number,
  timeout: Duration,
  maxRedirects: number,
  maxRetries: number,
  verbose: boolean
}

const CONTENT_TYPES: Record<string, boolean> = {
  'text/html': true,
  'application/xhtml+xml': true,
  'application/xml': true
}

const HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml',
  'Accept-Language': 'en-US,en;q=0.8',
  'Cache-Control': 'max-age=0',
  'Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
/** Prints an error to STDERR, prints help, then exits. */
function printErrorAndExit(error: string, code: number): void {
  console.error(`ERROR: ${error}\n`)
  printHelpAndExit(code)
}

/** Prints a command-line help message, then exits with the given code. */
function printHelpAndExit(code: number = 1) {
  console.error("Usage: ./dist/websites.js path/to/list.txt [options]")
  console.error("\nOPTIONS:")
  console.error("  --parallelism   specifies the maximum parallelism, defaults to 8")
  console.error("  --timeout       specifies the request timeout in millis, defaults to 5000")
  console.error("  --maxDepth      maximum depth (from website root) to crawl, defaults to 1")
  console.error("  --maxRedirects  maximum number of redirects followed on a single request")
  console.error("  --maxRetries    maximum number of times to retry requests on unexpected errors")
  console.error("  --verbose       prints URLs fetched to STDERR")
  console.error()
  process.exit(code)
}
/**
 * Given a URL, returns just its root (protocol, hostname and port),
 * with the path, query and hash parts stripped.
 */
function rootURLOf(url: string): string {
  const p = Url.parse(url)
  const protocol = p.protocol ? p.protocol + "//" : ""
  const hostname = p.hostname || ""
  const port = p.port ? `:${p.port}` : ""
  return `${protocol}${hostname}${port}`
}
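// Illustrative examples (URLs are hypothetical), assuming Node's legacy
// `url.parse` behavior, where `protocol` includes the trailing colon:
//
//   rootURLOf("https://example.com/some/path#section") === "https://example.com"
//   rootURLOf("http://example.com:8080/index.html")    === "http://example.com:8080"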
/** Small utility for resolving a (possibly relative) link against the current URL. */
function joinPath(currentURL: string, link: string): string {
  // Is the link just a fragment (hash)?
  if (link.startsWith('#')) return currentURL
  // Is the link a full URL?
  if (link.match(/^[a-zA-Z]+[:]\/\//)) return link
  // Is the link an absolute path?
  if (link.startsWith("/")) return rootURLOf(currentURL) + link
  // Is currentURL a path to a file?
  const m = currentURL.match(/^(.+?)\/\w+$/)
  if (m) return m[1] + "/" + link
  // Must be a directory + relative path
  const noEndSlash = currentURL.replace(/\/+$/, "")
  return noEndSlash + "/" + link
}
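// Illustrative examples (URLs are hypothetical):
//
//   joinPath("https://example.com/docs/intro", "#toc")   === "https://example.com/docs/intro"
//   joinPath("https://example.com/docs/intro", "/about") === "https://example.com/about"
//   joinPath("https://example.com/docs/intro", "next")   === "https://example.com/docs/next"
//   joinPath("https://example.com/docs/", "next")        === "https://example.com/docs/next"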
/** Exports `obj` as JSON to STDOUT and waits for the write callback. */
function writeLineToSTDOUT(obj: Object): IO<void> {
  return IO.async((ec, cb) => {
    process.stdout.write(JSON.stringify(obj) + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}

/** Writes log lines to STDERR. */
function writeLogToStdErr(msg: string): IO<void> {
  return IO.async((ec, cb) => {
    process.stderr.write(msg + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}
/** Returns the domain from the given URL, or null if not a valid URL. */
function getDomainOf(url: string): string | null {
  const host = url.match(/https?:\/\/([^\/]+)/)
  if (!host) return null
  const ext = host[1].match(/\.(com?\.\w{2,3}|org\.\w{2,3}|\w{2,6})$/i)
  if (!ext) return null
  // Note: the dots must be escaped as `\\.` inside the template literal,
  // otherwise they compile to `.` and match any character
  const domain = host[1].match(new RegExp(`(?:^|\\.)(\\w+)\\.${ext[1].replace('.', '\\.')}$`))
  if (!domain) return null
  return domain[1]
}
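// Illustrative examples (domains are hypothetical), assuming the intent is to
// extract the registrable name, ignoring subdomains and common public suffixes:
//
//   getDomainOf("https://blog.example.com/post") === "example"
//   getDomainOf("https://www.example.co.uk")     === "example"
//   getDomainOf("not a url")                     === null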
/** Returns `true` if the Content-Type header is HTML. */
function checkContentIsHtml(contentType: string | string[] | undefined): boolean {
  const cts = typeof contentType === 'string'
    ? [contentType]
    : (contentType || [])

  for (const t of cts) {
    const parts = t.toLowerCase().replace(/^\s+|\s+$/g, "").split(/\s*;\s*/)
    for (const p of parts)
      if (CONTENT_TYPES[p]) return true
  }
  return false
}
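// Illustrative examples (values are hypothetical):
//
//   checkContentIsHtml("text/html; charset=utf-8") === true
//   checkContentIsHtml("application/json")         === false
//   checkContentIsHtml(undefined)                  === false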
/**
 * Retries on failure, until it succeeds, or until the maximum
 * number of retries is reached.
 */
function retryOnFailure<A>(fa: IO<A>, maxRetries: number): IO<A> {
  return fa.recoverWith(err => maxRetries > 0
    ? retryOnFailure(fa, maxRetries - 1)
    : IO.raise(err))
}
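// Note that retries happen immediately, with no delay or backoff between
// attempts. A hypothetical usage sketch (`someFlakyIO` is not defined here):
//
//   // retries the action up to 3 additional times before re-raising the error
//   const resilient = retryOnFailure(someFlakyIO, 3)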
/** Fetch body of URL, but only for HTTP Status 200. */
function fetch(url: string, opts: Options): IO<string> {
  if (!url.match(/^https?[:]/i)) {
    return IO.raise(new Error(`Not a valid URL: ${url}`))
  }
  // Uses `Either` to signal failures that we can't recover from
  // (e.g. there's no point in retrying a 404 Not Found)
  const raw = IO.async<Either<Error, string>>((ec, cb) => {
    const options: request.CoreOptions = {
      headers: HEADERS,
      followAllRedirects: true,
      followRedirect: true,
      timeout: Math.round(opts.timeout.toMillis() / 2),
      maxRedirects: opts.maxRedirects
    }
    const task = request(url, options, (error, resp, body) => {
      if (error)
        return cb(Failure(error))
      else if (resp.statusCode !== 200)
        return cb(Success(Left(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))))
      else {
        const contentType = resp.headers['content-type']
        if (checkContentIsHtml(contentType) && typeof body === 'string')
          return cb(Success(Right(body)))
        else
          return cb(Success(Left(new Error(`Unexpected Content-Type: ${contentType}`))))
      }
    })
    return Cancelable.of(() => task.abort())
  })

  const log = opts.verbose ? writeLogToStdErr(url) : IO.unit()
  const io = retryOnFailure(raw, opts.maxRetries)
    .flatMap(r => r.fold(IO.raise, IO.pure))
  return log.followedBy(io)
}
/**
 * Some domains redirect (e.g. to HTTPS or to another hostname), so we need
 * to discover the website's new hostname or protocol, because we only want
 * to crawl the sites on our list, not the whole internet.
 */
function discoverRoot(url: string, opts: Options): IO<string> {
  const options: request.CoreOptions = {
    headers: HEADERS,
    followAllRedirects: false,
    followRedirect: false,
    method: "HEAD",
    timeout: opts.timeout.toMillis(),
    maxRedirects: 0
  }
  // Recursive loop that keeps going until HTTP 200 or until error
  function loop(current: string, count: number): IO<string> {
    if (count >= opts.maxRedirects)
      return IO.raise(new Error(`Maximum redirect count exceeded (${count})`))

    const req = IO.async<RequestResponse>((ec, cb) => {
      const task = request(current, options, (error, resp) => {
        if (error) return cb(Failure(error))
        return cb(Success(resp))
      })
      return Cancelable.of(() => task.abort())
    })

    const process: IO<Either<string, string>> =
      retryOnFailure(req, opts.maxRetries).flatMap(resp => {
        if (resp.statusCode === 301 || resp.statusCode === 302 || resp.statusCode === 303) {
          const location = resp.headers['location'] as string
          const next = joinPath(current, location)
          return IO.pure(Left(next))
        }
        else if (!resp.statusCode || resp.statusCode >= 400)
          return IO.raise(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))
        else
          return IO.pure(Right(current))
      })

    // Loops until error, or HTTP 200
    return process.flatMap(either =>
      either.fold(
        l => loop(l, count + 1),
        r => IO.pure(r)
      ))
  }

  return loop(url, 0).flatMap(location =>
    getDomainOf(location) === getDomainOf(url)
      ? IO.pure(rootURLOf(location))
      : IO.raise(new Error(`Redirected, cannot follow at: ${location}`)))
}
/**
 * Crawls the given website.
 *
 * @param origin is the website read from the input websites file
 *
 * @param root represents the starting point, e.g. `https://google.com`
 *
 * @param urls is the stack of websites belonging to this hostname
 *        that are scheduled for crawling, plus the detected "depth"
 *        (path from the root)
 *
 * @param visited is a map of all visited URLs, to avoid visiting
 *        the same ones repeatedly
 *
 * @param opts are crawler options
 */
function crawlWebsite(
  origin: string,
  root: string,
  urls: [string, number][],
  visited: Record<string, boolean>,
  opts: Options): IO<void> {

  return IO.suspend(() => {
    let urlInfo: [string, number] | undefined
    const rootDomain = getDomainOf(root)

    while (urlInfo = urls.pop()) {
      const [url, currentDepth] = urlInfo
      const id = url.replace(/[\/]+$/, "")
      const shouldSkip =
        visited[id] ||
        currentDepth > opts.maxDepth ||
        rootDomain !== getDomainOf(url)

      if (shouldSkip) continue
      visited[id] = true

      return fetch(url, opts).attempt().flatMap(r => {
        const obj = r.fold(
          err => ({ origin, root, url, error: `Unexpected: ${err}` }) as Object,
          body => {
            const $ = cheerio.load(body)
            $("a").each((idx, elem) => {
              let another = (elem.attribs['href'] || "")
                .replace(/^\s+|\s+$/g, "") // trim it
                .replace(/[#][^$]+$/, "")  // eliminate the hash part

              if (another) {
                // Filtering: is it an HTML link? If not, then no crawling!
                const parsed = Url.parse(another)
                const isHttp = !parsed.protocol || parsed.protocol.match(/^http/i)
                const isBinary = another.match(/[.](jpe?g|jpe?g_large|png|gif|pdf)$/i)

                if (isHttp && !isBinary) {
                  const newUrl = joinPath(url, another)
                  if (!!newUrl && !visited[newUrl])
                    urls.push([newUrl, currentDepth + 1])
                }
              }
            })
            if (typeof body === 'string') return { origin, root, url, body }
            return null
          })

        const next = urls.length > 0
          ? crawlWebsite(origin, root, urls, visited, opts)
          : IO.unit()

        if (obj !== null) return writeLineToSTDOUT(obj).followedBy(next)
        return next
      })
    }
    return IO.unit()
  })
}
/**
 * Models a worker that pops from the stack of `websites` and crawls
 * them until there are no more websites left to crawl.
 */
function startWorkerLoop(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    const origin = websites.pop()
    if (!origin) return IO.unit()

    const crawl = discoverRoot(origin, opts).attempt().flatMap(root =>
      root.isLeft()
        ? writeLineToSTDOUT({ origin, error: `Failed root discovery: ${root.swap().get()}` })
        : crawlWebsite(origin, root.get(), [[root.get(), 0]], {}, opts)
    )
    // Keep going until no more websites in the queue!
    return crawl.flatMap(_ =>
      startWorkerLoop(websites, opts))
  })
}
/**
 * Main function that initiates workers to run in parallel.
 */
function crawlAllWebsites(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    console.error(
      "Started crawling with " +
      `parallelism=${opts.parallelism}, ` +
      `maxDepth=${opts.maxDepth}, ` +
      `timeout=${opts.timeout.toMillis()}ms, ` +
      `maxRedirects=${opts.maxRedirects}, ` +
      `maxRetries=${opts.maxRetries} and ` +
      `verbose=${opts.verbose}`
    )
    const workers: Array<IO<void>> = []
    for (let i = 0; i < opts.parallelism; i++) {
      workers.push(startWorkerLoop(websites, opts))
    }
    // Workers run in parallel with one another, but each worker
    // crawls its own websites sequentially
    return IO.gather(workers).map(() => {})
  })
}
const main = IO.suspend<void>(() => {
  const argv = minimist(process.argv.slice(2))
  if (!argv["_"][0]) {
    printErrorAndExit("Missing path to text file with the list of websites!", 2)
  } else if (!fs.existsSync(argv["_"][0])) {
    printErrorAndExit(`File not found: ${argv["_"][0]}`, 3)
  }

  const opts: Options = {
    parallelism: parseInt(argv['parallelism'] || 8),
    timeout: Duration.of(parseInt(argv['timeout'] || 5000)),
    maxDepth: parseInt(argv['maxDepth'] || 1),
    maxRedirects: parseInt(argv['maxRedirects'] || 10),
    maxRetries: parseInt(argv['maxRetries'] || 2),
    verbose: !!argv['verbose']
  }
  // Expecting websites to be listed in the file one per line
  const websites = fs
    .readFileSync(argv["_"][0], { encoding: "utf-8" })
    .split(/\s*\r?\n\s*/)
    .map(l => l.replace(/^\s+|\s+$/g, ""))
    .filter(l => !!l && l.match(/^https?/i))

  return crawlAllWebsites(websites, opts)
})

main.run().onComplete(r => r.fold(
  console.error,
  () => console.error("Done!"))
)