@alexandru
Last active January 7, 2021 12:42
Web crawler that downloads HTML content (for analysis) from a list of websites, exporting content as JSON lines
#!/usr/bin/env node
import { IO, Try, Success, Failure, Either, Left, Right, Cancelable, Duration } from "funfix"
import { RequestResponse } from "request"
import * as fs from "fs"
import * as request from "request"
import * as Url from "url"
import * as cheerio from "cheerio"
import * as minimist from "minimist"
type Record<K extends string, T> = {
  [P in K]: T
}
type Options = {
  parallelism: number,
  maxDepth: number,
  timeout: Duration,
  maxRedirects: number,
  maxRetries: number,
  verbose: boolean
}
const CONTENT_TYPES: Record<string, boolean> = {
  'text/html': true,
  'application/xhtml+xml': true,
  'application/xml': true
}
const HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml',
  'Accept-Language': 'en-US,en;q=0.8',
  'Cache-Control': 'max-age=0',
  'Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
/** Prints an error to STDERR, prints help, then exits. */
function printErrorAndExit(error: string, code: number): void {
  console.error(`ERROR: ${error}\n`)
  printHelpAndExit(code)
}
/** Prints a command-line help message. */
function printHelpAndExit(code: number = 1) {
  console.error("Usage: ./dist/websites.js path/to/list.txt [options]")
  console.error("\nOPTIONS:")
  console.error("  --parallelism   specifies the maximum parallelism, defaults to 8")
  console.error("  --timeout       specifies the request timeout in millis, defaults to 5000")
  console.error("  --maxDepth      maximum depth (from website root) to crawl, defaults to 1")
  console.error("  --maxRedirects  maximum number of redirects followed on a single request")
  console.error("  --maxRetries    maximum number of times to retry requests on unexpected errors")
  console.error("  --verbose       prints URLs fetched to STDERR")
  console.error()
  process.exit(code)
}
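// Example invocation (file names are illustrative; crawled pages go to STDOUT
// as JSON lines, so redirecting STDOUT to a file is the usual way to run it):
//
//   ./dist/websites.js ./websites.txt --parallelism 16 --timeout 10000 --verbose > pages.jsonl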
/**
* Given an `url`, return it with the path and hash parts stripped.
*/
function rootURLOf(url: string): string {
  const p = Url.parse(url)
  const protocol = p.protocol ? p.protocol + "//" : ""
  const hostname = p.hostname || ""
  const port = p.port ? `:${p.port}` : ""
  return `${protocol}${hostname}${port}`
}
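// For example (illustrative URLs):
//
//   rootURLOf("https://example.com:8080/some/path#section") // => "https://example.com:8080"
//   rootURLOf("http://example.com/")                        // => "http://example.com"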
/** Stupid util for building an URL from a relative path. */
function joinPath(currentURL: string, link: string): string {
  // Is link a hash bang?
  if (link.startsWith('#')) return currentURL
  // Is link a full URL?
  if (link.match(/^[a-zA-Z]+[:]\/\//)) return link
  // Is link an absolute path?
  if (link.startsWith("/")) return rootURLOf(currentURL) + link
  // Is currentURL a path to a file?
  const m = currentURL.match(/^(.+?)\/\w+$/)
  if (m) return m[1] + "/" + link
  // Must be a directory + relative path
  const noEndSlash = currentURL.replace(/\/+$/, "")
  return noEndSlash + "/" + link
}
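// For example (illustrative URLs):
//
//   joinPath("https://example.com/docs/intro", "#top")    // => "https://example.com/docs/intro"
//   joinPath("https://example.com/docs/intro", "/about")  // => "https://example.com/about"
//   joinPath("https://example.com/docs/intro", "part2")   // => "https://example.com/docs/part2"
//   joinPath("https://example.com/docs/", "part2")        // => "https://example.com/docs/part2"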
/** Exports `obj` as JSON to STDOUT and waits for callback. */
function writeLineToSTDOUT(obj: Object): IO<void> {
  return IO.async((ec, cb) => {
    process.stdout.write(JSON.stringify(obj) + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}
/** Writes log lines to STDERR. */
function writeLogToStdErr(msg: string): IO<void> {
  return IO.async((ec, cb) => {
    process.stderr.write(msg + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}
/** Returns the domain from the given URL, or null if not a valid URL. */
function getDomainOf(url: string): string | null {
  const host = url.match(/https?:\/\/([^\/]+)/)
  if (!host) return null
  const ext = host[1].match(/\.(com?\.\w{2,3}|org\.\w{2,3}|\w{2,6})$/i)
  if (!ext) return null
  const domain = host[1].match(new RegExp(`(?:^|\\.)(\\w+)\\.${ext[1].replace('.', '\\.')}$`))
  if (!domain) return null
  return domain[1]
}
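// For example (illustrative URLs):
//
//   getDomainOf("https://www.example.com/about") // => "example"
//   getDomainOf("https://news.bbc.co.uk/")       // => "bbc"
//   getDomainOf("not a url")                     // => null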
/** Returns `true` if the Content-Type header is HTML. */
function checkContentIsHtml(contentType: string | string[] | undefined): boolean {
  const cts = typeof contentType === 'string'
    ? [contentType]
    : (contentType || [])
  for (const t of cts) {
    const parts = t.toLowerCase().replace(/^\s+|\s+$/g, "").split(/\s*;\s*/)
    for (const p of parts)
      if (CONTENT_TYPES[p]) return true
  }
  return false
}
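// For example:
//
//   checkContentIsHtml("text/html; charset=utf-8") // => true
//   checkContentIsHtml("application/json")         // => false
//   checkContentIsHtml(undefined)                  // => false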
/**
* Retries on failure, until it succeeds, or until the maximum
* number of retries is reached.
*/
function retryOnFailure<A>(fa: IO<A>, maxRetries: number): IO<A> {
  return fa.recoverWith(err => maxRetries > 0
    ? retryOnFailure(fa, maxRetries - 1)
    : IO.raise(err))
}
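// Note that retries happen immediately, with no delay or backoff between attempts.
// Illustrative usage (`someIO` is a placeholder for any IO value):
//
//   retryOnFailure(someIO, 3) // evaluates someIO up to 4 times before giving up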
/** Fetch body of URL, but only for HTTP Status 200. */
function fetch(url: string, opts: Options): IO<string> {
  if (!url.match(/^https?[:]/i)) {
    return IO.raise(new Error(`Not a valid URL: ${url}`))
  }
  // Uses `Either` to signal failures that we can't recover from
  // (e.g. there's no point in retrying a 404 Not Found)
  const raw = IO.async<Either<Error, string>>((ec, cb) => {
    const options: request.CoreOptions = {
      headers: HEADERS,
      followAllRedirects: true,
      followRedirect: true,
      timeout: Math.round(opts.timeout.toMillis() / 2),
      maxRedirects: opts.maxRedirects
    }
    const task = request(url, options, (error, resp, body) => {
      if (error)
        return cb(Failure(error))
      else if (resp.statusCode !== 200)
        return cb(Success(Left(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))))
      else {
        const contentType = resp.headers['content-type']
        if (checkContentIsHtml(contentType) && typeof body === 'string')
          return cb(Success(Right(body)))
        else
          return cb(Success(Left(new Error(`Unexpected Content-Type: ${contentType}`))))
      }
    })
    return Cancelable.of(() => task.abort())
  })
  const log = opts.verbose ? writeLogToStdErr(url) : IO.unit()
  const io = retryOnFailure(raw, opts.maxRetries)
    .flatMap(r => r.fold(IO.raise, IO.pure))
  return log.followedBy(io)
}
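// Being an IO, nothing is fetched until the value is run. Illustrative usage
// (assuming an `opts` value built like in `main` below):
//
//   fetch("https://example.com", opts).run().onComplete(r =>
//     r.fold(console.error, body => console.error(`Got ${body.length} chars`)))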
/**
 * Some domains now redirect elsewhere, so we need to find the website's
 * new hostname or protocol, because we only want to crawl our list,
 * not the whole internet.
 */
function discoverRoot(url: string, opts: Options): IO<string> {
  const options: request.CoreOptions = {
    headers: HEADERS,
    followAllRedirects: false,
    followRedirect: false,
    method: "HEAD",
    timeout: opts.timeout.toMillis(),
    maxRedirects: 0
  }
  // Recursive loop that keeps going until HTTP 200 or until error
  function loop(current: string, count: number): IO<string> {
    if (count >= opts.maxRedirects)
      return IO.raise(new Error(`Maximum redirect count exceeded (${count})`))
    const req = IO.async<RequestResponse>((ec, cb) => {
      const task = request(current, options, (error, resp) => {
        if (error) return cb(Failure(error))
        return cb(Success(resp))
      })
      return Cancelable.of(() => task.abort())
    })
    const process: IO<Either<string, string>> =
      retryOnFailure(req, opts.maxRetries).flatMap(resp => {
        if (resp.statusCode === 301 || resp.statusCode === 302 || resp.statusCode === 303) {
          const location = resp.headers['location'] as string
          const next = joinPath(current, location)
          return IO.pure(Left(next))
        }
        else if (!resp.statusCode || resp.statusCode >= 400)
          return IO.raise(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))
        else
          return IO.pure(Right(current))
      })
    // Loops until error, or HTTP 200
    return process.flatMap(either =>
      either.fold(
        l => loop(l, count + 1),
        r => IO.pure(r)
      ))
  }
  return loop(url, 0).flatMap(location =>
    getDomainOf(location) === getDomainOf(url)
      ? IO.pure(rootURLOf(location))
      : IO.raise(new Error(`Redirected, cannot follow at: ${location}`)))
}
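// Illustrative behaviour (actual results depend on the live website's redirects):
//
//   discoverRoot("http://example.com", opts) might yield "https://www.example.com"
//   if the site 301-redirects to its HTTPS/www variant, but fails if the
//   redirect chain ends up on a different domain.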
/**
* Crawls the given website.
*
* @param origin is the website read from the input websites file
*
* @param root represents the starting point, e.g. `https://google.com`
*
* @param urls is the stack of websites belonging to this hostname
* that are scheduled for crawling, plus the detected "depth"
* (path from the root)
*
* @param visited is a map of all visited URLs, to avoid visiting
* the same ones repeatedly
*
* @param opts are crawler options
*/
function crawlWebsite(
  origin: string,
  root: string,
  urls: [string, number][],
  visited: Record<string, boolean>,
  opts: Options): IO<void> {

  return IO.suspend(() => {
    let urlInfo: [string, number] | undefined
    const rootDomain = getDomainOf(root)
    while (urlInfo = urls.pop()) {
      const [url, currentDepth] = urlInfo
      const id = url.replace(/[\/]+$/, "")
      const shouldSkip =
        visited[id] ||
        currentDepth > opts.maxDepth ||
        rootDomain !== getDomainOf(url)
      if (shouldSkip) continue
      visited[id] = true
      return fetch(url, opts).attempt().flatMap(r => {
        const obj = r.fold(
          err => ({ origin, root, url, error: `Unexpected: ${err}` }) as Object,
          body => {
            const $ = cheerio.load(body)
            $("a").each((idx, elem) => {
              let another = (elem.attribs['href'] || "")
                .replace(/^\s+|\s+$/g, "") // trim it
                .replace(/[#][^$]+$/, "") // eliminate the hash part
              if (another) {
                // Filtering: is it an HTML link? If not, then no crawling!
                const parsed = Url.parse(another)
                const isHttp = !parsed.protocol || parsed.protocol.match(/^http/i)
                const isBinary = another.match(/[.](jpe?g|jpe?g_large|png|gif|pdf)$/i)
                if (isHttp && !isBinary) {
                  const newUrl = joinPath(url, another)
                  if (!!newUrl && !visited[newUrl])
                    urls.push([newUrl, currentDepth + 1])
                }
              }
            })
            if (typeof body === 'string') return { origin, root, url, body }
            return null
          })
        const next = urls.length > 0
          ? crawlWebsite(origin, root, urls, visited, opts)
          : IO.unit()
        if (obj !== null) return writeLineToSTDOUT(obj).followedBy(next)
        return next
      })
    }
    return IO.unit()
  })
}
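// Each crawled page is emitted on STDOUT as a single JSON line, roughly of this
// shape (field values below are made up for illustration):
//
//   {"origin":"https://example.com","root":"https://example.com","url":"https://example.com/about","body":"<html>…"}
//
// or, when fetching a page fails:
//
//   {"origin":"https://example.com","root":"https://example.com","url":"https://example.com/broken","error":"Unexpected: Error: …"}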
/**
* Models a worker that pops from the stack of `websites` and crawls
* them until there are no more websites left to crawl.
*/
function startWorkerLoop(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    const origin = websites.pop()
    if (!origin) return IO.unit()
    const crawl = discoverRoot(origin, opts).attempt().flatMap(root =>
      root.isLeft()
        ? writeLineToSTDOUT({ origin, error: `Failed root discovery: ${root.swap().get()}` })
        : crawlWebsite(origin, root.get(), [[root.get(), 0]], {}, opts)
    )
    // Keep going until no more websites in the queue!
    return crawl.flatMap(_ =>
      startWorkerLoop(websites, opts))
  })
}
/**
* Main function that initiates workers to run in parallel.
*/
function crawlAllWebsites(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    console.error(
      "Started crawling with " +
      `parallelism=${opts.parallelism}, ` +
      `maxDepth=${opts.maxDepth}, ` +
      `timeout=${opts.timeout.toMillis()}ms, ` +
      `maxRedirects=${opts.maxRedirects}, ` +
      `maxRetries=${opts.maxRetries} and ` +
      `verbose=${opts.verbose}`
    )
    const workers: Array<IO<void>> = []
    for (let i = 0; i < opts.parallelism; i++) {
      workers.push(startWorkerLoop(websites, opts))
    }
    // Workers run in parallel with each other, but each worker is sequential
    return IO.gather(workers).map(() => {})
  })
}
const main = IO.suspend<void>(() => {
  const argv = minimist(process.argv.slice(2))
  if (!argv["_"][0]) {
    printErrorAndExit("Missing path to text file with the list of websites!", 2)
  } else if (!fs.existsSync(argv["_"][0])) {
    printErrorAndExit(`File not found: ${argv["_"][0]}`, 3)
  }
  const opts: Options = {
    parallelism: parseInt(argv['parallelism'] || 8),
    timeout: Duration.of(parseInt(argv['timeout'] || 5000)),
    maxDepth: parseInt(argv['maxDepth'] || 1),
    maxRedirects: parseInt(argv['maxRedirects'] || 10),
    maxRetries: parseInt(argv['maxRetries'] || 2),
    verbose: !!argv['verbose']
  }
  // Expecting websites to be listed in the file, one per line
  const websites = fs
    .readFileSync(argv["_"][0], { encoding: "utf-8" })
    .split(/\s*\r?\n\s*/)
    .map(l => l.replace(/^\s+|\s+$/g, ""))
    .filter(l => !!l && l.match(/^https?/i))
  return crawlAllWebsites(websites, opts)
})
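// The input file passed as the first argument should list one website URL per
// line, e.g. (made-up list):
//
//   https://example.com
//   https://example.org
//
// Blank lines and lines that don't start with http(s) are filtered out.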
main.run().onComplete(r => r.fold(
  console.error,
  () => console.error("Done!"))
)

ghost commented Jan 7, 2021

I need this in HTML5 please
