Web crawler that downloads HTML content (for analysis) from a list of websites, exporting content as JSON lines
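How to run it: the help text in the script expects a compiled `./dist/websites.js` and a text file listing one website URL per line. A hedged invocation sketch (the file names `websites.txt` and `pages.jsonl` are hypothetical; the flags come from the script's own option parsing):

    ./dist/websites.js websites.txt --parallelism 4 --timeout 10000 --maxDepth 1 --verbose > pages.jsonl 2> crawl.log

Crawled pages are written to STDOUT as JSON lines, while progress messages and errors go to STDERR.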
#!/usr/bin/env node
import { IO, Try, Success, Failure, Either, Left, Right, Cancelable, Duration } from "funfix"
import { RequestResponse } from "request"
import * as fs from "fs"
import * as request from "request"
import * as Url from "url"
import * as cheerio from "cheerio"
import * as minimist from "minimist"

type Record<K extends string, T> = {
  [P in K]: T
}

type Options = {
  parallelism: number,
  maxDepth: number,
  timeout: Duration,
  maxRedirects: number,
  maxRetries: number,
  verbose: boolean
}

const CONTENT_TYPES: Record<string, boolean> = {
  'text/html': true,
  'application/xhtml+xml': true,
  'application/xml': true
}

const HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml',
  'Accept-Language': 'en-US,en;q=0.8',
  'Cache-Control': 'max-age=0',
  'Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
/** Prints an error to STDERR, prints help, then exits. */
function printErrorAndExit(error: string, code: number): void {
  console.error(`ERROR: ${error}\n`)
  printHelpAndExit(code)
}

/** Prints a command-line help message, then exits with the given code. */
function printHelpAndExit(code: number = 1) {
  console.error("Usage: ./dist/websites.js path/to/list.txt [options]")
  console.error("\nOPTIONS:")
  console.error("  --parallelism   specifies the maximum parallelism, defaults to 8")
  console.error("  --timeout       specifies the request timeout in millis, defaults to 5000")
  console.error("  --maxDepth      maximum depth (from website root) to crawl, defaults to 1")
  console.error("  --maxRedirects  maximum number of redirects followed on a single request")
  console.error("  --maxRetries    maximum number of times to retry requests on unexpected errors")
  console.error("  --verbose       prints URLs fetched to STDERR")
  console.error()
  process.exit(code)
}
/**
 * Given a URL, returns just its root (protocol, hostname and port),
 * with the path, query and hash parts stripped.
 */
function rootURLOf(url: string): string {
  const p = Url.parse(url)
  const protocol = p.protocol ? p.protocol + "//" : ""
  const hostname = p.hostname || ""
  const port = p.port ? `:${p.port}` : ""
  return `${protocol}${hostname}${port}`
}
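// Illustrative examples (URLs are hypothetical), assuming Node's legacy
// `url.parse` behavior, where `protocol` includes the trailing colon:
//
//   rootURLOf("https://example.com/some/path#section") === "https://example.com"
//   rootURLOf("http://example.com:8080/index.html")    === "http://example.com:8080"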
/** Small utility for resolving a (possibly relative) link against the current URL. */
function joinPath(currentURL: string, link: string): string {
  // Is the link just a fragment (hash)?
  if (link.startsWith('#')) return currentURL
  // Is the link a full URL?
  if (link.match(/^[a-zA-Z]+[:]\/\//)) return link
  // Is the link an absolute path?
  if (link.startsWith("/")) return rootURLOf(currentURL) + link
  // Is currentURL a path to a file?
  const m = currentURL.match(/^(.+?)\/\w+$/)
  if (m) return m[1] + "/" + link
  // Must be a directory + relative path
  const noEndSlash = currentURL.replace(/\/+$/, "")
  return noEndSlash + "/" + link
}
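// Illustrative examples (URLs are hypothetical):
//
//   joinPath("https://example.com/docs/intro", "#toc")   === "https://example.com/docs/intro"
//   joinPath("https://example.com/docs/intro", "/about") === "https://example.com/about"
//   joinPath("https://example.com/docs/intro", "next")   === "https://example.com/docs/next"
//   joinPath("https://example.com/docs/", "next")        === "https://example.com/docs/next"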
/** Exports `obj` as JSON to STDOUT and waits for the write callback. */
function writeLineToSTDOUT(obj: Object): IO<void> {
  return IO.async((ec, cb) => {
    process.stdout.write(JSON.stringify(obj) + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}

/** Writes log lines to STDERR. */
function writeLogToStdErr(msg: string): IO<void> {
  return IO.async((ec, cb) => {
    process.stderr.write(msg + "\n", "utf-8", (err: any) => {
      if (err) cb(Failure(err))
      else cb(Try.unit())
    })
  })
}
/** Returns the domain from the given URL, or null if not a valid URL. */
function getDomainOf(url: string): string | null {
  const host = url.match(/https?:\/\/([^\/]+)/)
  if (!host) return null
  const ext = host[1].match(/\.(com?\.\w{2,3}|org\.\w{2,3}|\w{2,6})$/i)
  if (!ext) return null
  // Note: the dots must be escaped as `\\.` inside the template literal,
  // otherwise they compile to `.` and match any character
  const domain = host[1].match(new RegExp(`(?:^|\\.)(\\w+)\\.${ext[1].replace('.', '\\.')}$`))
  if (!domain) return null
  return domain[1]
}
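// Illustrative examples (domains are hypothetical), assuming the intent is to
// extract the registrable name, ignoring subdomains and common public suffixes:
//
//   getDomainOf("https://blog.example.com/post") === "example"
//   getDomainOf("https://www.example.co.uk")     === "example"
//   getDomainOf("not a url")                     === null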
/** Returns `true` if the Content-Type header is HTML. */
function checkContentIsHtml(contentType: string | string[] | undefined): boolean {
  const cts = typeof contentType === 'string'
    ? [contentType]
    : (contentType || [])

  for (const t of cts) {
    const parts = t.toLowerCase().replace(/^\s+|\s+$/g, "").split(/\s*;\s*/)
    for (const p of parts)
      if (CONTENT_TYPES[p]) return true
  }
  return false
}
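// Illustrative examples (values are hypothetical):
//
//   checkContentIsHtml("text/html; charset=utf-8") === true
//   checkContentIsHtml("application/json")         === false
//   checkContentIsHtml(undefined)                  === false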
/**
 * Retries on failure, until it succeeds, or until the maximum
 * number of retries is reached.
 */
function retryOnFailure<A>(fa: IO<A>, maxRetries: number): IO<A> {
  return fa.recoverWith(err => maxRetries > 0
    ? retryOnFailure(fa, maxRetries - 1)
    : IO.raise(err))
}
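// Note that retries happen immediately, with no delay or backoff between
// attempts. A hypothetical usage sketch (`someFlakyIO` is not defined here):
//
//   // retries the action up to 3 additional times before re-raising the error
//   const resilient = retryOnFailure(someFlakyIO, 3)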
/** Fetch body of URL, but only for HTTP Status 200. */
function fetch(url: string, opts: Options): IO<string> {
  if (!url.match(/^https?[:]/i)) {
    return IO.raise(new Error(`Not a valid URL: ${url}`))
  }
  // Uses `Either` to signal failures that we can't recover from
  // (e.g. there's no point in retrying a 404 Not Found)
  const raw = IO.async<Either<Error, string>>((ec, cb) => {
    const options: request.CoreOptions = {
      headers: HEADERS,
      followAllRedirects: true,
      followRedirect: true,
      timeout: Math.round(opts.timeout.toMillis() / 2),
      maxRedirects: opts.maxRedirects
    }
    const task = request(url, options, (error, resp, body) => {
      if (error)
        return cb(Failure(error))
      else if (resp.statusCode !== 200)
        return cb(Success(Left(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))))
      else {
        const contentType = resp.headers['content-type']
        if (checkContentIsHtml(contentType) && typeof body === 'string')
          return cb(Success(Right(body)))
        else
          return cb(Success(Left(new Error(`Unexpected Content-Type: ${contentType}`))))
      }
    })
    return Cancelable.of(() => task.abort())
  })

  const log = opts.verbose ? writeLogToStdErr(url) : IO.unit()
  const io = retryOnFailure(raw, opts.maxRetries)
    .flatMap(r => r.fold(IO.raise, IO.pure))
  return log.followedBy(io)
}
/**
 * Some domains redirect (e.g. to HTTPS or to another hostname), so we need
 * to discover the website's new hostname or protocol, because we only want
 * to crawl the sites on our list, not the whole internet.
 */
function discoverRoot(url: string, opts: Options): IO<string> {
  const options: request.CoreOptions = {
    headers: HEADERS,
    followAllRedirects: false,
    followRedirect: false,
    method: "HEAD",
    timeout: opts.timeout.toMillis(),
    maxRedirects: 0
  }
  // Recursive loop that keeps going until HTTP 200 or until error
  function loop(current: string, count: number): IO<string> {
    if (count >= opts.maxRedirects)
      return IO.raise(new Error(`Maximum redirect count exceeded (${count})`))

    const req = IO.async<RequestResponse>((ec, cb) => {
      const task = request(current, options, (error, resp) => {
        if (error) return cb(Failure(error))
        return cb(Success(resp))
      })
      return Cancelable.of(() => task.abort())
    })

    const process: IO<Either<string, string>> =
      retryOnFailure(req, opts.maxRetries).flatMap(resp => {
        if (resp.statusCode === 301 || resp.statusCode === 302 || resp.statusCode === 303) {
          const location = resp.headers['location'] as string
          const next = joinPath(current, location)
          return IO.pure(Left(next))
        }
        else if (!resp.statusCode || resp.statusCode >= 400)
          return IO.raise(new Error(`Unexpected HTTP Status ${resp.statusCode} for ${url}`))
        else
          return IO.pure(Right(current))
      })

    // Loops until error, or HTTP 200
    return process.flatMap(either =>
      either.fold(
        l => loop(l, count + 1),
        r => IO.pure(r)
      ))
  }

  return loop(url, 0).flatMap(location =>
    getDomainOf(location) === getDomainOf(url)
      ? IO.pure(rootURLOf(location))
      : IO.raise(new Error(`Redirected, cannot follow at: ${location}`)))
}
/**
 * Crawls the given website.
 *
 * @param origin is the website read from the input websites file
 *
 * @param root represents the starting point, e.g. `https://google.com`
 *
 * @param urls is the stack of websites belonging to this hostname
 *        that are scheduled for crawling, plus the detected "depth"
 *        (path from the root)
 *
 * @param visited is a map of all visited URLs, to avoid visiting
 *        the same ones repeatedly
 *
 * @param opts are crawler options
 */
function crawlWebsite(
  origin: string,
  root: string,
  urls: [string, number][],
  visited: Record<string, boolean>,
  opts: Options): IO<void> {

  return IO.suspend(() => {
    let urlInfo: [string, number] | undefined
    const rootDomain = getDomainOf(root)

    while (urlInfo = urls.pop()) {
      const [url, currentDepth] = urlInfo
      const id = url.replace(/[\/]+$/, "")
      const shouldSkip =
        visited[id] ||
        currentDepth > opts.maxDepth ||
        rootDomain !== getDomainOf(url)

      if (shouldSkip) continue
      visited[id] = true

      return fetch(url, opts).attempt().flatMap(r => {
        const obj = r.fold(
          err => ({ origin, root, url, error: `Unexpected: ${err}` }) as Object,
          body => {
            const $ = cheerio.load(body)
            $("a").each((idx, elem) => {
              let another = (elem.attribs['href'] || "")
                .replace(/^\s+|\s+$/g, "") // trim it
                .replace(/[#][^$]+$/, "")  // eliminate the hash part

              if (another) {
                // Filtering: is it an HTML link? If not, then no crawling!
                const parsed = Url.parse(another)
                const isHttp = !parsed.protocol || parsed.protocol.match(/^http/i)
                const isBinary = another.match(/[.](jpe?g|jpe?g_large|png|gif|pdf)$/i)

                if (isHttp && !isBinary) {
                  const newUrl = joinPath(url, another)
                  if (!!newUrl && !visited[newUrl])
                    urls.push([newUrl, currentDepth + 1])
                }
              }
            })
            if (typeof body === 'string') return { origin, root, url, body }
            return null
          })

        const next = urls.length > 0
          ? crawlWebsite(origin, root, urls, visited, opts)
          : IO.unit()

        if (obj !== null) return writeLineToSTDOUT(obj).followedBy(next)
        return next
      })
    }
    return IO.unit()
  })
}
/**
 * Models a worker that pops from the stack of `websites` and crawls
 * them until there are no more websites left to crawl.
 */
function startWorkerLoop(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    const origin = websites.pop()
    if (!origin) return IO.unit()

    const crawl = discoverRoot(origin, opts).attempt().flatMap(root =>
      root.isLeft()
        ? writeLineToSTDOUT({ origin, error: `Failed root discovery: ${root.swap().get()}` })
        : crawlWebsite(origin, root.get(), [[root.get(), 0]], {}, opts)
    )
    // Keep going until no more websites in the queue!
    return crawl.flatMap(_ =>
      startWorkerLoop(websites, opts))
  })
}
/**
 * Main function that initiates workers to run in parallel.
 */
function crawlAllWebsites(websites: string[], opts: Options): IO<void> {
  return IO.suspend(() => {
    console.error(
      "Started crawling with " +
      `parallelism=${opts.parallelism}, ` +
      `maxDepth=${opts.maxDepth}, ` +
      `timeout=${opts.timeout.toMillis()}ms, ` +
      `maxRedirects=${opts.maxRedirects}, ` +
      `maxRetries=${opts.maxRetries} and ` +
      `verbose=${opts.verbose}`
    )
    const workers: Array<IO<void>> = []
    for (let i = 0; i < opts.parallelism; i++) {
      workers.push(startWorkerLoop(websites, opts))
    }
    // Workers run in parallel with one another, but each worker
    // crawls its own websites sequentially
    return IO.gather(workers).map(() => {})
  })
}
const main = IO.suspend<void>(() => {
  const argv = minimist(process.argv.slice(2))
  if (!argv["_"][0]) {
    printErrorAndExit("Missing path to text file with the list of websites!", 2)
  } else if (!fs.existsSync(argv["_"][0])) {
    printErrorAndExit(`File not found: ${argv["_"][0]}`, 3)
  }

  const opts: Options = {
    parallelism: parseInt(argv['parallelism'] || 8),
    timeout: Duration.of(parseInt(argv['timeout'] || 5000)),
    maxDepth: parseInt(argv['maxDepth'] || 1),
    maxRedirects: parseInt(argv['maxRedirects'] || 10),
    maxRetries: parseInt(argv['maxRetries'] || 2),
    verbose: !!argv['verbose']
  }
  // Expecting websites to be listed in the file one per line
  const websites = fs
    .readFileSync(argv["_"][0], { encoding: "utf-8" })
    .split(/\s*\r?\n\s*/)
    .map(l => l.replace(/^\s+|\s+$/g, ""))
    .filter(l => !!l && l.match(/^https?/i))

  return crawlAllWebsites(websites, opts)
})

main.run().onComplete(r => r.fold(
  console.error,
  () => console.error("Done!"))
)