Skip to content

Instantly share code, notes, and snippets.

@chriseidhof
Last active October 23, 2023 20:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chriseidhof/6f94422d66b0d5b796b841af25040271 to your computer and use it in GitHub Desktop.
Save chriseidhof/6f94422d66b0d5b796b841af25040271 to your computer and use it in GitHub Desktop.
//
// Crawler.swift
// Crawler
//
// Created by Chris Eidhof on 21.12.21.
//
import Foundation
/// A set-backed work queue shared by concurrent workers.
///
/// Workers call `dequeue()` to obtain a URL, and `finish(_:)` once they are done
/// with it. While the queue is empty but other URLs are still in progress, a
/// caller of `dequeue()` is suspended until new items are added or all work is
/// finished.
actor Queue {
    /// URLs waiting to be handed to a worker.
    var items: Set<URL> = []
    /// URLs returned by `dequeue()` that have not yet been passed to `finish(_:)`.
    var inProgress: Set<URL> = []
    /// Closures that resume workers currently suspended inside `dequeue()`.
    var waitingWorkers: [() -> ()] = []
    /// Every URL ever accepted by `add(newItems:)`. Once a URL has been popped
    /// from `items` the set no longer remembers it, so this is what prevents the
    /// same URL from being queued (and processed) a second time.
    private var seen: Set<URL> = []

    /// Returns the next URL to process, or `nil` once the queue is `done`.
    ///
    /// Suspends while there is nothing to hand out but other URLs are still in
    /// progress (they may still produce new items).
    func dequeue() async -> URL? {
        while !done {
            if let result = items.popFirst() {
                inProgress.insert(result)
                return result
            }
            // Park this worker. The continuation must be wrapped in a closure so
            // it is resumed later by `resumeWorkers()`, not right here.
            await withCheckedContinuation { (cont: CheckedContinuation<Void, Never>) in
                waitingWorkers.append { cont.resume() }
            }
            // Woken up: loop around to re-check `done` and `items`.
        }
        return nil
    }

    /// Marks `item` as no longer in progress and, if that completes all work,
    /// wakes suspended workers so their `dequeue()` calls can return `nil`.
    func finish(_ item: URL) {
        inProgress.remove(item)
        if done { resumeWorkers() }
    }

    /// `true` when nothing is queued and nothing is in progress.
    var done: Bool {
        items.isEmpty && inProgress.isEmpty
    }

    /// Queues every URL in `newItems` that has not been queued before, then
    /// wakes all suspended workers.
    func add(newItems: [URL]) {
        for item in newItems where seen.insert(item).inserted {
            items.insert(item)
        }
        resumeWorkers()
    }

    /// Resumes every suspended worker and clears the waiting list.
    func resumeWorkers() {
        let parked = waitingWorkers
        waitingWorkers = []
        for resume in parked { resume() }
    }
}
/// Crawls pages starting from a URL and publishes every fetched page in `state`.
@MainActor
final class Crawler: ObservableObject {
    /// All pages fetched so far, keyed by the page's URL.
    @Published var state: [URL: Page] = [:]

    /// Stores `page` under its URL, replacing any previous entry for that URL.
    func add(_ page: Page) {
        state[page.url] = page
    }

    /// The URLs of all pages stored in `state`.
    func seenURLs() -> Set<URL> {
        Set(state.keys)
    }

    /// Crawls every page reachable from `url` whose absolute string starts with
    /// `url`'s absolute string.
    ///
    /// - Parameters:
    ///   - url: The start page; its absolute string is also the prefix that
    ///     outgoing links must have in order to be followed.
    ///   - numberOfWorkers: How many worker tasks fetch pages concurrently.
    /// - Throws: The first error thrown by a worker, after all workers have
    ///   stopped. A worker that fails stops taking jobs; the others keep going.
    func crawl(url: URL, numberOfWorkers: Int = 4) async throws {
        let basePrefix = url.absoluteString
        let queue = Queue()
        await queue.add(newItems: [url])
        try await withThrowingTaskGroup(of: Void.self) { group in
            for i in 0..<numberOfWorkers {
                group.addTask {
                    var numberOfJobs = 0
                    while let job = await queue.dequeue() {
                        do {
                            let page = try await URLSession.shared.page(from: job)
                            let seen = await self.seenURLs()
                            let newURLs = page.outgoingLinks.filter { link in
                                link.absoluteString.hasPrefix(basePrefix) && !seen.contains(link)
                            }
                            await queue.add(newItems: newURLs)
                            await self.add(page)
                        } catch {
                            // Always release the job: otherwise the queue never
                            // becomes `done` and workers suspended in `dequeue()`
                            // would wait forever.
                            await queue.finish(job)
                            throw error
                        }
                        await queue.finish(job)
                        numberOfJobs += 1
                    }
                    print("Worker \(i) did \(numberOfJobs) jobs")
                }
            }
            // Without this, errors thrown by child tasks are silently dropped
            // when the group body returns.
            try await group.waitForAll()
        }
    }
}
extension URLSession {
    /// Downloads `url`, parses the response body as HTML and extracts the page
    /// title and all anchor targets.
    ///
    /// - Parameter url: The page to load; also used as the base for resolving
    ///   relative `href` values.
    /// - Returns: A `Page` for `url` with the text of the first `<title>` node
    ///   (empty when there is none) and the simplified targets of every
    ///   `<a href>` whose value forms a URL.
    /// - Throws: Any error from the download, from building the document, or
    ///   from evaluating the XPath queries.
    func page(from url: URL) async throws -> Page {
        // The response metadata is not used.
        let (payload, _) = try await self.data(from: url)
        let document = try XMLDocument(data: payload, options: .documentTidyHTML)
        let titleText = try document.nodes(forXPath: "//title").first?.stringValue ?? ""

        var links: [URL] = []
        for anchor in try document.nodes(forXPath: "//a[@href]") {
            // Skip nodes that are not elements, have no href string, or whose
            // href cannot be turned into a URL.
            guard
                let element = anchor as? XMLElement,
                let href = element.attribute(forName: "href")?.stringValue,
                let target = URL(string: href, relativeTo: url)
            else { continue }
            links.append(target.simplified)
        }

        return Page(url: url, title: titleText, outgoingLinks: links)
    }
}
extension URL {
    /// A copy of this URL without its fragment and without a trailing slash.
    ///
    /// Everything from the last `#` in the absolute string onward is dropped,
    /// then a single trailing `/` is removed. If the stripped string no longer
    /// forms a URL (for example when the whole URL was `/` or `#top`), the
    /// original URL is returned unchanged instead of crashing.
    var simplified: URL {
        var result = absoluteString
        if let fragmentStart = result.lastIndex(of: "#") {
            result.removeSubrange(fragmentStart...)
        }
        if result.last == "/" {
            result.removeLast()
        }
        // Stripping can leave an empty string, for which `URL(string:)` is nil.
        return URL(string: result) ?? self
    }
}
// Declares `URL` as `Sendable`; `@unchecked` means the compiler does not verify
// the conformance. Presumably needed because URLs are passed between the
// `Queue` actor, the main-actor `Crawler` and the worker tasks.
// NOTE(review): newer Foundation SDKs appear to declare this conformance
// themselves — confirm whether this line is still required.
extension URL: @unchecked Sendable { }
/// The result of fetching and parsing a single page.
struct Page {
/// The URL the page was requested from.
var url: URL
/// The page title; `page(from:)` stores an empty string when no `<title>` is found.
var title: String
/// The link targets found on the page.
var outgoingLinks: [URL]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment