-
-
Save chriseidhof/6f94422d66b0d5b796b841af25040271 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
//  Crawler.swift
//  Crawler
//
//  Created by Chris Eidhof on 21.12.21.
//
import Foundation | |
/// A work queue shared by several concurrent workers.
///
/// Workers call `dequeue()` to obtain the next URL, and `finish(_:)` once they
/// are done with it. The queue counts as finished only when nothing is pending
/// *and* nothing is in progress, because an in-progress job may still add new
/// items through `add(newItems:)`.
actor Queue {
    /// URLs waiting to be handed to a worker.
    var items: Set<URL> = []
    /// URLs handed out by `dequeue()` that have not been passed to `finish(_:)` yet.
    var inProgress: Set<URL> = []
    /// Resume closures of workers that are suspended inside `dequeue()`.
    var waitingWorkers: [() -> ()] = []

    /// `true` when there is neither pending nor in-progress work.
    var done: Bool {
        items.isEmpty && inProgress.isEmpty
    }

    /// Returns the next URL to process and marks it as in progress.
    ///
    /// If no item is pending but other jobs are still in progress, the caller
    /// is suspended until items are added or the queue becomes done.
    ///
    /// - Returns: The next URL, or `nil` once the queue is done.
    func dequeue() async -> URL? {
        while !done {
            if let next = items.popFirst() {
                inProgress.insert(next)
                return next
            }
            // Park this worker. The continuation must be stored as a closure and
            // resumed later; calling `resume()` here would wake it immediately.
            await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
                waitingWorkers.append({ continuation.resume() })
            }
        }
        return nil
    }

    /// Marks `item` as no longer in progress and wakes parked workers if that
    /// was the last outstanding piece of work, so they can return `nil`.
    func finish(_ item: URL) {
        inProgress.remove(item)
        if done { resumeWorkers() }
    }

    /// Adds `newItems` to the pending set and wakes all parked workers.
    ///
    /// URLs that are currently in progress are skipped so that a page which is
    /// being processed right now is not queued a second time.
    func add(newItems: [URL]) {
        items.formUnion(newItems.filter { !inProgress.contains($0) })
        resumeWorkers()
    }

    /// Resumes every parked worker exactly once and clears the waiting list.
    func resumeWorkers() {
        let parked = waitingWorkers
        waitingWorkers = []
        for resume in parked { resume() }
    }
}
/// Crawls every page below a start URL and publishes the pages it has fetched.
@MainActor
final class Crawler: ObservableObject {
    /// All pages fetched so far, keyed by the URL they were requested from.
    @Published var state: [URL: Page] = [:]

    /// Stores `page` under its URL, replacing any earlier entry for that URL.
    func add(_ page: Page) {
        state[page.url] = page
    }

    /// The URLs of all pages stored in `state`.
    func seenURLs() -> Set<URL> {
        Set(state.keys)
    }

    /// Fetches `url` and, transitively, every linked page whose absolute URL
    /// starts with `url`'s absolute string, using a pool of concurrent workers.
    ///
    /// - Parameters:
    ///   - url: The start page; its absolute string is also the prefix that
    ///     links must have in order to be followed.
    ///   - numberOfWorkers: How many worker tasks to start. Values below zero
    ///     are treated as zero.
    /// - Throws: The first error raised while fetching or parsing a page. The
    ///   task group cancels the remaining workers when that happens.
    func crawl(url: URL, numberOfWorkers: Int = 4) async throws {
        let basePrefix = url.absoluteString
        let queue = Queue()
        await queue.add(newItems: [url])
        try await withThrowingTaskGroup(of: Void.self) { group in
            // A negative count would trap when forming the range.
            for i in 0..<max(0, numberOfWorkers) {
                group.addTask {
                    var numberOfJobs = 0
                    while let job = await queue.dequeue() {
                        do {
                            let page = try await URLSession.shared.page(from: job)
                            // Record the page first so that its own URL already
                            // counts as seen when its links are filtered.
                            await self.add(page)
                            let seen = await self.seenURLs()
                            let newURLs = page.outgoingLinks.filter { link in
                                link.absoluteString.hasPrefix(basePrefix) && !seen.contains(link)
                            }
                            await queue.add(newItems: newURLs)
                        } catch {
                            // Always release the job: otherwise the queue never
                            // becomes done and parked workers wait forever.
                            await queue.finish(job)
                            throw error
                        }
                        await queue.finish(job)
                        numberOfJobs += 1
                    }
                    print("Worker \(i) did \(numberOfJobs) jobs")
                }
            }
            // Observe the child tasks so that a worker's error is rethrown
            // instead of being silently discarded.
            for try await _ in group { }
        }
    }
}
extension URLSession {
    /// Downloads `url`, parses the response body with `XMLDocument` using the
    /// `.documentTidyHTML` option, and extracts the title and outgoing links.
    ///
    /// - Parameter url: The address to load. It is also used as the base for
    ///   resolving relative `href` values and as the resulting page's `url`.
    /// - Returns: A `Page` whose title is the string value of the first
    ///   `//title` node (or `""` if there is none) and whose links are the
    ///   simplified targets of all `//a[@href]` nodes.
    /// - Throws: Any error from loading the data, creating the document, or
    ///   evaluating the XPath queries.
    func page(from url: URL) async throws -> Page {
        let (body, _) = try await data(from: url)
        let document = try XMLDocument(data: body, options: .documentTidyHTML)
        let titleNode = try document.nodes(forXPath: "//title").first

        var links: [URL] = []
        for anchor in try document.nodes(forXPath: "//a[@href]") {
            // Skip anchors whose href is missing or cannot be turned into a URL.
            guard let element = anchor as? XMLElement,
                  let href = element.attribute(forName: "href")?.stringValue,
                  let target = URL(string: href, relativeTo: url)
            else { continue }
            links.append(target.simplified)
        }

        return Page(url: url, title: titleNode?.stringValue ?? "", outgoingLinks: links)
    }
}
extension URL {
    /// A normalized copy of this URL, used to treat equivalent links as equal:
    /// the fragment (everything from the first `#`) is dropped, and a single
    /// trailing `/` is removed.
    ///
    /// If the shortened string can no longer be parsed as a URL (for example
    /// when the URL is just `/`), the URL is returned unchanged instead of
    /// crashing.
    var simplified: URL {
        var result = absoluteString
        // The fragment starts at the first "#".
        if let fragmentStart = result.firstIndex(of: "#") {
            result = String(result[..<fragmentStart])
        }
        if result.hasSuffix("/") {
            result.removeLast()
        }
        return URL(string: result) ?? self
    }
}
/// Declares `URL` as `Sendable` without compiler verification, so URL values
/// can be passed between tasks and actors.
// NOTE(review): newer Foundation SDKs may already declare this conformance, in
// which case this line is redundant — confirm against the toolchain in use.
extension URL: @unchecked Sendable {}
/// A page together with its title and the links found on it.
struct Page {
    /// The address of the page.
    var url: URL

    /// The page title.
    var title: String

    /// The URLs this page links to.
    var outgoingLinks: [URL]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment