Skip to content

Instantly share code, notes, and snippets.

@dhagrow
Last active January 3, 2016 04:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dhagrow/6bb39b37b8c35d35af14 to your computer and use it in GitHub Desktop.
Save dhagrow/6bb39b37b8c35d35af14 to your computer and use it in GitHub Desktop.
Eidolang Web Crawler
# imports
imp html
imp http
imp process
imp coroutine
# standard library is implicitly accessible in the global namespace
# e.g. set(), queue(), print()
# class definition
# Crawler: a two-stage crawler built around two queues and a "seen" set.
#   .url_q      - URLs waiting to be fetched; seeded with the start URL
#   .response_q - fetched responses waiting to be scanned for links
#   .seen       - URLs that crawl() has already scheduled for fetching
# crawl() consumes .url_q and feeds .response_q; process_responses() consumes
# the response queue and feeds newly discovered links back into the URL queue.
cls Crawler:
# method definition
# init!: create the queues and the seen-set, and enqueue `start_url` as the
# first URL to fetch.
# NOTE(review): presumably the '!' suffix makes this the constructor hook
# invoked by Crawler(...) - confirm against the language definition.
fn init!(start_url) {
# the '.' prefix references the current class instance
# (aka 'self' or 'this')
.url_q = queue()
.url_q.put(start_url)
.response_q = queue()
.seen = set()
}
# crawl: start the response-parsing worker, then fetch every not-yet-seen URL
# that appears on .url_q, one coroutine per URL.
# NOTE(review): no termination condition is visible here; whether the loop
# ends depends on how iterating a queue behaves when it is empty - confirm.
fn crawl() {
# fork a new process to parse responses
# NOTE(review): assumes queue() objects remain shared between this process
# and the forked one - TODO confirm.
process.fork(.process_responses, .response_q, .url_q)
for url in .url_q:
# skip URLs that were already scheduled, so each URL is fetched at most once
if url in .seen:
continue
.seen.add(url)
# spawn a coroutine to make the HTTP request using an anonymous
# function
# NOTE(review): the anonymous function reads `url` from the enclosing loop;
# verify it sees this iteration's value rather than a later one.
coroutine.spawn(fn() {
response = http.get(url)
.response_q.put(response)
})
}
# mark this as a static function (i.e. not bound to the parent class)
# process_responses: for every response taken from `response_q`, parse its
# content as HTML and put the href of each link found onto `url_q`.
#   response_q - queue the responses are read from
#   url_q      - queue the discovered link targets are written to
# NOTE(review): hrefs are enqueued exactly as found; nothing here resolves
# relative links or filters links without an href - confirm that is intended.
@static
fn process_responses(response_q, url_q) {
for response in response_q:
doc = html.parse(response.content)
for link in doc.links:
url_q.put(link.attrs.href)
}
# imports
imp args
imp crawler
# functions and methods are the only language constructs that require curly
# brace syntax
# the '!' suffix marks a function with special semantics
# main!: program entry point. Builds a command-line parser with a single
# 'start-url' argument, parses the command line, then constructs a Crawler
# from the parsed start URL and runs its crawl() method.
fn main!() {
# the ':' prefix references the global namespace
# it is optional, but can be used to assign to a global name or for
# disambiguation
parser = :args.Parser()
parser.add('start-url', help='the URL to begin crawling from')
# from here on the local name `args` holds the parse result, not the imported
# `args` module
# NOTE(review): the argument is registered as 'start-url' but read back as
# `start_url`; presumably the parser maps '-' to '_' - confirm.
args = parser.parse()
c = crawler.Crawler(args.start_url)
c.crawl()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment