import sys


class Crawler:
    def __init__(self, base_url, writer, predicate):
        """
        * base_url is the first URL that will be crawled.
        * writer is an instance of a class defined below. It is in charge
          of writing to the file system.
        * predicate is a function with signature predicate(url). It returns
          True or False, and on that basis the crawler decides whether or
          not to download a link.
        """
        ...
    def start(self):
        """ Starts the crawler. While one should avoid lifecycle methods
        like these unless using a framework, starting the crawler as soon
        as you create it doesn't seem like the right thing to do either. """
        ...
    def _get_crawlable_URLs(self, body):
        """ Reads the contents of a requests response body and returns
        URLs to crawl. Runs each URL against self.predicate. """
        ...
    def _write(self, url, body):
        """ Writes the body of a response via self.writer. """
        ...
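

# A minimal sketch of the link extraction that _get_crawlable_URLs describes,
# assuming the body is an HTML string; extract_links is a hypothetical helper,
# not the gist's actual implementation. A real crawler would also resolve
# relative hrefs against the page URL before filtering.
from html.parser import HTMLParser


class _LinkExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Collect the href of every anchor tag encountered while parsing.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def extract_links(body, predicate):
    # Parse the body, then keep only the URLs the predicate accepts.
    parser = _LinkExtractor()
    parser.feed(body)
    return [url for url in parser.links if predicate(url)]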


class BaseWriter:
    def __init__(self, base_url):
        """ Creates whatever buckets or folders you need to create so that
        writing can start. We want this level of separation to be able to
        move to a new repository as and when required. Say you want to write
        files to AWS S3 rather than your local file system. """
        ...

    def write(self, url, body):
        """ Creates a file-like object and writes the contents to it. """
        ...

class LocalFileSystemWriter(BaseWriter):
    """ Uses the local file system to write files. The init method creates
    folders for the base URL. The write() method creates files relative to
    them and writes the contents. """
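
    # A minimal sketch of the two methods the docstring describes, assuming
    # one folder per host under a hypothetical 'crawls' directory and URLs
    # quoted into flat file names; illustrative, not the gist's actual code.
    def __init__(self, base_url):
        import os
        from urllib.parse import urlparse
        self.base_url = base_url
        # One folder per host, e.g. crawls/example.com/
        self.root = os.path.join('crawls', urlparse(base_url).netloc)
        os.makedirs(self.root, exist_ok=True)

    def write(self, url, body):
        import os
        from urllib.parse import quote
        # Percent-quote the whole URL so it is safe as a single file name.
        path = os.path.join(self.root, quote(url, safe=''))
        with open(path, 'w') as f:
            f.write(body)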


def maxCountPredicate(max_count, nextPredicate):
    def maxCountInner(url):
        # nonlocal lets the closure decrement the shared counter.
        nonlocal max_count
        if nextPredicate(url):
            max_count = max_count - 1
            return max_count >= 0
        return False
    return maxCountInner


def sameBaseUrlPredicate(base_url):
    def baseUrlInner(url):
        return url.startswith(base_url)
    return baseUrlInner
"""
The last two functions are kind of clever coding. We are using closures to
create local state for the functions and are chaining predicates.
I have been using javascript way too long to think about anything but closures.
Sorry about that. :P
"""

if __name__ == '__main__':
    # Assumed invocation: python crawler.py <base_url> <max_count>
    base_url = sys.argv[1]
    max_count = int(sys.argv[2])
    writer = LocalFileSystemWriter(base_url)
    predicate = maxCountPredicate(max_count, sameBaseUrlPredicate(base_url))
    crawler = Crawler(base_url, writer, predicate)
    crawler.start()