@tjeason
Last active June 16, 2016 23:46
Python web crawler
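
A small recursive crawler, written for Python 2 (note httplib and the print-statement syntax). Starting from a seed URL, it fetches each page, reports whether the search text appears in it, pulls every href out with a regular expression, and follows the links until the given depth runs out.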
import httplib
import re
import argparse

# URLs that have already been fetched, shared across recursive calls
# so the same page is not crawled twice
processed = []

def searchURL(url="https://news.ycombinator.com/", depth=2, search="python"):
    # only follow https links that have not been seen before
    if url.startswith("https://") and url not in processed:
        processed.append(url)
        url = url.replace("https://", "", 1)

        # split the url into host and path
        host = url
        path = "/"
        urlparts = url.split("/")
        if len(urlparts) > 1:
            host = urlparts[0]
            path = url.replace(host, "", 1)

        # fetch the page
        print "crawling host: " + host + " path: " + path
        conn = httplib.HTTPSConnection(host)
        conn.request("GET", path)
        res = conn.getresponse()

        # find the links
        contents = res.read()
        m = re.findall('href="(.*?)"', contents)
        if search in contents:
            print "Found " + search + " at " + url
        print str(depth) + ": processing " + str(len(m)) + " links"
        for href in m:
            # turn relative urls into absolute ones on the same host
            if href.startswith("/"):
                href = "https://" + host + href
            # follow the links until the depth budget is spent
            if depth > 0:
                searchURL(href, depth - 1, search)
    else:
        print "skipping " + url

def main():
    parser = argparse.ArgumentParser(description="python web crawler")
    parser.add_argument('url', help='url to start crawling from')
    parser.add_argument('depth', type=int,
                        help='how many levels of links to follow from the start page')
    parser.add_argument('search', help='text to search each page for')
    args = parser.parse_args()
    searchURL(args.url, args.depth, args.search)

if __name__ == '__main__':
    main()
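
Usage, assuming the script has been saved as crawler.py (the gist itself does not name the file): crawl Hacker News two levels deep, looking for the word "python":

    $ python2 crawler.py https://news.ycombinator.com/ 2 python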
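The link-extraction step leans on a non-greedy regular expression rather than an HTML parser. A minimal sketch of what it matches (the sample markup here is invented for illustration):

import re

html = '<a href="/news">top</a> <a href="https://example.com/x">ext</a>'
print re.findall('href="(.*?)"', html)
# prints: ['/news', 'https://example.com/x']

Relative links like /news are rewritten to absolute https URLs by the loop above; anything else that is not an https URL (fragments, mailto: links, and so on) simply fails the startswith check on the next call and is skipped.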