@tjeason
Last active June 16, 2016 23:46
Python web crawler
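
A small recursive crawler, written for Python 2 (note httplib and the print-statement syntax). Starting from a seed URL, it fetches each page, reports whether the search text appears in it, pulls every href out with a regular expression, and follows the links until the given depth runs out.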
import httplib
import re
import argparse

# URLs that have already been fetched, shared across recursive calls
# so the same page is not crawled twice
processed = []

def searchURL(url="https://news.ycombinator.com/", depth=2, search="python"):
    # only follow https links that have not been seen before
    if url.startswith("https://") and url not in processed:
        processed.append(url)
        url = url.replace("https://", "", 1)

        # split the url into host and path
        host = url
        path = "/"
        urlparts = url.split("/")
        if len(urlparts) > 1:
            host = urlparts[0]
            path = url.replace(host, "", 1)

        # fetch the page
        print "crawling host: " + host + " path: " + path
        conn = httplib.HTTPSConnection(host)
        conn.request("GET", path)
        res = conn.getresponse()

        # find the links
        contents = res.read()
        m = re.findall('href="(.*?)"', contents)
        if search in contents:
            print "Found " + search + " at " + url
        print str(depth) + ": processing " + str(len(m)) + " links"
        for href in m:
            # turn relative urls into absolute ones on the same host
            if href.startswith("/"):
                href = "https://" + host + href
            # follow the links until the depth budget is spent
            if depth > 0:
                searchURL(href, depth - 1, search)
    else:
        print "skipping " + url

def main():
    parser = argparse.ArgumentParser(description="python web crawler")
    parser.add_argument('url', help='url to start crawling from')
    parser.add_argument('depth', type=int,
                        help='how many levels of links to follow from the start page')
    parser.add_argument('search', help='text to search each page for')
    args = parser.parse_args()
    searchURL(args.url, args.depth, args.search)

if __name__ == '__main__':
    main()
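
Usage, assuming the script has been saved as crawler.py (the gist itself does not name the file): crawl Hacker News two levels deep, looking for the word "python":

    $ python2 crawler.py https://news.ycombinator.com/ 2 python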
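The link-extraction step leans on a non-greedy regular expression rather than an HTML parser. A minimal sketch of what it matches (the sample markup here is invented for illustration):

import re

html = '<a href="/news">top</a> <a href="https://example.com/x">ext</a>'
print re.findall('href="(.*?)"', html)
# prints: ['/news', 'https://example.com/x']

Relative links like /news are rewritten to absolute https URLs by the loop above; anything else that is not an https URL (fragments, mailto: links, and so on) simply fails the startswith check on the next call and is skipped.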