A simple web crawler that crawls through pages using the urllib2 Python module
# Write a web crawler
'''
A crawler is a program that starts with a URL on the web (e.g. http://python.org), fetches the web page corresponding to that URL, and parses all the links on that page into a repository of links. Next, it fetches the contents of one of the URLs from the repository just created, parses the links from this new content into the repository, and continues this process for every link in the repository until it is stopped or a given number of links have been fetched.
'''
# urllib2 for downloading web pages
import urllib2
# get_next_target() scans a page for the next link starting at '<a href=' and
# returns the URL together with the index just past its closing quote
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
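# A quick illustration (not part of the original gist) of how get_next_target()
# behaves on a small HTML snippet:
#   get_next_target('text <a href="http://python.org">link</a> more')
#   returns ('http://python.org', 31), where 31 is the index of the closing quote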
# get_page() downloads the page at the given URL with urllib2 and returns its
# HTML as a string; a browser-like User-Agent header avoids some bot blocking
def get_page(source):
    user_agent = 'Lynx/2.8.8dev.3 libwww-FM/2.14 SSL-MM/1.4.1'
    headers = {'User-Agent': user_agent}
    res = urllib2.Request(source, headers=headers)
    sourceFile = urllib2.urlopen(res)
    sourceText = sourceFile.read()
    return sourceText
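# A defensive variant (hypothetical, not part of the original gist): the same
# download logic, but it returns an empty string instead of raising when a
# link is dead or malformed, so one bad URL does not abort the whole crawl.
def get_page_safe(source):
    user_agent = 'Lynx/2.8.8dev.3 libwww-FM/2.14 SSL-MM/1.4.1'
    headers = {'User-Agent': user_agent}
    try:
        res = urllib2.Request(source, headers=headers)
        return urllib2.urlopen(res).read()
    except (urllib2.URLError, ValueError):
        # URLError also covers HTTPError; ValueError covers malformed URLs
        return ''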
# union() appends to p every element of q that is not already in p,
# i.e. an in-place set union of two lists
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
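# For illustration (not part of the original gist): union() modifies its first
# argument in place and returns nothing, e.g.
#   p = ['a', 'b']; union(p, ['b', 'c'])   # p is now ['a', 'b', 'c']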
# get_all_links() collects every link on a page into a list by calling
# get_next_target() repeatedly until no more links are found
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
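# For illustration (not part of the original gist): on a page with two anchors,
# get_all_links() returns both hrefs in order, e.g.
#   get_all_links('<a href="http://a.com">a</a> <a href="http://b.com">b</a>')
#   -> ['http://a.com', 'http://b.com']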
'''
Start of the web crawler: add the initial page (the seed) to the tocrawl list, then loop while
tocrawl is not empty. If a page has not been crawled yet, it is fetched and searched for further
links; the new links are added to tocrawl and the page itself is added to the crawled list.
Finally the crawled list, which contains all the links visited, is returned.
'''
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page)))
            print "Current Page", page
            crawled.append(page)
    return crawled
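# A bounded variant (hypothetical, not part of the original gist): the same
# crawl loop, but it stops once max_pages pages have been crawled, matching
# the "after a given number of links are fetched" stopping condition mentioned
# in the description at the top.
def crawl_web_limited(seed, max_pages):
    tocrawl = [seed]
    crawled = []
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page)))
            print "Current Page", page
            crawled.append(page)
    return crawled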
# http://chitrank-dixit.github.io/crawl-world/index.html is the seed page where crawling starts;
# change the crawl_web() argument to start crawling from a different seed page (starting point)
pagestore = crawl_web('http://chitrank-dixit.github.io/crawl-world/index.html')
print pagestore
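# Note: crawl_web() returns only the list of URLs that were visited; the
# downloaded HTML itself is not stored anywhere.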