Simple web crawler
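The gist below is a toy crawler in Python 2. Starting from a seed URL, it fetches each page with urllib, scans the raw HTML for <a href="..."> links by plain string searching (no HTML parser), resolves relative paths against the last URL fetched, and keeps following links it has not seen before, printing each URL as it goes.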
import urllib

# This is just a very simple web crawler; it cannot actually do
# everything a real web crawler does :)

# Get the next link on the page. Here `page` is the HTML source of
# the page, passed in as a string. Returns the URL of the first
# <a href="..."> link and the position of its closing quote, or
# (None, 0) when there are no more links.
def get_next_link(page):
    start_pos = page.find("<a href=")
    if start_pos == -1:
        return None, 0
    start_pos = page.find('"', start_pos)
    end_pos = page.find('"', start_pos + 1)
    url = page[start_pos + 1:end_pos]
    return url, end_pos
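# For example (an illustration, not part of the original gist):
#   get_next_link('<a href="http://x.example">x</a>')
# returns ('http://x.example', 25), where 25 is the index of the
# closing quote.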
# Get all links on one page.
def get_all_link(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if url is not None:
            links.append(url)
            page = page[end_pos:]   # keep scanning from just past the last link found
        else:
            break
    return links
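# For example (an illustration, not part of the original gist):
#   get_all_link('<a href="http://a.example">a</a> <a href="/b">b</a>')
# returns ['http://a.example', '/b'].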
# Union two lists: append to `a` every element of `b` not already in it.
def union(a, b):
    for element in b:
        if element not in a:
            a.append(element)
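# For example: union(a, [2, 3]) with a = [1, 2] leaves a as [1, 2, 3].
# Note that union modifies `a` in place rather than returning a new list.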
# Crawl from a seed URL, visiting every page that is directly or
# indirectly linked from the seed page.
def crawl(seed):
    tocrawl = [seed]    # pages still waiting to be crawled
    crawled = []        # pages already crawled
    last_url = ''       # last absolute URL fetched, used to resolve relative links
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page not in crawled:
            url = str(page)
            if url.startswith('/'):
                url = last_url + url    # turn a relative path into an absolute one
            if url.find('http') != -1:
                last_url = url          # remember the base for later relative links
                try:
                    f = urllib.urlopen(url)
                    page = f.read()
                    f.close()           # close inside the try: f is unbound if urlopen fails
                except IOError:
                    page = ''
                union(tocrawl, get_all_link(page))
                crawled.append(url)
                print url

crawl('http://www.google.com')
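For readers on Python 3, here is a minimal sketch of the same crawl loop (an adaptation, not part of the original gist): urllib.urlopen became urllib.request.urlopen, responses must be decoded from bytes to str, and print is a function. It reuses get_all_link and union from above, which run unchanged on Python 3; the name crawl_py3 and the limit parameter (a safety cap so a run against the live web terminates) are additions for this sketch.

# A minimal Python 3 sketch of the same crawl loop (adaptation, not from
# the original gist). Reuses get_all_link and union defined above.
import urllib.request

def crawl_py3(seed, limit=20):          # `limit` is an added safety cap
    tocrawl = [seed]
    crawled = []
    last_url = ''
    while tocrawl and len(crawled) < limit:
        page = tocrawl.pop()
        if page in crawled:
            continue
        url = page
        if url.startswith('/'):
            url = last_url + url        # same naive relative-link handling
        if 'http' not in url:
            continue
        last_url = url
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf-8', errors='replace')
        except Exception:               # network errors, malformed URLs, etc.
            html = ''
        union(tocrawl, get_all_link(html))
        crawled.append(url)
        print(url)

crawl_py3('http://www.google.com')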