femmerling/crawler.py

## crawler.py
"""
	crawler.py
	web link crawler
	Nov 3rd 2012, Saturday night insomnia coding session
	Fauzan Erich Emmerling
	erich@emfeld.com
"""

import re
from urllib2 import urlopen

# Shared, ordered collection of every unique link discovered so far.
links = list()
# Page the crawl starts from; this is only a sample, substitute any URL.
root_url = 'http://www.google.com/'


def extract_links(url):
    """Fetch ``url`` and append every new absolute http link to ``links``.

    A line of the page is inspected only when it contains ``<a``. Within such
    a line, each space-separated token containing ``href="http:`` contributes
    the text that follows its first double quote, up to the next double quote
    (or the end of the token). Links already present in the module-level
    ``links`` list are skipped, so the list stays free of duplicates and keeps
    discovery order.

    Progress is reported on stdout. When the page cannot be fetched or read,
    ``link inaccessible`` is printed and ``links`` is left untouched.

    Args:
        url: Address of the page to scan.
    """
    anchor_pattern = '<a'  # a line must contain this to be scanned for links
    href_pattern = 'href="http:'  # only absolute http links are collected
    # NOTE(review): 'href="https:' does not match the pattern above, so https
    # links are skipped; confirm whether that is intended.

    print('Crawl links in ' + url)

    # Only the network access is guarded: the parsing below cannot fail on
    # the lines it selects. Catching Exception (not a bare except) keeps the
    # crawl best-effort while still letting Ctrl-C / SystemExit stop it.
    try:
        html_data = urlopen(url)
        try:
            lines_list = html_data.readlines()
        finally:
            html_data.close()  # always release the connection
    except Exception:
        print('link inaccessible')
        return

    # Snapshot the known links once so each duplicate check is O(1).
    seen = set(links)
    counter = 0  # number of new links contributed by this page
    for line in lines_list:
        if anchor_pattern not in line:
            continue
        for attribute in line.split(' '):
            if href_pattern not in attribute:
                continue
            # The token contains a double quote (it matched href_pattern),
            # so index 1 of the split always exists.
            link = attribute.split('"')[1]
            if link not in seen:
                seen.add(link)
                links.append(link)
                counter += 1

    print(str(counter) + ' links found in ' + url)

# Seed the shared list with every link found on the root page.
extract_links(root_url)

# Walk the collected links and crawl each one. extract_links() appends to
# `links`, so entries discovered along the way are visited by this same loop.
for link in links:
    if link == root_url:  # the root page was already crawled above
        continue
    extract_links(link)

# Report every unique link that was collected, one per line.
for link in links:
    print(link)

print(str(len(links)) + ' total links found in the website')
	"""
	crawler.py
	web link crawler
	Nov 3rd 2012, Saturday night insomnia coding session
	Fauzan Erich Emmerling
	erich@emfeld.com
	"""

	import re
	from urllib2 import urlopen

	# Shared, ordered collection of every unique link discovered so far.
	links = list()
	# Page the crawl starts from; this is only a sample, substitute any URL.
	root_url = 'http://www.google.com/'


	def extract_links(url):
	    """Fetch ``url`` and append every new absolute http link to ``links``.

	    A line of the page is inspected only when it contains ``<a``. Within
	    such a line, each space-separated token containing ``href="http:``
	    contributes the text that follows its first double quote, up to the
	    next double quote (or the end of the token). Links already present in
	    the ``links`` list are skipped, so the list stays free of duplicates
	    and keeps discovery order.

	    Progress is reported on stdout. When the page cannot be fetched or
	    read, ``link inaccessible`` is printed and ``links`` is left untouched.

	    Args:
	        url: Address of the page to scan.
	    """
	    anchor_pattern = '<a'  # a line must contain this to be scanned for links
	    href_pattern = 'href="http:'  # only absolute http links are collected
	    # NOTE(review): 'href="https:' does not match the pattern above, so
	    # https links are skipped; confirm whether that is intended.

	    print('Crawl links in ' + url)

	    # Only the network access is guarded: the parsing below cannot fail on
	    # the lines it selects. Catching Exception (not a bare except) keeps
	    # the crawl best-effort while still letting Ctrl-C / SystemExit stop it.
	    try:
	        html_data = urlopen(url)
	        try:
	            lines_list = html_data.readlines()
	        finally:
	            html_data.close()  # always release the connection
	    except Exception:
	        print('link inaccessible')
	        return

	    # Snapshot the known links once so each duplicate check is O(1).
	    seen = set(links)
	    counter = 0  # number of new links contributed by this page
	    for line in lines_list:
	        if anchor_pattern not in line:
	            continue
	        for attribute in line.split(' '):
	            if href_pattern not in attribute:
	                continue
	            # The token contains a double quote (it matched href_pattern),
	            # so index 1 of the split always exists.
	            link = attribute.split('"')[1]
	            if link not in seen:
	                seen.add(link)
	                links.append(link)
	                counter += 1

	    print(str(counter) + ' links found in ' + url)

	# Seed the shared list with every link found on the root page.
	extract_links(root_url)

	# Walk the collected links and crawl each one. extract_links() appends to
	# `links`, so entries discovered along the way are visited by this same loop.
	for link in links:
	    if link == root_url:  # the root page was already crawled above
	        continue
	    extract_links(link)

	# Report every unique link that was collected, one per line.
	for link in links:
	    print(link)

	print(str(len(links)) + ' total links found in the website')