Simple Crawler Using Python
"""
crawler.py
web link crawler
Nov 3rd 2012 saturday night insomnia coding session
Fauzan Erich Emmerling
erich@emfeld.com
"""
import re
from urllib2 import urlopen
links = []
root_url = 'http://www.google.com/' # sample root url only. Use any links you wish
def extract_links(url):
counter = 0 # use this to count the links found in url
anchor_pattern = '<a' # search for this to ensure that you are checking an anchor link element
href_pattern = 'href="http:' # search this for easier link extraction
print 'Crawl links in ' + url
try:
html_data = urlopen(url)
lines_list = html_data.readlines()
for line in lines_list:
anchor_element = re.search(anchor_pattern, line)
if anchor_element:
attributes = line.split(' ')
for attribute in attributes:
href_link = re.search(href_pattern, attribute)
if href_link:
link = attribute.split('"')
if len(links) == 0: # if the list is empty, append the component
links.append(link[1])
counter = counter + 1
else:
if not link[1] in links: # if the list is not empty, check if link already existed
links.append(link[1])
counter = counter + 1
print str(counter) + ' links found in ' + url
except:
print 'link inaccessible'
extract_links(root_url) # crawl base page
for link in links: # crawl all links other than base page
if link != root_url:
extract_links(link)
for link in links: # display all links found in the website
print link
print str(len(links)) + ' total links found in the website'
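
Note that urllib2 exists only on Python 2. As a comparison point, below is a minimal sketch of the same idea on Python 3, using only the standard library (urllib.request plus html.parser in place of the regex/split matching above). The root URL is the same placeholder; everything else here (the LinkParser class name, the errors='replace' decoding, the exception choices) is an assumption for illustration, not part of the original gist.

"""
Rough Python 3 sketch of the crawler above, standard library only.
"""
from html.parser import HTMLParser
from urllib.error import URLError
from urllib.request import urlopen

links = []
root_url = 'http://www.google.com/'  # placeholder root URL


class LinkParser(HTMLParser):
    """Collect href values from <a> tags that point at http(s) URLs."""

    def __init__(self):
        super().__init__()
        self.found = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.startswith(('http:', 'https:')):
                    self.found.append(value)


def extract_links(url):
    print('Crawl links in ' + url)
    try:
        html_data = urlopen(url).read().decode('utf-8', errors='replace')
    except (URLError, ValueError):
        print('link inaccessible')
        return
    parser = LinkParser()
    parser.feed(html_data)
    counter = 0
    for link in parser.found:
        if link not in links:  # only record links we have not seen yet
            links.append(link)
            counter += 1
    print(str(counter) + ' links found in ' + url)


extract_links(root_url)          # crawl the base page
for link in list(links):         # crawl every link found on the base page
    if link != root_url:
        extract_links(link)

for link in links:               # display all links found
    print(link)
print(str(len(links)) + ' total links found in the website')

One deliberate difference: the sketch iterates over a snapshot (list(links)), so it only follows links found on the root page, whereas the original appends to the list while iterating and therefore also follows links discovered on the pages it visits.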