Skip to content

Instantly share code, notes, and snippets.

@rathankalluri
Last active October 27, 2019 21:48
Show Gist options
  • Save rathankalluri/ff233c7630baed1f19862542666f64ee to your computer and use it in GitHub Desktop.
Save rathankalluri/ff233c7630baed1f19862542666f64ee to your computer and use it in GitHub Desktop.
WebCrawler
#!/usr/bin/python
import urllib
def get_page(page):
    """Download and return the raw content found at a URL.

    Args:
        page: URL to fetch.

    Returns:
        The response body as returned by ``read()``, or ``''`` when
        ``page`` is empty (``''`` or ``None``), in which case no request
        is made.
    """
    # Validate the argument itself; the previous check looked at the
    # module-level ``seed`` and therefore never guarded the URL in use.
    if not page:
        return ''
    response = urllib.urlopen(page)
    try:
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()
def get_next_url(page):
    """Find the first ``<a href="...">`` target in a string of HTML.

    Only double-quoted attribute values are recognised.

    Args:
        page: HTML text to scan.

    Returns:
        A ``(url, end_pos)`` tuple: ``url`` is the text between the first
        pair of double quotes at or after the first ``<a href=``, and
        ``end_pos`` is the index of the closing quote within ``page``.
        Returns ``(None, 0)`` when there is no anchor, or when the opening
        or closing quote is missing.
    """
    link_start = page.find('<a href=')
    if link_start == -1:
        return None, 0

    open_quote = page.find('"', link_start)
    if open_quote == -1:
        # No quoted value follows the anchor; without this guard the -1
        # would be used as a slice index and yield a bogus URL.
        return None, 0

    close_quote = page.find('"', open_quote + 1)
    if close_quote == -1:
        # Unterminated attribute value (e.g. truncated markup).
        return None, 0

    return page[open_quote + 1:close_quote], close_quote
def union(p, q):
    """Append to ``p``, in place, each element of ``q`` it does not yet hold.

    Elements are considered in the order they appear in ``q``; ``p`` is
    mutated and nothing is returned.
    """
    for candidate in q:
        if candidate in p:
            continue
        p.append(candidate)
def get_all_pages(page):
    """Collect every absolute link found in a page of HTML.

    Repeatedly asks ``get_next_url`` for the next anchor target and keeps
    those that start with ``http``.

    Args:
        page: HTML text to scan.

    Returns:
        A list of the matching URLs in document order (possibly empty).
    """
    links = []
    while True:
        url, endpos = get_next_url(page)
        # ``None`` is the "no more links" sentinel; an empty href is a real
        # (if useless) link and must not end the scan. The ``endpos`` check
        # guarantees forward progress so the loop always terminates.
        if url is None or endpos <= 0:
            break
        if url.startswith('http'):
            links.append(url)
        page = page[endpos:]
    # The collected links were previously never returned, so callers
    # always received None.
    return links
def crawl_web(seed):
    """Crawl outward from ``seed`` following the links found on each page.

    Pages are taken from the end of the pending list, so the most recently
    discovered link is visited next.

    Args:
        seed: URL to start crawling from.

    Returns:
        A list of the URLs that were crawled, in the order visited.
    """
    to_crawl = [seed]
    crawled = []
    # Mirror of ``crawled`` for constant-time "already visited?" checks;
    # the list is kept so the visit order can be returned.
    seen = set()
    while to_crawl:
        page = to_crawl.pop()
        if page in seen:
            continue
        union(to_crawl, get_all_pages(get_page(page)))
        seen.add(page)
        crawled.append(page)
    return crawled
# Script entry: crawl starting from the seed URL and print the visited pages.
seed = 'http://rathankalluri.com/'
print(crawl_web(seed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment