@fnielsen
Created September 24, 2012 09:07
Web crawling with htmllib & co.
# Python 2 code: htmllib, formatter and urlparse were removed or
# reorganized in Python 3 (a Python 3 sketch follows below).
import htmllib, formatter, urllib, urlparse

k = 1
urls = {}                                # page url -> set of outgoing links
todownload = set(['http://www.dtu.dk'])  # crawl frontier, seeded with DTU
while todownload:
    url0 = todownload.pop()
    urls[url0] = set()
    try:
        # The parser collects the href of every anchor tag in p.anchorlist;
        # the NullFormatter discards the rendered text.
        p = htmllib.HTMLParser(formatter.NullFormatter())
        p.feed(urllib.urlopen(url0).read())
        p.close()
    except Exception:
        continue  # skip pages that fail to download or parse
    for url in p.anchorlist:
        urlparts = urlparse.urlparse(url)
        if not urlparts[0] and not urlparts[1]:
            # Relative link: prepend the scheme and host of the current
            # page and drop query and fragment. (Note: this resolves the
            # path against the site root; urlparse.urljoin(url0, url)
            # would resolve it against url0 itself.)
            urlparts0 = urlparse.urlparse(url0)
            url = urlparse.urlunparse((urlparts0[0], urlparts0[1],
                                       urlparts[2], '', '', ''))
        else:
            # Absolute link: keep scheme, host and path only.
            url = urlparse.urlunparse((urlparts[0], urlparts[1],
                                       urlparts[2], '', '', ''))
        urlparts = urlparse.urlparse(url)
        if urlparts[1][-7:] != '.dtu.dk': continue  # not a DTU host
        if urlparts[0] != 'http': continue          # not plain HTTP
        urls[url0].add(url)
        if url not in urls:
            todownload.add(url)  # queue pages not yet downloaded
    k += 1
    print("%4d %4d %s" % (k, len(todownload), url0))
    if k > 1000: break  # stop after about 1000 pages
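
The modules used above no longer exist under these names in Python 3: htmllib and formatter are gone, urlparse became urllib.parse, and urllib.urlopen moved to urllib.request. Below is a rough Python 3 sketch of the same crawl, offered as one possible port rather than the author's code: the AnchorParser class is a hypothetical stand-in for htmllib's anchorlist, and urljoin is used so relative links resolve against the current page.

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen


class AnchorParser(HTMLParser):
    """Collect href values of <a> tags, mimicking htmllib's anchorlist."""

    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)


k = 1
urls = {}                           # page url -> set of outgoing links
todownload = {'http://www.dtu.dk'}  # crawl frontier, seeded with DTU
while todownload:
    url0 = todownload.pop()
    urls[url0] = set()
    try:
        p = AnchorParser()
        p.feed(urlopen(url0).read().decode('utf-8', errors='replace'))
        p.close()
    except Exception:
        continue  # skip pages that fail to download or parse
    for url in p.anchorlist:
        # Resolve relative links against the current page, then drop
        # query string and fragment, as in the original.
        parts = urlparse(urljoin(url0, url))
        url = urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))
        if not parts.netloc.endswith('.dtu.dk'):
            continue  # not a DTU host
        if parts.scheme != 'http':
            continue  # not plain HTTP
        urls[url0].add(url)
        if url not in urls:
            todownload.add(url)  # queue pages not yet downloaded
    k += 1
    print("%4d %4d %s" % (k, len(todownload), url0))
    if k > 1000:
        break  # stop after about 1000 pages

As in the original, the crawl stays on *.dtu.dk hosts, strips query strings and fragments, and stops after roughly 1000 pages; a production crawler would additionally honour robots.txt and rate-limit its requests.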