Skip to content

Instantly share code, notes, and snippets.

@hiccupzhu
Created June 2, 2013 15:11
Show Gist options
  • Save hiccupzhu/5693786 to your computer and use it in GitHub Desktop.
Save hiccupzhu/5693786 to your computer and use it in GitHub Desktop.
Web Spider
#!/usr/bin/python
import re
import urllib2
# Crawl bookkeeping shared by every get_url() call.
# nb_addr: counter that get_url bumps once per address it descends into.
nb_addr = 0
# addrs: all link URLs harvested so far (get_url rebinds it to a larger set).
addrs = set()
# One whole anchor element on a single line ('.' does not match newlines).
_ANCHOR_RE = re.compile(r'''<a .*?</a>''')
# An absolute http:// URL, then the text between the next '>' and the next '<'.
_LINK_RE = re.compile(r'''(http://[\w:/\.&%=?-]+).*?>(.*?)<''')


def get_tag_a(data):
    """Extract ``(url, text)`` pairs from the ``<a ...>...</a>`` tags in *data*.

    The ``&nbsp;`` and ``&gt;`` HTML entities are removed from the page before
    matching so they do not end up in the captured link text.  Stripping the
    literal ``>`` character instead would erase every closing ``</a>`` and
    leave nothing for the anchor pattern to match.

    Only anchors that contain an absolute ``http://`` URL produce a pair, and
    an anchor that spans several lines is not matched at all.

    Args:
        data: HTML source of a page, as a string.

    Returns:
        A list of ``(url, text)`` tuples in document order; empty when the
        page has no matching anchors.
    """
    data = data.replace("&nbsp;", "").replace("&gt;", "")

    alist = []
    for anchor in _ANCHOR_RE.findall(data):
        # findall returns [] for anchors without an http:// URL, so extending
        # with it is a no-op in that case.
        alist.extend(_LINK_RE.findall(anchor))
    return alist
def get_url(url, level):
    """Fetch *url*, collect the links on it, and recurse one level deeper.

    Links found on the page are merged into the module-level ``addrs`` set.
    The function then walks every address in ``addrs`` (all addresses gathered
    so far, not only the ones from this page), skips image and script URLs,
    increments the module-level ``nb_addr`` counter, recurses with
    ``level + 1`` and prints a ``[level,count]:url`` line.

    Args:
        url: Address of the page to download.
        level: Current recursion depth; nothing is fetched once it reaches 2.

    Returns:
        None.  Fetch failures are reported on stdout and end this call
        without processing any links.
    """
    global addrs, nb_addr

    if level >= 2:
        return

    try:
        response = urllib2.urlopen(url, None, timeout=2)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught first
        # or this handler would never run.
        print("HTTPError %s" % (e.code,))
        print(e.read())
        return
    except urllib2.URLError as e:
        print("URLError %s" % (e.reason,))
        return

    alist = get_tag_a(data)
    suffix = (".jpg", ".js", ".png", ".gif")
    addrs = addrs | set(x[0] for x in alist)

    # Recursive calls rebind ``addrs`` to a new set rather than mutating this
    # one, so the set being iterated here does not change size mid-loop.
    for m in addrs:
        m = m.strip()
        if m.endswith(suffix):
            continue
        nb_addr += 1
        get_url(m, level + 1)
        print("[%d,%d]:%s" % (level, nb_addr, m))
if __name__ == "__main__":
    # Script entry point: reset the counter and crawl outward from the seed
    # page, starting at depth 0.
    nb_addr = 0
    cmd = "http://www.baidu.com"
    get_url(cmd, 0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment