Skip to content

Instantly share code, notes, and snippets.

@huatangzhi
Created September 1, 2013 04:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save huatangzhi/6402436 to your computer and use it in GitHub Desktop.
Save huatangzhi/6402436 to your computer and use it in GitHub Desktop.
# coding=utf-8
import urllib
import time
# Article-list pages of the blog; %d is the 1-based page number.
ARTICLE_LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%d.html'
PAGE_COUNT = 7
MAX_LINKS_PER_PAGE = 60


def fetch_page(address):
    """Return the raw body of ``address``.

    Relies on ``urllib.urlopen``, i.e. the Python 2 API this script was
    written against.
    """
    return urllib.urlopen(address).read()


def extract_article_urls(con, limit=MAX_LINKS_PER_PAGE):
    """Return the article URLs found in the listing-page markup ``con``.

    A link is recognised as the text between the first ``href=`` that
    follows an ``<a title=`` marker and the next ``.html`` (inclusive).
    Scanning stops after ``limit`` links, or as soon as any of the three
    markers can no longer be found.
    """
    found = []
    title = con.find('<a title=')
    while len(found) < limit and title != -1:
        href = con.find('href=', title)
        if href == -1:
            break
        html = con.find('.html', href)
        if html == -1:
            break
        # +6 skips 'href=' plus the opening quote; +5 keeps '.html'.
        found.append(con[href + 6:html + 5])
        title = con.find('<a title=', html)
    return found


def collect_article_urls(page_count=PAGE_COUNT, fetch=fetch_page):
    """Fetch listing pages 1..``page_count`` and return every article URL.

    URLs are accumulated across pages (the previous version restarted its
    index on each page and so overwrote earlier results).  Each URL is
    printed as it is collected; ``failed`` is printed only for a page that
    yields no links at all, instead of unconditionally after every page.

    ``fetch`` is the callable used to retrieve a page body from its URL.
    """
    url = []
    for page in range(1, page_count + 1):
        links = extract_article_urls(fetch(ARTICLE_LIST_URL % page))
        if not links:
            print("failed")
        for link in links:
            print(link)
        url.extend(links)
    return url


def download_articles(urls, directory='hanhan', delay=15):
    """Save every page in ``urls`` under ``directory``.

    This is the download step that was disabled in the original script; it
    is not invoked by default.  ``directory`` must already exist.  The file
    name is the last 26 characters of the URL (presumably the
    ``blog_<id>.html`` tail -- verify for other blogs).  ``delay`` seconds
    are slept after each download.
    """
    for article_url in urls:
        content = fetch_page(article_url)
        # 'with' closes the file even if the write fails.
        with open(directory + '/' + article_url[-26:], 'w+') as out:
            out.write(content)
        print('downloading ' + article_url)
        time.sleep(delay)


if __name__ == '__main__':
    # Only the URL listing runs by default; pass the result to
    # download_articles() to also save the pages.
    url = collect_article_urls()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment