huatangzhi/网络爬虫

## 网络爬虫
# coding=utf-8
import urllib
import time
# Scratch list that receives the article links scraped from a list page.
# NOTE: the slot index restarts at 0 for every page, so the leading entries
# always describe the page scanned last; later slots may still hold links
# left over from an earlier page that had more entries.
url = [''] * 350

# Walk list pages 1..7 of the blog's article index.
# This uses the Python 2 ``urllib.urlopen`` API, like the rest of the file.
for page in range(1, 8):
    response = urllib.urlopen(
        'http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html')
    try:
        con = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()

    i = 0

    # Every article link is located through three successive markers: the
    # anchor's title attribute, the href that follows it, and the '.html'
    # suffix that ends the address.
    title = con.find('<a title=')
    href = con.find('href=', title)
    html = con.find('.html', href)

    # Stop after 60 links or as soon as any marker is missing.
    while i < 60 and title != -1 and href != -1 and html != -1:
        # Skip 'href=' plus one more character (presumably the opening
        # quote) and keep the 5-character '.html' suffix.
        url[i] = con[href + 6:html + 5]
        print(url[i])

        # Resume the search right after the link that was just consumed.
        title = con.find('<a title=', html)
        href = con.find('href=', title)
        html = con.find('.html', href)
        i = i + 1

    # Report a failure only when the page yielded no link at all.  A
    # ``while ... else`` clause runs every time the loop ends without
    # ``break``, so it would announce "failed" after every page, including
    # the successful ones.
    if i == 0:
        print("failed")

    # Disabled downloader, kept for reference.  It would fetch each collected
    # link, store the page under hanhan/ and pause between requests:
    #
    # j = 0
    # while j < 60:
    #     content = urllib.urlopen(url[j]).read()
    #     open(r'hanhan/' + url[j][-26:],'w+').write(content)
    #     print 'downloading',url[j]
    #     j = j + 1
    #     time.sleep(15)
    # else:
    #     print 'failed to download'
	# coding=utf-8
	import urllib
	import time
	# Scratch list that receives the article links scraped from a list page.
	# NOTE: the slot index restarts at 0 for every page, so the leading entries
	# always describe the page scanned last; later slots may still hold links
	# left over from an earlier page that had more entries.
	url = [''] * 350

	# Walk list pages 1..7 of the blog's article index.
	# This uses the Python 2 ``urllib.urlopen`` API, like the rest of the file.
	for page in range(1, 8):
	    response = urllib.urlopen(
	        'http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html')
	    try:
	        con = response.read()
	    finally:
	        # Release the connection even when reading the body fails.
	        response.close()

	    i = 0

	    # Every article link is located through three successive markers: the
	    # anchor's title attribute, the href that follows it, and the '.html'
	    # suffix that ends the address.
	    title = con.find('<a title=')
	    href = con.find('href=', title)
	    html = con.find('.html', href)

	    # Stop after 60 links or as soon as any marker is missing.
	    while i < 60 and title != -1 and href != -1 and html != -1:
	        # Skip 'href=' plus one more character (presumably the opening
	        # quote) and keep the 5-character '.html' suffix.
	        url[i] = con[href + 6:html + 5]
	        print(url[i])

	        # Resume the search right after the link that was just consumed.
	        title = con.find('<a title=', html)
	        href = con.find('href=', title)
	        html = con.find('.html', href)
	        i = i + 1

	    # Report a failure only when the page yielded no link at all.  A
	    # ``while ... else`` clause runs every time the loop ends without
	    # ``break``, so it would announce "failed" after every page, including
	    # the successful ones.
	    if i == 0:
	        print("failed")

	    # Disabled downloader, kept for reference.  It would fetch each collected
	    # link, store the page under hanhan/ and pause between requests:
	    #
	    # j = 0
	    # while j < 60:
	    #     content = urllib.urlopen(url[j]).read()
	    #     open(r'hanhan/' + url[j][-26:],'w+').write(content)
	    #     print 'downloading',url[j]
	    #     j = j + 1
	    #     time.sleep(15)
	    # else:
	    #     print 'failed to download'