luxinyan/jandan_spider.py

## jandan_spider.py
#!/usr/bin/python
#-*- coding: utf-8 -*-
#encoding=utf-8

import urllib
import urllib2
import os
import time
from BeautifulSoup import BeautifulSoup

def getAllImageLink(targetlink):
    """Download every image in the comment lists of a page and its predecessors.

    Starting at ``targetlink``, each page is fetched and parsed, every
    ``<img>`` found inside an ``<ol class="commentlist">`` is saved into an
    ``ooxx`` directory next to this script, and the crawl then moves on to
    the page referenced by the ``previous-comment-page`` anchor.  The crawl
    stops when a page has no such anchor.

    Args:
        targetlink: URL of the first page to scrape.

    Returns:
        None.

    Raises:
        Whatever ``urllib2.urlopen`` raises when a page cannot be fetched;
        failures on individual images are reported and skipped instead.
    """
    # Function-scope import: urlparse (Python 2 stdlib) is not imported at
    # file level.
    import urlparse

    saveDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ooxx')

    # Iterate instead of recursing once per page, so a long chain of pages
    # cannot hit the interpreter's recursion limit.
    while targetlink:
        html = urllib2.urlopen(targetlink).read()
        soup = BeautifulSoup(html)
        print(targetlink)

        for commentList in soup.findAll('ol', attrs={'class': "commentlist"}):
            for image in commentList.findAll('img'):
                link = image.get("src")
                if not link:
                    # <img> without a usable src: nothing to download.
                    continue

                if not os.path.exists(saveDir):
                    os.makedirs(saveDir)

                # Files are named after the current timestamp; skip rather
                # than overwrite if that name is already taken.
                filePath = os.path.join(saveDir, '%s.jpg' % time.time())
                if os.path.exists(filePath):
                    continue

                # Resolve relative and protocol-relative src values against
                # the page URL; without a scheme urlretrieve would not fetch
                # them over HTTP.
                imageUrl = urlparse.urljoin(targetlink, link)
                try:
                    urllib.urlretrieve(imageUrl, filePath)
                except IOError as err:
                    # Best effort: one broken image must not stop the crawl.
                    print('skip %r: %s' % (imageUrl, err))

        # find() returns None when the anchor is missing (the last page),
        # which ends the loop instead of raising AttributeError.
        previousAnchor = soup.find('a', attrs={'class': 'previous-comment-page'})
        nextHref = previousAnchor.get('href') if previousAnchor is not None else None
        targetlink = urlparse.urljoin(targetlink, nextHref) if nextHref else None


if __name__ == '__main__':
    # Script entry point: start at the board's landing page; getAllImageLink
    # continues from there via the 'previous-comment-page' links it finds.
    getAllImageLink('http://jandan.net/ooxx')
	#!/usr/bin/python
	#-*- coding: utf-8 -*-
	#encoding=utf-8

	import urllib
	import urllib2
	import os
	import time
	from BeautifulSoup import BeautifulSoup

	def getAllImageLink(targetlink):
	    """Download every image in the comment lists of a page and its predecessors.

	    Starting at ``targetlink``, each page is fetched and parsed, every
	    ``<img>`` found inside an ``<ol class="commentlist">`` is saved into an
	    ``ooxx`` directory next to this script, and the crawl then moves on to
	    the page referenced by the ``previous-comment-page`` anchor.  The crawl
	    stops when a page has no such anchor.

	    Args:
	        targetlink: URL of the first page to scrape.

	    Returns:
	        None.

	    Raises:
	        Whatever ``urllib2.urlopen`` raises when a page cannot be fetched;
	        failures on individual images are reported and skipped instead.
	    """
	    # Function-scope import: urlparse (Python 2 stdlib) is not imported at
	    # file level.
	    import urlparse

	    saveDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ooxx')

	    # Iterate instead of recursing once per page, so a long chain of pages
	    # cannot hit the interpreter's recursion limit.
	    while targetlink:
	        html = urllib2.urlopen(targetlink).read()
	        soup = BeautifulSoup(html)
	        print(targetlink)

	        for commentList in soup.findAll('ol', attrs={'class': "commentlist"}):
	            for image in commentList.findAll('img'):
	                link = image.get("src")
	                if not link:
	                    # <img> without a usable src: nothing to download.
	                    continue

	                if not os.path.exists(saveDir):
	                    os.makedirs(saveDir)

	                # Files are named after the current timestamp; skip rather
	                # than overwrite if that name is already taken.
	                filePath = os.path.join(saveDir, '%s.jpg' % time.time())
	                if os.path.exists(filePath):
	                    continue

	                # Resolve relative and protocol-relative src values against
	                # the page URL; without a scheme urlretrieve would not fetch
	                # them over HTTP.
	                imageUrl = urlparse.urljoin(targetlink, link)
	                try:
	                    urllib.urlretrieve(imageUrl, filePath)
	                except IOError as err:
	                    # Best effort: one broken image must not stop the crawl.
	                    print('skip %r: %s' % (imageUrl, err))

	        # find() returns None when the anchor is missing (the last page),
	        # which ends the loop instead of raising AttributeError.
	        previousAnchor = soup.find('a', attrs={'class': 'previous-comment-page'})
	        nextHref = previousAnchor.get('href') if previousAnchor is not None else None
	        targetlink = urlparse.urljoin(targetlink, nextHref) if nextHref else None


	if __name__ == '__main__':
	    # Script entry point: start at the board's landing page; getAllImageLink
	    # continues from there via the 'previous-comment-page' links it finds.
	    getAllImageLink('http://jandan.net/ooxx')