Skip to content

Instantly share code, notes, and snippets.

@luxinyan
Created December 16, 2015 14:28
Show Gist options
  • Save luxinyan/01f6b6a3e3d3e040cbcb to your computer and use it in GitHub Desktop.
Save luxinyan/01f6b6a3e3d3e040cbcb to your computer and use it in GitHub Desktop.
Use BeautifulSoup to crawl jandan.net
#!/usr/bin/python
#-*- coding: utf-8 -*-
#encoding=utf-8
import urllib
import urllib2
import os
import time
from BeautifulSoup import BeautifulSoup
def getAllImageLink(targetlink):
    """Download every image in the comment lists of a page and of the pages it links back to.

    Starting at ``targetlink``, each page is fetched and parsed; every
    ``<img>`` found inside an ``<ol class="commentlist">`` element is saved
    into an ``ooxx`` directory next to this script. The crawl then follows
    the ``<a class="previous-comment-page">`` link and repeats until a page
    has no such link.

    Args:
        targetlink: URL of the first page to crawl.

    Returns:
        None. The results are the files written to disk.

    Raises:
        urllib2.URLError: if a page itself cannot be fetched. Failures of
            individual image downloads are reported and skipped instead.
    """
    # Resolve the output directory once per call instead of once per image.
    saveDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ooxx')
    if not os.path.exists(saveDir):
        os.makedirs(saveDir)

    # Walk the pages iteratively: one recursive call per page would hit the
    # interpreter's recursion limit on a long crawl.
    while targetlink:
        html = urllib2.urlopen(targetlink).read()
        soup = BeautifulSoup(html)
        print(targetlink)

        for commentList in soup.findAll('ol', attrs={'class': "commentlist"}):
            for image in commentList.findAll('img'):
                link = image.get("src")
                if not link:
                    # An <img> without a src has nothing to download.
                    continue

                # NOTE: the file name is the current timestamp, so two images
                # that format to the same timestamp collide and the later one
                # is skipped by the existence check below.
                filePath = os.path.join(saveDir, '%s.jpg' % time.time())
                if os.path.exists(filePath):
                    continue

                try:
                    urllib.urlretrieve(link, filePath)
                except Exception as exc:
                    # Best effort: one bad image must not stop the crawl, but
                    # say so, and do not swallow Ctrl-C as a bare except would.
                    print('failed to download %s: %s' % (link, exc))

        # find() returns None when the page has no "previous page" anchor;
        # that is the end of the crawl, not an error.
        nextAnchor = soup.find('a', attrs={'class': 'previous-comment-page'})
        targetlink = nextAnchor.get('href') if nextAnchor is not None else None
if __name__ == '__main__':
    # Run as a script: start crawling from the section's front page.
    getAllImageLink(
        'http://jandan.net/ooxx'
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment