Skip to content

Instantly share code, notes, and snippets.

@wenLiangcan
Last active January 3, 2018 07:19
Show Gist options
  • Save wenLiangcan/7212043 to your computer and use it in GitHub Desktop.
Save wenLiangcan/7212043 to your computer and use it in GitHub Desktop.
下载豆瓣相册图片
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: get_douban_album_pic.py
import urllib2, re, os, sys
from os.path import basename
from urlparse import urlsplit
def fetch(url):
    """Download ``url`` and return the raw response body.

    This is a best-effort helper: any error raised while opening or
    reading the url is printed and ``None`` is returned instead, so
    callers must be prepared for a ``None`` result.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails; previously the
            # response object was never closed.
            response.close()
    except Exception as e:
        # Deliberately broad: one failed request is reported and signalled
        # with None rather than raised.
        print(e)
        return None
def get_count(url):
    """Return the photo count reported by the album page at ``url``.

    The page is decoded as UTF-8 and searched for the
    ``<span class="count">`` element that carries the total. ``1`` is
    returned when the page could not be fetched or the element is
    missing.
    """
    urlContent = fetch(url)
    if urlContent is None:
        # fetch() returns None on failure; decoding None would raise a
        # TypeError, so fall back to the same default as "no count found".
        return 1
    # Backslashes are doubled instead of using a raw string so that the \u
    # escapes are still interpreted while \( and \d reach the regex engine
    # unchanged.
    ptn = u'<span class="count">\\(\u5171(\\d+)\u5f20\\)</span>'
    found = re.search(ptn, urlContent.decode("utf-8"))
    if found is None:
        return 1
    return int(found.group(1))
def down_img(url):
    """Save every photo linked from the album page ``url`` into ``douban/``.

    Thumbnail urls found in the page are rewritten to a larger variant and
    each image is written to ``douban/<file name>``. A page or image that
    cannot be fetched is skipped instead of aborting with a TypeError.
    """
    urlContent = fetch(url)
    if urlContent is None:
        # fetch() returns None on failure; there is nothing to scan.
        return
    # Dots are escaped so they only match a literal '.'; the second group
    # captures the numeric photo id so it need not be re-parsed later.
    thumbPtn = (r'(http://img\d\.douban\.com/view/photo/thumb/public/'
                r'p(\d+)\.jpg)')
    for thumbUrl, photoId in re.findall(thumbPtn, urlContent):
        # Photos with an id above 1770000000 all have a larger original
        # image whose url contains "large". Credit: 豆藤 (Douteng)
        if int(photoId) > 1770000000:
            imgUrl = thumbUrl.replace('thumb', 'large')
        else:
            imgUrl = thumbUrl.replace('thumb', 'photo')
        imgData = fetch(imgUrl)
        if imgData is None:
            # Skip before opening the file so a failed download does not
            # leave an empty file behind.
            continue
        fileName = basename(urlsplit(imgUrl)[2])
        with open(os.path.join('douban', fileName), 'wb') as output:
            output.write(imgData)
def download(url):
    """Download every page of the album whose base url is ``url``.

    The photo count comes from ``get_count``; the page offset advances by
    18 per iteration (presumably the number of photos per album page --
    confirm against the site). The first page is requested from ``url``
    itself and later ones from ``url + '?start=<offset>'``.
    """
    count = get_count(url)
    for num, start in enumerate(range(0, count, 18), 1):
        # Format inside the call: ``print("...") % num`` only works with
        # the Python 2 print statement and raises a TypeError as soon as
        # print is a function.
        print("Downloading images in page %d ..." % num)
        if start == 0:
            pageUrl = url
        else:
            pageUrl = url + '?start=' + str(start)
        down_img(pageUrl)
    print("Finished")
def input_url():
    """Return the normalized album url supplied by the user.

    The url is taken from the first command-line argument when one is
    given; otherwise the user is prompted for it. Everything after the
    album id (trailing path, query string) is dropped and a trailing
    ``/`` is appended.

    Raises:
        ValueError: if the input does not start with a Douban album url.
    """
    # Dots are escaped so they only match a literal '.'; https is accepted
    # in addition to http.
    link = r'https?://www\.douban\.com/photos/album/\d+'
    if len(sys.argv) == 1:
        inputString = raw_input('Enter album url --> ')
    else:
        inputString = sys.argv[1]
    # strip() tolerates stray whitespace from copy/paste; re.match anchors
    # at the start of the string like the original '^'.
    matched = re.match(link, inputString.strip())
    if matched is None:
        # Previously this surfaced as a bare IndexError from ``[0]``.
        raise ValueError('Not a douban album url: %r' % inputString)
    return matched.group(0) + '/'
if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, then
    # download the album named on the command line (or at the prompt).
    output_dir = 'douban'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    album_url = input_url()
    download(album_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment