Skip to content

Instantly share code, notes, and snippets.

@tonyseek
Last active December 19, 2015 08:39
Show Gist options
  • Save tonyseek/5927191 to your computer and use it in GitHub Desktop.
批量下载深圳大学毕业照 (Batch-download Shenzhen University graduation photos)
#-*- coding:utf-8 -*-
import gevent.monkey
import gevent.pool
gevent.monkey.patch_all()
import requests
from zipfile import ZipFile
from urlparse import urljoin
from bs4 import BeautifulSoup
# Root of the photo server that is crawled recursively.
BASE_URL = "http://210.39.3.84:2013"

# Lower-cased file extensions that are treated as downloadable photos.
PHOTO_EXTS = frozenset(["jpg", "png", "bmp", "gif"])


def is_file(url):
    """Return True if *url* points at a photo file, judged by its extension.

    A URL with no dot falls through to False because the whole URL is
    compared against PHOTO_EXTS.  (PEP 8: a named ``def`` instead of a
    lambda assignment.)
    """
    return url.split(".")[-1].lower() in PHOTO_EXTS
def iter_links(soup, current_url):
    """Yield every ``<a>`` tag in *soup* with its href made absolute.

    NOTE: each tag's ``href`` attribute is rewritten in place (relative
    to *current_url*) before the tag is yielded.
    """
    for anchor in soup.find_all("a"):
        absolute_href = urljoin(current_url, anchor.attrs["href"])
        anchor.attrs["href"] = absolute_href
        yield anchor
def search_photo(url, path=None, visited=None):
    """Recursively walk the directory listing at *url*, yielding photos.

    Yields ``(photo_url, path)`` tuples where *path* is the list of link
    texts leading down to the photo (later joined into the archive name).

    :param url: directory-index page to crawl.
    :param path: accumulated breadcrumb of link texts (internal recursion
                 state; defaults to a fresh list).
    :param visited: set of already-crawled URLs shared across the
                    recursion so link cycles cannot loop forever.
    """
    path = [] if path is None else path
    visited = set() if visited is None else visited
    # Skip already-visited urls.  Use a plain ``return``: raising
    # StopIteration inside a generator is a RuntimeError under PEP 479.
    if url in visited:
        return
    # Fetch the index page; fail fast on HTTP errors before decoding.
    index_page = requests.get(url)
    index_page.raise_for_status()
    # Force utf-8 decoding (charset presumably absent from the response
    # headers -- TODO confirm against the server).
    index_page.encoding = "utf-8"
    soup = BeautifulSoup(index_page.text)
    # Record this page as visited only after a successful fetch.
    visited.add(url)
    for link in iter_links(soup, url):
        # The listing includes a "parent directory" link -- skip it to
        # avoid walking back up the tree.
        if u"父目录" in link.text:
            continue
        link_url = link.attrs["href"]
        _path = path + [link.text]
        if is_file(link_url):
            yield link_url, _path
        else:
            # Recurse into the sub-directory (Python 2: no ``yield from``).
            for item in search_photo(link_url, _path, visited):
                yield item
# Greenlet pool: at most 5 concurrent downloads at a time.
pool = gevent.pool.Pool(5)


def download_and_pack_photo(url, path, zipfile):
    """Download the photo at *url* and store it in *zipfile*.

    :param url: absolute URL of the photo file.
    :param path: breadcrumb list of link texts; joined with "/" to form
                 the archive member name.
    :param zipfile: an open ``ZipFile`` in write mode.
    """
    arcname = "/".join(path)
    response = requests.get(url)
    # Don't silently pack an HTML error page as if it were a photo.
    response.raise_for_status()
    print("* %s" % arcname)
    # NOTE(review): writestr is unlocked; this relies on gevent greenlets
    # only switching on I/O, not inside this CPU-bound call -- confirm.
    zipfile.writestr(arcname, response.content)
def pack_zip(filename):
    """Crawl BASE_URL and pack every photo found into the zip *filename*."""
    with ZipFile(filename, "w", allowZip64=True) as archive:
        for photo_url, breadcrumb in search_photo(BASE_URL):
            pool.spawn(download_and_pack_photo, photo_url, breadcrumb, archive)
        # Wait for every download greenlet before the ZipFile closes.
        pool.join()
# Script entry point: download everything into photo.zip in the cwd.
if __name__ == "__main__":
    pack_zip("photo.zip")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment