Last active
December 19, 2015 08:39
-
-
Save tonyseek/5927191 to your computer and use it in GitHub Desktop.
批量下载深圳大学毕业照
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
import gevent.monkey | |
import gevent.pool | |
gevent.monkey.patch_all() | |
import requests | |
from zipfile import ZipFile | |
from urlparse import urljoin | |
from bs4 import BeautifulSoup | |
# Root URL of the directory listing where the crawl starts.
BASE_URL = "http://210.39.3.84:2013"

# Lower-case file extensions that are treated as photos.
PHOTO_EXTS = frozenset(["jpg", "png", "bmp", "gif"])


def is_file(url):
    """Return True if *url* points at a photo, judged by its extension.

    The text after the last "." is compared case-insensitively against
    PHOTO_EXTS.  Any "?query" or "#fragment" suffix is ignored, so
    "a.jpg?size=big" still counts as a photo.

    :param url: URL (or path) string to classify.
    :returns: True when the extension is in PHOTO_EXTS, otherwise False.
    """
    # Cut the fragment first, then the query: neither is part of the name.
    resource = url.partition("#")[0].partition("?")[0]
    extension = resource.rsplit(".", 1)[-1].lower()
    return extension in PHOTO_EXTS
def iter_links(soup, current_url):
    """Yield each ``<a>`` tag in *soup* with its ``href`` made absolute.

    The tag's ``href`` attribute is rewritten in place by joining it with
    *current_url*, so callers can read ``link.attrs["href"]`` directly.
    Anchors that have no ``href`` attribute are skipped.

    :param soup: parsed document offering ``find_all("a")``.
    :param current_url: URL of the page the links were found on.
    """
    for link in soup.find_all("a"):
        href = link.attrs.get("href")
        if href is None:
            # e.g. <a name="top">: there is no target to resolve or follow.
            continue
        link.attrs["href"] = urljoin(current_url, href)
        yield link
def search_photo(url, path=None, visited=None):
    """Crawl the listing at *url* recursively and yield the photos found.

    Every link on the page is either yielded (when ``is_file`` accepts its
    URL) or followed as a sub-page.  Links whose text contains u"父目录"
    ("parent directory") are ignored.

    :param url: URL of the page to crawl.
    :param path: list of link texts leading to this page; defaults to [].
    :param visited: set of page URLs already crawled; it is shared with
        the recursive calls and updated in place.  Defaults to a new set.
    :returns: generator of ``(photo_url, path)`` tuples, where ``path`` is
        the list of link texts from the start page down to the photo.
    :raises requests.HTTPError: via ``raise_for_status()`` when a page
        request returns an error status.
    """
    path = [] if path is None else path
    visited = set() if visited is None else visited

    # Skip pages that were crawled already.  A bare ``return`` ends the
    # generator; ``raise StopIteration`` here would surface as a
    # RuntimeError on interpreters implementing PEP 479.
    if url in visited:
        return

    # Fetch the page and parse it.
    index_page = requests.get(url)
    index_page.encoding = "utf-8"
    index_page.raise_for_status()
    soup = BeautifulSoup(index_page.text)

    # Only mark the page as visited once it was fetched successfully.
    visited.add(url)

    for link in iter_links(soup, url):
        if u"父目录" in link.text:
            continue
        link_url = link.attrs["href"]
        _path = path + [link.text]
        if is_file(link_url):
            yield link_url, _path
        else:
            # Equivalent to ``yield from search_photo(...)``, spelled out
            # so the code keeps running on Python 2.
            for p in search_photo(link_url, _path, visited):
                yield p
# Module-wide greenlet pool of size 5; pack_zip() spawns its downloads here.
pool = gevent.pool.Pool(size=5)
def download_and_pack_photo(url, path, zipfile):
    """Download the photo at *url* and store it in *zipfile*.

    The archive member name is the elements of *path* joined with "/".
    A progress line "* <member name>" is printed for each stored photo.

    :param url: absolute URL of the photo.
    :param path: list of name components for the archive member.
    :param zipfile: open, writable archive offering ``writestr``.
    :raises requests.HTTPError: when the server answers with an error
        status; nothing is written to the archive in that case.
    """
    arcname = "/".join(path)
    response = requests.get(url)
    # Same check search_photo() applies to pages: without it the body of
    # an HTTP error response would be packed as if it were the photo.
    response.raise_for_status()
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("* %s" % arcname)
    zipfile.writestr(arcname, response.content)
def pack_zip(filename):
    """Crawl BASE_URL and pack every photo found into the zip *filename*.

    Each photo yielded by ``search_photo`` is handed to the module-level
    ``pool`` as a ``download_and_pack_photo`` job writing into one shared
    archive.

    :param filename: path of the zip archive to create (opened with "w").
    """
    with ZipFile(filename, "w", allowZip64=True) as zipfile:
        try:
            for url, path in search_photo(BASE_URL):
                pool.spawn(download_and_pack_photo, url, path, zipfile)
        finally:
            # Wait for the spawned jobs even when the crawl raised, so
            # none of them is left writing to an archive that the
            # ``with`` block has already closed.
            pool.join()
if __name__ == "__main__":
    # Script entry point: pack everything found into photo.zip in the
    # current working directory.
    pack_zip(filename="photo.zip")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment