Skip to content

Instantly share code, notes, and snippets.

@nyarurato
Last active Aug 29, 2015
Embed
What would you like to do?
#coding:utf-8
import os
import urllib.request
import time
from bs4 import BeautifulSoup
# Extract the URLs of all comic listing pages (start page + pagination links)
def scan_url(url, crtdir):
    """Crawl *url* plus every page linked from its pagination bar.

    Fetches *url*, collects the hrefs inside the <div class="pagination">
    element, and runs scan_article() on the start page and each linked
    page.  Downloaded images end up under *crtdir*.

    Args:
        url: Listing page to start from.
        crtdir: Directory path passed through to scan_article().
    """
    pages = [url]
    req = urllib.request.urlopen(url)
    # Name the parser explicitly: BeautifulSoup(req) picks whichever parser
    # happens to be installed and emits a warning in recent bs4 versions.
    soup = BeautifulSoup(req, "html.parser")
    pg = soup.find("div", class_="pagination")
    # Guard: a site with a single page has no pagination div at all;
    # the original code crashed here with AttributeError on None.
    if pg is not None:
        for a in pg.find_all("a"):
            href = a.get("href")
            # Skip anchors without an href and avoid scanning a page twice.
            if href and href not in pages:
                pages.append(href)
    for p in pages:
        scan_article(p, crtdir)
# Extract image URLs from the <article> tags on one listing page
def scan_article(url, crtdir):
    """Download the first image of every article entry found on *url*.

    Finds all <article class="loop-entry"> elements, takes each one's
    first <img>, strips any query string from its src, and hands the
    cleaned URL to download_img().  Sleeps between downloads so the
    server is not hammered.

    Args:
        url: Page whose article thumbnails should be downloaded.
        crtdir: Directory path passed through to download_img().
    """
    waittime = 5.0  # seconds to sleep between downloads (politeness delay)
    req = urllib.request.urlopen(url)
    # Explicit parser for deterministic behaviour (see scan_url).
    soup = BeautifulSoup(req, "html.parser")
    articles = soup.find_all("article", class_="loop-entry")
    for atcl in articles:
        img = atcl.find("img")
        # Guard: an entry without an image (or without a src attribute)
        # previously crashed with AttributeError / TypeError.
        if img is None:
            continue
        src = img.get("src")
        if not src:
            continue
        # Strip everything after "?" so the bare file URL remains
        # (query strings here are presumably resize params — unverified).
        if "?" in src:
            src = src.split("?")[0]
        print("found " + src)
        download_img(src, crtdir)
        # Sleep between requests
        time.sleep(waittime)
def download_img(src, crtdir):
    """Download *src* into directory *crtdir*, keeping the URL's file name.

    The file name is the last path component of *src*.  Errors are printed
    but not raised: one failed image must not abort the whole crawl
    (deliberate best-effort behaviour).

    Args:
        src: URL of the image to fetch.
        crtdir: Existing directory to write the file into.
    """
    fname = src.split("/")[-1]
    try:
        # os.path.join is the portable spelling of crtdir + os.sep + fname.
        urllib.request.urlretrieve(src, os.path.join(crtdir, fname))
        print("Download " + fname)
    except Exception as e:
        # Intentionally broad: report and continue with the next image.
        print(e)
def make_dir():
    """Ensure an "imgs" directory exists under the CWD and return its path.

    Returns:
        Absolute path of the "imgs" directory, created if necessary.
    """
    crtdir = os.path.abspath("imgs")
    # makedirs(..., exist_ok=True) removes the check-then-create race and
    # always operates on the absolute path.  The original called
    # os.mkdir("imgs") with the *relative* name, which diverges from the
    # returned path if the CWD changes between the two statements.
    os.makedirs(crtdir, exist_ok=True)
    return crtdir
# Script entry point: create the output directory, then crawl the comic
# index page (and its pagination) for images.
if __name__ == "__main__":
    crtdir = make_dir()
    scan_url("http://pronama.azurewebsites.net/web-comic/",crtdir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment