Skip to content

Instantly share code, notes, and snippets.

@Nanguage
Created February 12, 2018 15:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nanguage/0edc85bb3a3acb546f5ec836c3c08757 to your computer and use it in GitHub Desktop.
Save Nanguage/0edc85bb3a3acb546f5ec836c3c08757 to your computer and use it in GitHub Desktop.
"""
提取 S1 壁纸战页面中的壁纸链接并下载
https://bbs.saraba1st.com/2b/thread-1579059-1-1.html
"""
import os
import pathlib
import re
import urllib3
from typing import List, Optional
import concurrent.futures
import click
import wget
import ipdb
# Default link pattern: lazily matches from "http://wx" up to the first
# following ".jpg". Written as a raw string so that `\.` reaches the regex
# engine as an escaped dot instead of being an invalid string escape.
PATTERN = r"http://wx.*?\.jpg"


def extract_img_links(html: str, pattern: Optional[str] = None) -> List[str]:
    """Return every substring of *html* that matches *pattern*.

    Args:
        html: Text to scan.
        pattern: Regular expression to search for. ``None`` (the default)
            selects the module-level ``PATTERN``.

    Returns:
        The matches exactly as produced by ``re.findall``, in order of
        appearance; duplicates are kept.
    """
    if pattern is None:
        pattern = PATTERN
    return re.findall(pattern, html)
def download_links(links: List[str], outdir: str, threads: int) -> None:
    """Download every URL in *links* into *outdir* using a thread pool.

    Each URL is fetched with ``wget.download``. One line is printed per URL
    reporting success or failure; a failed download does not stop the
    remaining ones.

    Args:
        links: URLs to download.
        outdir: Passed to ``wget.download`` as its ``out`` argument.
        threads: ``max_workers`` for the ``ThreadPoolExecutor``.
    """

    def down_img(url: str) -> None:
        # Keep the try body to the single call that is expected to fail.
        try:
            wget.download(url, out=outdir, bar=None)
        except Exception:
            # Best effort: report and carry on. ``Exception`` instead of a
            # bare ``except`` so that SystemExit and other BaseException
            # subclasses are not reported as an ordinary failed download.
            print(f"\"{url}\" download failed!")
        else:
            print(f"\"{url}\" download succees!")

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as ex:
        # The results of map() are not consumed: down_img reports its own
        # outcome, and leaving the ``with`` block waits for all downloads.
        ex.map(down_img, links)
@click.command()
@click.argument("input-html")
@click.option("--outdir", "-O",
              default=os.path.curdir,
              help="output directory")
@click.option("--pattern", "-p",
              help="target pattern(regular expression)")
@click.option("--threads", "-t",
              default=4,
              # Reject 0/negative values at the CLI boundary; the thread pool
              # would otherwise fail later with a ValueError traceback.
              type=click.IntRange(min=1),
              help="threads used to download images")
def _main(input_html, outdir, pattern, threads):
    """Extract image links from the page INPUT_HTML and download them."""
    # Decode leniently so that a stray non-UTF-8 byte in the page does not
    # abort the run. NOTE(review): assumes the links of interest are plain
    # ASCII and therefore unaffected by replacement characters - confirm.
    with open(input_html, 'rb') as f:
        html = f.read().decode('utf-8', errors='replace')
    links = extract_img_links(html, pattern)
    # parents=True creates nested output directories; exist_ok=True replaces
    # the separate is_dir() check and its check-then-create race.
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)
    download_links(links, outdir, threads)


if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment