Skip to content

Instantly share code, notes, and snippets.

@jimklo
Last active January 21, 2023 20:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jimklo/8b0883e6b3d9553f2115d2df3e858f46 to your computer and use it in GitHub Desktop.
Save jimklo/8b0883e6b3d9553f2115d2df3e858f46 to your computer and use it in GitHub Desktop.
Gallery Image Download
name: imgextract
channels:
- conda-forge
- defaults
dependencies:
- bzip2=1.0.8
- ca-certificates=2022.12.7
- geckodriver=0.32.0
- libcxx=14.0.6
- libffi=3.4.2
- libsqlite=3.40.0
- libzlib=1.2.13
- ncurses=6.3
- openssl=3.0.7
- pip=22.3.1
- python=3.11.0
- python-chromedriver-binary=2.42.0
- readline=8.1.2
- setuptools=66.1.0
- tk=8.6.12
- tzdata=2022g
- wheel=0.38.4
- xz=5.2.6
- pip:
  - async-generator==1.10
  - attrs==22.2.0
  - certifi==2022.12.7
  - charset-normalizer==3.0.1
  - h11==0.14.0
  - idna==3.4
  - outcome==1.2.0
  - pysocks==1.7.1
  - requests==2.28.2
  - selenium==4.7.2
  - sniffio==1.3.0
  - sortedcontainers==2.4.0
  - trio==0.22.0
  - trio-websocket==0.9.2
  - urllib3==1.26.14
  - wsproto==1.2.0
prefix: /usr/local/Caskroom/miniconda/base/envs/imgextract
from dataclasses import dataclass
import re
import unicodedata
import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from pathlib import Path
@dataclass
class GalleryImg:
    """
    One image found on a gallery page, described by its alt text and URL.
    """

    # Text taken from the <img> element's ``alt`` attribute.
    alt: str
    # Address taken from the <img> element's ``src`` attribute.
    src: str
def get_images(uri: str):
    """
    Collect the Midjourney CDN images shown on a gallery page.

    Opens ``uri`` in a Firefox WebDriver session, pauses for five seconds,
    then inspects every ``<img>`` element on the page. Only images whose
    ``src`` starts with ``https://cdn.midjourney.com/`` and ends with
    ``.webp`` are kept.

    Args:
        uri: Address of the gallery page to load.

    Returns:
        A list of ``GalleryImg`` records (alt text and source URL), in the
        order the elements were returned by the browser.
    """
    browser = webdriver.Firefox()
    try:
        browser.get(uri)
        # Fixed pause before scanning -- presumably to let the page finish
        # loading its images; there is no explicit wait condition.
        sleep(5)
        images = []
        for elem in browser.find_elements(By.TAG_NAME, 'img'):
            # get_attribute returns None when the attribute is absent, so
            # normalise to an empty string before calling str methods.
            src = elem.get_attribute('src') or ""
            if src.startswith("https://cdn.midjourney.com/") and src.endswith(".webp"):
                alt_text = elem.get_attribute('alt') or ""
                images.append(GalleryImg(alt_text, src))
        return images
    finally:
        # quit() ends the whole driver session (not just the window) and
        # runs even when loading or scanning the page raises.
        browser.quit()
def get_image_name(img: GalleryImg):
    """
    Build a file name for a gallery image from its alt text and URL.

    The stem is the slugified alt text with the ``,.*$`` match removed
    (everything from the first comma onward for single-line text). The
    extension is the trailing ``.<alphanumerics>`` of ``img.src``, falling
    back to ``webp`` when the URL does not end in one. The stem is truncated
    so the complete name never exceeds 254 characters.

    Args:
        img: Image record providing ``alt`` and ``src``.

    Returns:
        A name of the form ``<stem>.<extension>``.
    """
    root = slugify(re.sub(r",.*$", "", img.alt or ""))
    if not root:
        # An empty slug would otherwise yield a hidden ".<ext>" file.
        root = "image"
    # re.search rather than re.match: the extension sits at the END of the
    # URL, and re.match only ever tests the beginning of the string.
    m = re.search(r"\.([A-Za-z0-9]+)$", img.src)
    suffix = m.group(1) if m else "webp"
    if len(root) + len(suffix) >= 254:
        # Leave room for the dot and the extension.
        root = root[:253 - len(suffix)]
    return f"{root}.{suffix}"
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    The value is converted to ``str`` and normalised: NFKC when
    ``allow_unicode`` is true, otherwise NFKD followed by dropping every
    non-ASCII character. The text is then lowercased, characters other than
    word characters, whitespace and hyphens are removed, runs of whitespace
    and hyphens collapse into a single hyphen, and leading/trailing hyphens
    and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)
    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', cleaned)
    return hyphenated.strip('-_')
def download_img(img: GalleryImg, tgt: Path = Path("output"), timeout: float = 30.0):
    """
    Download one gallery image into a target directory.

    The directory ``tgt`` is created if missing. The image at ``img.src`` is
    fetched and, when the server answers with status 200, written to
    ``tgt / get_image_name(img)``. Any other status is skipped silently
    (best-effort download); an existing file of the same name is overwritten.

    Args:
        img: Image record to download.
        tgt: Directory that receives the file.
        timeout: Seconds to wait for the server before ``requests`` raises.
    """
    tgt.mkdir(parents=True, exist_ok=True)
    # The context manager releases the streamed connection on every path.
    with requests.get(img.src, stream=True, timeout=timeout) as resp:
        if resp.status_code != 200:
            return
        filename = tgt / get_image_name(img)
        with filename.open('wb') as wh:
            # iter_content applies content decoding and streams in chunks,
            # so the whole image is never held in memory at once.
            for chunk in resp.iter_content(chunk_size=65536):
                wh.write(chunk)
if __name__ == "__main__":
    # Script entry point: scrape the showcase page, then download every
    # image found there using download_img's default target directory.
    gallery_uri = "https://midjourney.com/showcase/recent/?fbclid=IwAR3WriA1xPrYwEoF4CGvl98yN-ZieNLRkhk2hz05ia5yqrxQYbTtMi2p0e4"
    for img in get_images(gallery_uri):
        download_img(img)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment