# Python script for downloading and organizing images from The Painting Dataset.
# Public Domain CC0 license.
# Download images from The Painting Dataset:
# The image urls are outdated in the Excel sheet but the painting urls are not,
# so this script re-crawls those images and downloads them locally.
# It works as of July 2020.
# Run this first with:
# $ scrapy runspider -o paintings.json
# Images are stored in 'out/raw'
# Then optionally create the appropriate folders with:
# $ python
import logging
import os
import urllib
import urllib.request

import pandas as pd
import scrapy
# Source of the Excel sheet listing the paintings.
# NOTE(review): exact URL lost in the scrape — TODO confirm against the
# original gist / The Painting Dataset homepage before running.
THE_PAINTINGS_DATASET_URL = (
    "https://www.robots.ox.ac.uk/~vgg/data/paintings/paintings.xlsx"
)

# Downloaded images land here (scrapy's IMAGES_STORE points at it below).
outdir = "out/raw"
os.makedirs(outdir, exist_ok=True)

# Fetch the spreadsheet once; reuse the local copy on subsequent runs.
filename = "paintings.xlsx"
if not os.path.exists(filename):
    logging.info("Downloading paintings dataset CSV.")
    urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)

df = pd.read_excel(filename)
# The painting page URLs are still valid (unlike the image URLs), so these
# are what the spider crawls.
image_urls = df["Web page URL"]
logging.info(f"Number of urls to crawl: {len(image_urls)}")
class PaintingsSpider(scrapy.Spider):
    """Crawl each painting's web page and yield its image URL for download.

    Yielded items are consumed by scrapy's built-in ImagesPipeline, which
    downloads every URL in 'image_urls' into IMAGES_STORE ('out/raw').
    """

    name = "paintingsspider"
    custom_settings = {
        # Route items through the stock ImagesPipeline so images are fetched
        # and stored automatically.
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": outdir,
    }
    # One request per painting page URL from the spreadsheet.
    start_urls = list(image_urls)

    def parse(self, response):
        """Extract the main image from a painting page.

        Yields a dict with 'image_urls' (for ImagesPipeline) and the page
        'url' (used later to match the painting back to its spreadsheet row).
        """
        for div in response.css("div.single_img"):
            image_src = div.css("img::attr(src)").extract_first()
            if image_src:
                logging.info(f"Found image {image_src}")
                yield {"image_urls": [image_src], "url": response.request.url}
# Public Domain CC0 license.
# Run the crawler first:
# $ scrapy runspider -o paintings.json
# Then optionally run this file:
# $ python
# Images are stored in out/organized folder.
import json
import logging
import os
import shutil
import pandas as pd
rawdir = "out/raw"
outdir = "out/organized"
os.makedirs(outdir, exist_ok=True)

# Index the sheet by page URL so crawl results can be joined back to rows.
# Pages can repeat in the sheet; keep the first occurrence.
df = pd.read_excel("paintings.xlsx")
df = df.groupby("Web page URL").first()

# Output of the scrapy run: one record per crawled page, with the local
# paths of the downloaded images under 'images'.
with open("paintings.json") as f:
    crawl_infos = json.load(f)

for crawl_info in crawl_infos:
    row = df.loc[crawl_info["url"]]
    # The Labels column is a space-separated list of quoted labels;
    # strip the quotes and drop empty tokens.
    labels = str(row["Labels"])
    labels = [l.strip().replace("'", "") for l in labels.split(" ")]
    labels = [l for l in labels if l]
    for image in crawl_info["images"]:
        for label in labels:
            labeldir = os.path.join(outdir, label)
            os.makedirs(labeldir, exist_ok=True)
            # Copy (not move) so 'out/raw' stays intact; an image with
            # several labels is copied into each label folder.
            # NOTE(review): the original call name was lost in the scrape —
            # shutil.copy matches the imports and intent; confirm.
            shutil.copy(
                os.path.join(rawdir, image["path"]),
                os.path.join(labeldir, os.path.basename(image["path"])),
            )