# dlebech/paintings_crawl.py

## paintings_crawl.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Download images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx
# The image urls are outdated in the Excel sheet but the painting urls are not,
# so this script re-crawls those images and downloads them locally.
# It works as of July 2020.
#
# Run this first with:
# $ scrapy runspider paintings_crawl.py -o paintings.json
# Images are stored in 'out/raw'
#
# Then optionally create the appropriate folders with:
# $ python paintings_extract.py

# URL of the Excel sheet describing The Painting Dataset. It is fetched with
# urlretrieve further down in this script when no local copy exists yet.
THE_PAINTINGS_DATASET_URL = (
    "https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx"
)

import logging
import os
import urllib

import pandas as pd
import scrapy

logging.basicConfig(level=logging.INFO)


# Directory that the spider below passes to Scrapy as IMAGES_STORE.
outdir = "out/raw"
os.makedirs(outdir, exist_ok=True)

# Fetch the dataset spreadsheet once; later runs reuse the local copy.
filename = "paintings.xlsx"
if not os.path.exists(filename):
    # A bare `import urllib` does not load the `request` submodule, so
    # `urllib.request` is only reachable when some other module happened to
    # import it first. Import it explicitly before using it.
    import urllib.request

    logging.info("Downloading paintings dataset CSV.")
    urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)
df = pd.read_excel(filename)

# One painting page URL per spreadsheet row; the spider uses these as its
# start URLs.
image_urls = df["Web page URL"]
logging.info(f"Number of urls to crawl: {len(image_urls)}")


class PaintingsSpider(scrapy.Spider):
    """Visit every painting page and emit the image links found on it.

    Each yielded item holds an ``image_urls`` list (the field consumed by the
    enabled ``ImagesPipeline``) together with the ``url`` of the page that
    was requested.
    """

    name = "paintingsspider"

    # One request per painting page listed in the spreadsheet.
    start_urls = list(image_urls)

    custom_settings = {
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": outdir,
        "LOG_LEVEL": "INFO",
    }

    def parse(self, response):
        """Yield one item per ``div.single_img`` that contains an image.

        Containers whose ``img`` has no ``src`` attribute are skipped.
        """
        for container in response.css("div.single_img"):
            src = container.css("img::attr(src)").extract_first()
            if not src:
                continue
            logging.info(f"Found image {src}")
            yield {"image_urls": [src], "url": response.request.url}

## paintings_extract.py
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
#
# Run the crawler first:
# $ scrapy runspider paintings_crawl.py -o paintings.json
#
# Then optionally run this file:
# $ python paintings_extract.py
# Images are stored in out/organized folder.

import json
import logging
import os
import shutil

import pandas as pd

logging.basicConfig(level=logging.INFO)

# Source of the crawled images and root of the per-label folder tree.
rawdir = "out/raw"
outdir = "out/organized"
os.makedirs(outdir, exist_ok=True)

# Collapse rows that share a page URL into one and index the sheet by URL,
# so each crawl result can be looked up directly.
df = pd.read_excel("paintings.xlsx").groupby("Web page URL").first()

with open("paintings.json") as f:
    crawl_infos = json.load(f)

for crawl_info in crawl_infos:
    row = df.loc[crawl_info["url"]]
    # Split the Labels cell on spaces, drop single quotes and discard the
    # pieces that end up empty.
    pieces = str(row["Labels"]).split(" ")
    cleaned = (piece.strip().replace("'", "") for piece in pieces)
    labels = [piece for piece in cleaned if piece]
    # NOTE(review): "images" is presumably the field filled in by Scrapy's
    # ImagesPipeline during the crawl -- confirm against paintings.json.
    for image in crawl_info["images"]:
        for label in labels:
            # Every label gets its own folder holding a copy of the image.
            labeldir = os.path.join(outdir, label)
            os.makedirs(labeldir, exist_ok=True)
            source = os.path.join(rawdir, image["path"])
            target = os.path.join(labeldir, os.path.basename(image["path"]))
            shutil.copyfile(source, target)
	# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
	#
	# Download images from The Painting Dataset: https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx
	# The image urls are outdated in the Excel sheet but the painting urls are not,
	# so this script re-crawls those images and downloads them locally.
	# It works as of July 2020.
	#
	# Run this first with:
	# $ scrapy runspider paintings_crawl.py -o paintings.json
	# Images are stored in 'out/raw'
	#
	# Then optionally create the appropriate folders with:
	# $ python paintings_extract.py

	# URL of the Excel sheet describing The Painting Dataset. It is fetched with
	# urlretrieve further down in this script when no local copy exists yet.
	THE_PAINTINGS_DATASET_URL = (
	"https://www.robots.ox.ac.uk/~vgg/data/paintings/painting_dataset_2018.xlsx"
	)

	import logging
	import os
	import urllib

	import pandas as pd
	import scrapy

	logging.basicConfig(level=logging.INFO)


	# Directory that the spider below passes to Scrapy as IMAGES_STORE.
	outdir = "out/raw"
	os.makedirs(outdir, exist_ok=True)

	# Fetch the dataset spreadsheet once; later runs reuse the local copy.
	filename = "paintings.xlsx"
	if not os.path.exists(filename):
		# The body of this `if` had lost its indentation; both the log call
		# and the download belong to the "file is missing" branch.
		#
		# A bare `import urllib` does not load the `request` submodule, so
		# it is imported explicitly before use.
		import urllib.request

		logging.info("Downloading paintings dataset CSV.")
		urllib.request.urlretrieve(THE_PAINTINGS_DATASET_URL, filename)
	df = pd.read_excel(filename)

	# One painting page URL per spreadsheet row; the spider uses these as its
	# start URLs.
	image_urls = df["Web page URL"]
	logging.info(f"Number of urls to crawl: {len(image_urls)}")


	class PaintingsSpider(scrapy.Spider):
		"""Visit every painting page and emit the image links found on it.

		Each yielded item holds an ``image_urls`` list (the field consumed by
		the enabled ``ImagesPipeline``) together with the ``url`` of the page
		that was requested.
		"""

		name = "paintingsspider"

		custom_settings = {
			"ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
			"IMAGES_STORE": outdir,
			"LOG_LEVEL": "INFO",
		}

		# One request per painting page listed in the spreadsheet.
		start_urls = list(image_urls)

		def parse(self, response):
			"""Yield one item per ``div.single_img`` that contains an image.

			Containers whose ``img`` has no ``src`` attribute are skipped.
			"""
			for div in response.css("div.single_img"):
				image_src = div.css("img::attr(src)").extract_first()
				if image_src:
					logging.info(f"Found image {image_src}")
					yield {"image_urls": [image_src], "url": response.request.url}
	# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
	#
	# Run the crawler first:
	# $ scrapy runspider paintings_crawl.py -o paintings.json
	#
	# Then optionally run this file:
	# $ python paintings_extract.py
	# Images are stored in out/organized folder.

	import json
	import logging
	import os
	import shutil

	import pandas as pd

	logging.basicConfig(level=logging.INFO)

	# Source of the crawled images and root of the per-label folder tree.
	rawdir = "out/raw"
	outdir = "out/organized"
	os.makedirs(outdir, exist_ok=True)

	# Collapse rows that share a page URL into one and index the sheet by URL,
	# so each crawl result can be looked up directly.
	df = pd.read_excel("paintings.xlsx")
	df = df.groupby("Web page URL").first()

	with open("paintings.json") as f:
		crawl_infos = json.load(f)

	for crawl_info in crawl_infos:
		row = df.loc[crawl_info["url"]]
		# Split the Labels cell on spaces, drop single quotes and discard the
		# pieces that end up empty.
		labels = str(row["Labels"])
		labels = [l.strip().replace("'", "") for l in labels.split(" ")]
		labels = [l for l in labels if l]
		# NOTE(review): "images" is presumably the field filled in by Scrapy's
		# ImagesPipeline during the crawl -- confirm against paintings.json.
		for image in crawl_info["images"]:
			for label in labels:
				# Every label gets its own folder holding a copy of the image.
				labeldir = os.path.join(outdir, label)
				os.makedirs(labeldir, exist_ok=True)
				shutil.copyfile(
					os.path.join(rawdir, image["path"]),
					os.path.join(labeldir, os.path.basename(image["path"])),
				)