Skip to content

Instantly share code, notes, and snippets.

@drewlustro
Created November 14, 2018 05:42
Show Gist options
  • Save drewlustro/e7799a82b81432bc433ab0013988815f to your computer and use it in GitHub Desktop.
Save drewlustro/e7799a82b81432bc433ab0013988815f to your computer and use it in GitHub Desktop.
Art Institute of Chicago Scraper - scrapes Creative Commons Zero (CC0) licensed artworks with puppeteer and downloads images
const puppeteer = require("puppeteer");
const fs = require("fs");
const request = require("request");
/**
 * Collects artwork page URLs from the artic.edu public-domain collection
 * listing.
 *
 * Opens the listing in a visible (non-headless) browser, repeatedly scrolls
 * to the bottom and clicks the "load more" control so the listing grows,
 * then reads the `href` of every artwork link currently on the page.
 *
 * @param {number} [maxLoadMoreClicks=100] Upper bound on how many times the
 *   "load more" control is clicked.
 * @returns {Promise<string[]>} De-duplicated artwork page URLs, in the order
 *   they first appear on the page.
 */
const scrapeUrls = async (maxLoadMoreClicks = 100) => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1600, height: 1200 });
    await page.goto('https://www.artic.edu/collection?is_public_domain=1');
    await page.waitFor(3000);

    try {
      for (let clicks = 0; clicks < maxLoadMoreClicks; clicks++) {
        const previousHeight = await page.evaluate('document.body.scrollHeight');
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await page.click('[data-behavior="loadMore"]');
        // Wait until the page has actually grown before clicking again.
        await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      }
    } catch (err) {
      // Best effort: any failure while expanding the listing stops the
      // expansion, and whatever links are already on the page are still
      // harvested below. Presumably this is also how the loop ends once the
      // "load more" control is gone.
      console.log('Caught Error:', err);
    }

    const hrefs = await page.$$eval(
      '.o-collection-listing__colset a.m-listing__link',
      (links) => links.map((link) => link.href),
    );
    // The log line promises distinct URLs, so drop repeats (first one wins).
    const urls = [...new Set(hrefs)];
    console.log('Distinct URLS: ', urls, `Total Count: ${urls.length} URLs`);
    return urls;
  } finally {
    // Always release the browser, even when navigation or scraping throws.
    await browser.close();
  }
};
// ----------------------------------------------------------------------
/**
 * Visits each artwork page in a visible (non-headless) browser and clicks its
 * gallery download button.
 *
 * URLs are processed last-to-first. The input array is not modified. A
 * failure on one page is logged and the remaining pages are still attempted.
 *
 * @param {string[]} pageUrls Artwork page URLs to visit.
 * @returns {Promise<void>} Resolves once every URL has been attempted.
 */
const downloadArt = async (pageUrls) => {
  // Work on a reversed copy so the caller's array is left untouched while the
  // last-to-first processing order is kept. Falsy entries are dropped.
  const queue = Array.isArray(pageUrls)
    ? [...pageUrls].reverse().filter(Boolean)
    : [];
  if (queue.length === 0) {
    // Nothing to download, so do not open a browser window at all.
    return;
  }

  // NOTE(review): the browser is intentionally not closed here. Closing it
  // right after the last click might cancel a download that is still in
  // progress - confirm before adding browser.close().
  const browser = await puppeteer.launch({
    headless: false,
  });
  const page = await browser.newPage();
  await page.setViewport({
    width: 1600,
    height: 1200,
  });
  await page.goto('https://www.artic.edu/collection?is_public_domain=1');

  let count = 0;
  for (const url of queue) {
    try {
      await page.goto(url);
      await page.waitFor(500);
      // Clicking the gallery download button triggers the image download.
      await page.click('button[data-gallery-download]');
      count++;
      console.log(`Downloaded ${count} images`);
    } catch (err) {
      // One bad page must not abort the rest of the queue.
      console.log('Caught Error:', err);
    }
  }
};
/**
 * Script entry point: gathers the artwork page URLs, then downloads the image
 * for each of them.
 *
 * @returns {Promise<void>} Resolves when the download pass has finished.
 */
const run = async () => {
  const artworkPageUrls = await scrapeUrls();
  // Await the download pass so its failures propagate to the caller instead
  // of becoming an unhandled rejection.
  await downloadArt(artworkPageUrls);
};

run().catch((err) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
{
"name": "museum-scrape",
"version": "1.0.0",
"description": "Scrapes images off museum site with puppeteer",
"main": "index.js",
"author": "Drew Lustro",
"license": "MIT",
"private": true,
"dependencies": {
"fs": "^0.0.1-security",
"puppeteer": "^1.10.0",
"request": "^2.88.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment