Skip to content

Instantly share code, notes, and snippets.

@digitalWestie
Last active June 1, 2023 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save digitalWestie/c6ade9ed9b2bff2927e9ddb72bb71ce8 to your computer and use it in GitHub Desktop.
Scraping and data-prep code for USDA Pomological Watercolour Collection using NodeJS
import * as fs from 'node:fs';
import fetch from 'node-fetch';
import cheerio from 'cheerio';
// Download the image at `url` and write it to ./<imageName>.
// Resolves true on success, false on HTTP failure or write failure.
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) { return false; }
  try {
    // Wait for the write stream to finish so the file is fully flushed
    // before reporting success — returning immediately after .pipe()
    // risks a truncated image if the process moves on or exits.
    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream('./' + imageName);
      out.on('finish', resolve);
      out.on('error', reject);
      response.body.pipe(out);
    });
    return true;
  } catch (err) {
    console.log("Err couldn't write " + imageName, err);
    return false;
  }
}
// Fetch a search-results page and hand back its raw HTML body,
// or false when the request fails.
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (response.ok) {
    const body = await response.text();
    return body; // TODO: dig in to the markup and return a list of urls!
  }
  console.log("Err couldn't fetch page!", response.status)
  return false
}
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=10&q=&search_field=all_fields";
// `downloadResults` does not exist at this step (it appears in step 1),
// so calling it here throws a ReferenceError. Exercise the page fetch
// directly instead, and attach .catch so a network failure surfaces as
// a logged error rather than an unhandled rejection.
extractThumbnailUrls(exampleResultsUrl).then((body) => console.log(body)).catch(console.error);
downloadImage("https://naldc.nal.usda.gov/download/POM00004894/thumbnail", "test.jpg").catch(console.error);
import * as fs from 'node:fs';
import fetch from 'node-fetch';
import cheerio from 'cheerio';
import { setTimeout } from 'timers/promises';
// Download the image at `url` and write it to ./<imageName>.
// Resolves true on success, false on HTTP failure or write failure.
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) { return false; }
  try {
    // Wait for the write stream to finish so the file is fully flushed
    // before reporting success — returning immediately after .pipe()
    // risks a truncated image if the process moves on or exits.
    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream('./' + imageName);
      out.on('finish', resolve);
      out.on('error', reject);
      response.body.pipe(out);
    });
    return true;
  } catch (err) {
    console.log("Err couldn't write " + imageName, err);
    return false;
  }
}
// Fetch a search-results page and return the thumbnail URL of every
// result as a plain array of strings, or false when the request fails.
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status)
    return false
  }
  const body = await response.text();
  const $ = cheerio.load(body);
  // .get() unwraps the cheerio collection into a real Array — the
  // original returned the cheerio wrapper, which only happens to be
  // iterable; callers expecting an array (map/filter/length) would break.
  return $('#documents article img').map((idx, element) => element.attribs.src).get();
}
// Scrape one results page and download every thumbnail it lists,
// pausing between downloads to be polite to the server.
const downloadResults = async function(resultsUrl){
  const sources = await extractThumbnailUrls(resultsUrl);
  for (const src of sources) {
    console.log("Downloading " + src);
    // Turn the url into a reasonable filename:
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const filename = src.replaceAll('/', '-').replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";
    const result = await downloadImage(src, filename);
    await setTimeout(1500); // rate-limit between requests
    if (!result) { console.log("Failed to download " + src); }
  }
}
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=10&q=&search_field=all_fields";
// The original logged the pending Promise object itself; await the run
// via .then and surface failures instead of leaving the rejection
// unhandled.
downloadResults(exampleResultsUrl).then(() => console.log("Done")).catch(console.error);
import * as fs from 'node:fs';
import fetch from 'node-fetch';
import cheerio from 'cheerio';
import { setTimeout } from 'timers/promises';
// Download the image at `url` and write it to ./<imageName>.
// Resolves true on success, false on HTTP failure or write failure.
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) { return false; }
  try {
    // Wait for the write stream to finish so the file is fully flushed
    // before reporting success — returning immediately after .pipe()
    // risks a truncated image if the process moves on or exits.
    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream('./' + imageName);
      out.on('finish', resolve);
      out.on('error', reject);
      response.body.pipe(out);
    });
    return true;
  } catch (err) {
    console.log("Err couldn't write " + imageName, err);
    return false;
  }
}
// Scrape one results page and return the thumbnail src of every result
// whose common name does not mention "apple"; false on fetch failure.
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status)
    return false
  }
  const $ = cheerio.load(await response.text());
  const sources = [];
  for (const article of $('#documents article')) {
    // Re-load just this article so selectors apply to this result only.
    const $a = cheerio.load(article);
    const fruit = $a('dd.blacklight-common_name').text();
    const img = $a('img')[0];
    if (fruit.includes("apple")) {
      console.log("Skipping " + img.attribs.src);
      continue;
    }
    sources.push(img.attribs.src);
  }
  return sources;
}
// Scrape one results page and download each listed thumbnail in turn,
// sleeping between downloads so we don't hammer the server.
const downloadResults = async function(resultsUrl){
  const sources = await extractThumbnailUrls(resultsUrl);
  for (const src of sources) {
    console.log("Downloading " + src);
    // Derive a filename from the url, e.g.
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const dashed = src.split('/').join('-');
    const name = dashed.replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";
    const ok = await downloadImage(src, name);
    await setTimeout(1500);
    if (!ok) {
      console.log("Failed to download " + src);
    }
  }
}
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=10&q=&search_field=all_fields";
// Attach .catch so a network failure is reported rather than crashing
// the process with an unhandled promise rejection.
downloadResults(exampleResultsUrl).catch(console.error);
import * as fs from 'node:fs';
import fetch from 'node-fetch';
import cheerio from 'cheerio';
import { setTimeout } from 'timers/promises';
// Download the image at `url` and write it to ./<imageName>.
// Resolves true on success, false on HTTP failure or write failure.
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) { return false; }
  try {
    // Wait for the write stream to finish so the file is fully flushed
    // before reporting success — returning immediately after .pipe()
    // risks a truncated image if the process moves on or exits.
    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream('./' + imageName);
      out.on('finish', resolve);
      out.on('error', reject);
      response.body.pipe(out);
    });
    return true;
  } catch (err) {
    console.log("Err couldn't write " + imageName, err);
    return false;
  }
}
// Scrape one results page and collect the thumbnail src of every result
// whose common name does not mention "apple"; false on fetch failure.
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status)
    return false
  }
  const markup = await response.text();
  const $ = cheerio.load(markup);
  const sources = [];
  for (const article of $('#documents article')) {
    // Load this single article into its own cheerio context so we can
    // query within just this result.
    const $a = cheerio.load(article);
    const fruit = $a('dd.blacklight-common_name').text();
    const img = $a('img')[0];
    if (fruit.includes("apple")) {
      console.log("Skipping " + img.attribs.src);
    } else {
      sources.push(img.attribs.src);
    }
  }
  return sources;
}
// Scrape one results page and download every listed thumbnail,
// pausing 1.5s between downloads as a courtesy rate limit.
const downloadResults = async function(resultsUrl){
  const sources = await extractThumbnailUrls(resultsUrl);
  for (const src of sources) {
    console.log("Downloading " + src);
    // Build a filename from the url, e.g.
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const filename = src
      .split('/')
      .join('-')
      .replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";
    const succeeded = await downloadImage(src, filename);
    await setTimeout(1500);
    if (!succeeded) { console.log("Failed to download " + src); }
  }
}
// Download thumbnails from `pages` consecutive result pages,
// starting at page number `pageStart`.
const downloadPages = async function(baseResultsUrl, pageStart, pages) {
  const endExclusive = pageStart + pages;
  for (let page = pageStart; page < endExclusive; page++) {
    console.log("---- Downloading results for page " + page);
    await downloadResults(baseResultsUrl + "&page=" + page);
  }
}
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=10&q=&search_field=all_fields";
// Attach .catch so a network failure is reported rather than crashing
// the process with an unhandled promise rejection.
downloadPages(exampleResultsUrl, 1, 2).catch(console.error);
import * as fs from 'node:fs';
import fetch from 'node-fetch';
import cheerio from 'cheerio';
import { setTimeout } from 'timers/promises';
// Download the image at `url` and write it to ./<imageName>.
// Resolves true on success, false on HTTP failure or write failure.
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) { return false; }
  try {
    // Wait for the write stream to finish so the file is fully flushed
    // before reporting success — returning immediately after .pipe()
    // risks a truncated image if the process moves on or exits.
    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream('./' + imageName);
      out.on('finish', resolve);
      out.on('error', reject);
      response.body.pipe(out);
    });
    return true;
  } catch (err) {
    console.log("Err couldn't write " + imageName, err);
    return false;
  }
}
// Scrape one results page and return an array of
// { source, fruit } objects for every non-apple result
// (apples are deliberately skipped); false on fetch failure.
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status)
    return false
  }
  const $ = cheerio.load(await response.text());
  const fruits = [];
  for (const article of $('#documents article')) {
    // Load this single article into its own cheerio context so
    // selectors apply to this result only.
    const $a = cheerio.load(article);
    const fruit = $a('dd.blacklight-common_name').text().trim();
    const img = $a('img')[0];
    if (fruit.includes("apple")) {
      console.log("Skipping " + img.attribs.src);
      continue;
    }
    fruits.push({ "source": img.attribs.src, "fruit": fruit });
  }
  return fruits;
}
// Persist the collected fruit records to saved-fruits.json.
// Fire-and-forget, but report write failures instead of silently
// swallowing them (the original's callback ignored `err`).
const writeSaved = function(saved) {
  fs.writeFile("saved-fruits.json", JSON.stringify(saved), {}, (err) => {
    if (err) { console.log("Err couldn't write saved-fruits.json", err); }
  });
}
// Scrape one results page, download every non-apple thumbnail, and
// return the array of successfully saved records (each record gains a
// `filename` property). Pauses 1.5s between downloads as a rate limit.
const downloadResults = async function(resultsUrl){
  const fruits = await extractThumbnailUrls(resultsUrl);
  const saved = [];
  // extractThumbnailUrls returns false when the page fetch fails;
  // bail out with an empty result instead of crashing on
  // `for...of false` (TypeError: false is not iterable).
  if (!fruits) { return saved; }
  for (const fruit of fruits) {
    console.log("Downloading " + fruit.source);
    // Turn url into a reasonable filename
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const noSlashes = fruit.source.split('/').join('-');
    const filename = noSlashes.replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";
    // Download and then pause before next step
    const result = await downloadImage(fruit.source, filename);
    await setTimeout(1500);
    if (!result) {
      console.log("Failed to download " + fruit.source);
    } else {
      fruit["filename"] = filename;
      saved.push(fruit);
    }
  }
  return saved;
}
// Download several result pages starting at `pageStart`, collect the
// records of every successfully saved thumbnail, then persist the whole
// collection to saved-fruits.json via writeSaved.
const downloadPages = async function(baseResultsUrl, pageStart, pages) {
let saved = [];
// NOTE(review): this loop steps by 3, so only every third page in the
// range [pageStart, pageStart+pages) is fetched — the step-3 version
// used ++i. Presumably intentional sampling to thin the dataset;
// confirm before reusing this as a general pager.
for (var i = pageStart; i < pageStart+pages; i+=3) {
console.log("---- Downloading results for page " + i);
const result = await downloadResults(baseResultsUrl + "&page=" + i);
saved = saved.concat(result);
}
writeSaved(saved);
}
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=50&q=&search_field=all_fields";
// Attach .catch so a network failure is reported rather than crashing
// the process with an unhandled promise rejection.
downloadPages(exampleResultsUrl, 1, 50).catch(console.error);

Creating data-driven fabric designs with JavaScript

P1. Scraping and data preparation with nodejs

You've watched the screencast, here's the code! The screencast starts off with fruit-step-0.mjs, and progresses from there.

The changes for each step are as follows:

  • fruit-step-0.mjs: we start here
  • fruit-step-1.mjs: adds thumbnail url parsing and downloadResults function to download thumbnails from a single result page
  • fruit-step-2.mjs: adds parsing of each result's common name and adds skip condition for apples
  • fruit-step-3.mjs: adds paging
  • fruit-step-4.mjs: adds collection of saved results with common name into an array of objects, then saving to 'saved-fruits.json'

Finally, we make a directory called 'cropped' and create a cropped version of each downloaded image using ImageMagick's 'chop' command:

mkdir cropped
for f in ./*.jpg ; do convert "$f" -gravity north -chop 0x10 -gravity east -chop 10x0 -gravity south -chop 0x10 -gravity west -chop 10x0 "cropped/${f%.jpg}.jpg" ; done

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment