import * as fs from 'node:fs'; |
import fetch from 'node-fetch'; |
import cheerio from 'cheerio'; |
import { setTimeout } from 'timers/promises'; |
/**
 * Download a single image and save it in the current working directory.
 *
 * The returned promise settles only after the whole response body has been
 * written, so a `true` result means the file is complete on disk.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} imageName - File name, resolved relative to `./`.
 * @returns {Promise<boolean>} `true` once the file is saved, `false` when the
 *   server answered with a non-OK status.
 * @throws Rejects when the request fails or the file cannot be written.
 */
export const downloadImage = async function(url, imageName) {
  const response = await fetch(url);
  if (!response.ok) {
    return false;
  }
  // writeFile accepts a stream / async iterable and resolves only after every
  // chunk has been written. A bare pipe() returns immediately, which reported
  // success before the file existed and left write errors unobserved.
  await fs.promises.writeFile('./' + imageName, response.body);
  return true;
};
/**
 * Scrape one results page and collect the thumbnail of every entry whose
 * common name does not contain "apple".
 *
 * Articles that have no <img> (or an <img> without a `src`) are skipped with
 * a log line instead of throwing.
 *
 * @param {string} url - Address of the results page.
 * @returns {Promise<Array<{source: string, fruit: string}>|false>} One record
 *   per kept article, or `false` when the page could not be fetched.
 */
export const extractThumbnailUrls = async function(url) {
  const response = await fetch(url);
  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status);
    return false;
  }
  const body = await response.text();
  const $ = cheerio.load(body);
  const fruits = [];
  for (const article of $('#documents article')) {
    // Wrap the node so that `find` searches only inside this article.
    const $article = $(article);
    const fruit = $article.find('dd.blacklight-common_name').text().trim();
    // attr() reads from the first matched <img>; it yields undefined when the
    // article has no image or the image has no src attribute.
    const source = $article.find('img').attr('src');
    if (!source) {
      console.log("Skipping article without a thumbnail: " + fruit);
      continue;
    }
    if (fruit.includes("apple")) {
      console.log("Skipping " + source);
      continue;
    }
    fruits.push({ source, fruit });
  }
  return fruits;
};
/**
 * Write the collected records to `saved-fruits.json` in the current working
 * directory as compact JSON.
 *
 * This is best-effort: a failure is reported on stderr rather than thrown, so
 * callers that do not await the result never see an unhandled rejection.
 *
 * @param {Array<object>} saved - Records to serialise.
 * @returns {Promise<void>} Resolves once the write attempt has finished.
 */
const writeSaved = async function(saved) {
  try {
    await fs.promises.writeFile("saved-fruits.json", JSON.stringify(saved));
  } catch (err) {
    // Previously the error was dropped by an empty callback.
    console.error("Failed to write saved-fruits.json:", err);
  }
};
/**
 * Download every thumbnail listed on one results page.
 *
 * Images are fetched one at a time with a 1.5 s pause after each attempt.
 * A thumbnail that cannot be downloaded is logged and left out of the result.
 *
 * @param {string} resultsUrl - Address of the results page to process.
 * @returns {Promise<Array<{source: string, fruit: string, filename: string}>>}
 *   The records whose image was saved, each extended with its `filename`.
 *   Empty when the results page itself could not be read.
 */
const downloadResults = async function(resultsUrl) {
  const fruits = await extractThumbnailUrls(resultsUrl);
  // extractThumbnailUrls signals a failed page fetch with `false`, which is
  // not iterable; treat that page as having produced nothing.
  if (!Array.isArray(fruits)) {
    return [];
  }
  const saved = [];
  for (const fruit of fruits) {
    console.log("Downloading " + fruit.source);
    // Turn url into a reasonable filename
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const noSlashes = fruit.source.replaceAll('/', '-');
    const filename = noSlashes.replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";
    let result = false;
    try {
      result = await downloadImage(fruit.source, filename);
    } catch (err) {
      // A thrown error goes through the same "failed" path as a non-OK
      // response, so one bad image does not abort the rest of the page.
      console.log("Error while downloading " + fruit.source, err);
    }
    // Pause before the next request.
    await setTimeout(1500);
    if (!result) {
      console.log("Failed to download " + fruit.source);
      continue;
    }
    // Copy rather than mutate the record returned by the extractor.
    saved.push({ ...fruit, filename });
  }
  return saved;
};
/**
 * Download the thumbnails of a run of consecutive result pages and write the
 * combined list of saved records via `writeSaved`.
 *
 * @param {string} baseResultsUrl - Results URL that already carries a query
 *   string; `&page=N` is appended for each page.
 * @param {number} pageStart - Number of the first page to fetch.
 * @param {number} pages - How many consecutive pages to fetch.
 * @returns {Promise<void>}
 */
const downloadPages = async function(baseResultsUrl, pageStart, pages) {
  const saved = [];
  const pageEnd = pageStart + pages;
  // Step by one: the earlier `i += 3` visited only every third page of the
  // requested range.
  for (let page = pageStart; page < pageEnd; page += 1) {
    console.log("---- Downloading results for page " + page);
    const result = await downloadResults(`${baseResultsUrl}&page=${page}`);
    saved.push(...result);
  }
  // Awaiting is harmless if writeSaved returns nothing, and lets this
  // function finish only after the write when it returns a promise.
  await writeSaved(saved);
};
// Script entry point: start a run against the USDA pomological watercolor
// search results with pageStart = 1 and pages = 50.
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=50&q=&search_field=all_fields";
// Do not leave the promise floating: report a failed run and exit non-zero.
downloadPages(exampleResultsUrl, 1, 50).catch((err) => {
  console.error("Download run failed:", err);
  process.exitCode = 1;
});