|
import * as fs from 'node:fs'; |
|
import fetch from 'node-fetch'; |
|
import cheerio from 'cheerio'; |
|
import { setTimeout } from 'timers/promises'; |
|
|
|
/**
 * Download the resource at `url` and save it as `./<imageName>`.
 *
 * The returned promise settles only after the file has been completely
 * written, so callers can safely use the file (or pace further requests)
 * once it resolves.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} imageName - File name, relative to the current directory.
 * @returns {Promise<boolean>} `true` once the file is fully written, `false`
 *   when the server answered with a non-OK status (nothing is written then).
 * @throws Rejects if the request fails, or if reading the response body or
 *   writing the file errors. A partially written file may be left behind.
 */
export const downloadImage = async function (url, imageName) {
  const response = await fetch(url);

  if (!response.ok) {
    return false;
  }

  // pipe() only *starts* the copy; wait for the write stream to flush so
  // that "true" really means the image is on disk.
  await new Promise((resolve, reject) => {
    const file = fs.createWriteStream('./' + imageName);

    file.on('finish', resolve);
    file.on('error', reject);

    // pipe() does not forward source errors to the destination, so handle
    // them here and release the file handle.
    response.body.on('error', (err) => {
      file.destroy();
      reject(err);
    });

    response.body.pipe(file);
  });

  return true;
};
|
|
|
/**
 * Fetch a search-results page and collect the thumbnail of every listed
 * article whose common name does not contain "apple".
 *
 * Articles are the `article` elements under `#documents`; the common name is
 * read from `dd.blacklight-common_name` and the thumbnail from the first
 * `img` inside the article. Articles without a usable thumbnail URL are
 * skipped (and logged) instead of aborting the whole page.
 *
 * NOTE(review): the "apple" test is a case-sensitive substring match, so it
 * also skips names such as "pineapple" and does not skip "Apple" — confirm
 * that this is intended.
 *
 * @param {string} url - Address of the results page.
 * @returns {Promise<Array<{source: string, fruit: string}>|false>} The
 *   collected records, or `false` when the page could not be fetched.
 */
export const extractThumbnailUrls = async function (url) {
  const response = await fetch(url);

  if (!response.ok) {
    console.log("Err couldn't fetch page!", response.status);
    return false;
  }

  const body = await response.text();
  const $ = cheerio.load(body);
  const fruits = [];

  for (const article of $('#documents article')) {
    // Scope queries to this article within the already-parsed document
    // rather than re-loading the node into a second cheerio instance.
    const $article = $(article);
    const fruit = $article.find('dd.blacklight-common_name').text().trim();
    // attr() reads from the first matched <img>; undefined when there is
    // no image or it has no src attribute.
    const source = $article.find('img').attr('src');

    if (source === undefined) {
      console.log(`Skipping article without a thumbnail: ${fruit}`);
      continue;
    }

    if (fruit.includes('apple')) {
      console.log('Skipping ' + source);
    } else {
      fruits.push({ source, fruit });
    }
  }

  return fruits;
};
|
|
|
/**
 * Persist the list of saved records as JSON in `saved-fruits.json` in the
 * current directory.
 *
 * The write is asynchronous and not awaited by the caller; a failure is
 * reported on stderr rather than silently discarded.
 *
 * @param {Array<object>} saved - Records to serialize.
 * @returns {void}
 */
const writeSaved = function (saved) {
  fs.writeFile('saved-fruits.json', JSON.stringify(saved), {}, (err) => {
    if (err) {
      console.error('Failed to write saved-fruits.json:', err);
    }
  });
};
|
|
|
/**
 * Download every thumbnail listed on one results page.
 *
 * Downloads run one at a time with a 1.5 s pause after each attempt.
 *
 * @param {string} resultsUrl - Address of the results page to process.
 * @returns {Promise<Array<{source: string, fruit: string, filename: string}>>}
 *   One record per image that was downloaded successfully; empty when the
 *   page could not be fetched or listed nothing.
 */
const downloadResults = async function (resultsUrl) {
  const fruits = await extractThumbnailUrls(resultsUrl);
  const saved = [];

  // extractThumbnailUrls returns `false` (not an array) when the page fetch
  // fails; iterating that would throw, so treat it as "nothing to download".
  if (!fruits) {
    console.log(`No results to download for ${resultsUrl}`);
    return saved;
  }

  for (const fruit of fruits) {
    console.log("Downloading " + fruit.source);

    // Turn the url into a reasonable filename:
    // https://naldc.nal.usda.gov/download/POM00002370/thumbnail -> POM00002370-thumbnail.jpg
    const noSlashes = fruit.source.replaceAll('/', '-');
    const filename = noSlashes.replace('https:--naldc.nal.usda.gov-download-', '') + ".jpg";

    // Download, then pause before the next request.
    const result = await downloadImage(fruit.source, filename);
    await setTimeout(1500);

    if (!result) {
      console.log("Failed to download " + fruit.source);
      continue;
    }

    // Record a copy so the extracted entry itself is left untouched.
    saved.push({ ...fruit, filename });
  }

  return saved;
};
|
|
|
/**
 * Download the thumbnails of `pages` consecutive result pages, starting at
 * `pageStart`, then hand the combined list of saved records to `writeSaved`.
 *
 * Pages are processed sequentially. Each page URL is built by appending
 * `&page=<n>` to `baseResultsUrl`, so the base URL must already contain a
 * query string.
 *
 * @param {string} baseResultsUrl - Results URL without the page parameter.
 * @param {number} pageStart - Number of the first page to download.
 * @param {number} pages - How many pages to download.
 * @returns {Promise<void>}
 */
const downloadPages = async function (baseResultsUrl, pageStart, pages) {
  const saved = [];
  const pageEnd = pageStart + pages;

  // Step one page at a time so that every page in the range is visited.
  for (let page = pageStart; page < pageEnd; page += 1) {
    console.log("---- Downloading results for page " + page);
    const result = await downloadResults(baseResultsUrl + "&page=" + page);
    saved.push(...result);
  }

  writeSaved(saved);
};
|
|
|
// Search-results listing to scrape; the query asks for 50 results per page.
const exampleResultsUrl = "https://naldc.nal.usda.gov/usda_pomological_watercolor?per_page=50&q=&search_field=all_fields";

// Start the run at page 1 for 50 pages. The promise is not awaited, so
// attach a handler: report the failure and make the process exit non-zero.
downloadPages(exampleResultsUrl, 1, 50).catch((err) => {
  console.error("Download run failed:", err);
  process.exitCode = 1;
});