Last active
November 14, 2023 22:56
-
-
Save crazy4groovy/02c34adba3c25c54bf8bbccf9d4c431f to your computer and use it in GitHub Desktop.
scrape midjourney "recent showcase" images (into folder, per hour) (JavaScript, Deno)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Timeout, TimeoutError } from "https://deno.land/x/timeout/mod.ts" | |
/**
 * Returns a promise that settles (with `undefined`) once `ms` milliseconds
 * have elapsed. Used to pause between polling attempts and retries.
 */
const delay = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/**
 * Builds a throttled runner on top of caller-supplied concurrency hooks.
 *
 * Hooks:
 * - `isBusy()`  truthy while no slot is free
 * - `lock()`    called right before the callback starts
 * - `unlock()`  called once the callback has settled
 * - `waitMs()`  how long to sleep before polling `isBusy()` again
 * - `size(n)`   adjusts the pending-call counter by `n`; `size()` reads it
 *
 * The returned function `throttler(cb, ...args)` waits for a free slot, runs
 * `cb(...args)` and resolves (or rejects) with its outcome. It also exposes
 * `size()` (read-only pending count) and `isBusy()`.
 */
function newThrottler({ isBusy, lock, unlock, waitMs, size }) {
  async function throttler(cb, ...args) {
    size(1); // counted as pending from the moment the call is made
    try {
      // Yield once so a burst of synchronous calls is registered before any starts.
      await Promise.resolve();
      while (isBusy()) {
        await delay(waitMs()); // sleep, then poll for a free slot again
      }
      lock();
      try {
        return await cb.call(this, ...args);
      } finally {
        // Release the slot even when cb throws; otherwise it is lost forever
        // and every later call spins in the wait loop.
        unlock();
      }
    } finally {
      size(-1);
    }
  }
  throttler.size = () => size(); // read-only
  throttler.isBusy = isBusy;
  return throttler;
}
/**
 * Creates a throttler that lets at most `threads` callbacks run at once.
 * Each call gets its own private counters: `active` (slots in use) and
 * `pending` (calls requested but not yet finished).
 */
const throttler = (threads: number) => {
  let pending = 0;
  let active = 0;

  return newThrottler({
    // Busy once every slot is taken.
    isBusy: () => active >= threads,
    lock: () => ++active,
    unlock: () => --active,
    // Poll again after 1-2 seconds; the random part spreads out waiters.
    waitMs: () => 1000 + (1000 * Math.random()),
    // With a non-zero argument: adjust and return; otherwise just read.
    size: (n) => {
      if (n) {
        pending += n;
      }
      return pending;
    },
  });
};
/**
 * Creates a download function that runs at most `threads` downloads at once.
 *
 * The returned `dl(source, destination, headers?)` forwards its arguments to
 * `downloadFile` through a throttler and returns the throttler's promise, so
 * callers can await the outcome (previously the promise was dropped). The
 * throttler's own properties (`size`, `isBusy`) are copied onto `dl`.
 *
 * @example
 * const dl = createDownloadThrottled(5)
 * await dl('https://a.com/1.jpg', '~/imgs/1.jpg', myHeaders)
 * console.log(dl.size());
 * console.log(dl.isBusy());
 */
export const createDownloadThrottled = (threads: number) => {
  const thr = throttler(threads);
  function dl(...args: Parameters<typeof downloadFile>) {
    return thr(downloadFile, ...args);
  }
  // Expose the throttler's introspection helpers on the wrapper.
  return Object.assign(dl, thr);
};
/**
 * Downloads `source` and writes the response body to `destination`.
 *
 * @param source      URL to fetch.
 * @param destination Path of the file to create or overwrite.
 * @param headers     Request headers. A `timeoutMs` property, if present, sets
 *                    the fetch timeout (default 9000 ms); it is not sent as an
 *                    HTTP header and the caller's object is left unmodified.
 * @param maxRetries  How many times to retry after a non-timeout failure
 *                    (default 5). Bounds what used to be unlimited recursion.
 * @returns `true` when the file was written; `false` on timeout, on a non-2xx
 *          response, or once the retries are used up. Never rejects for
 *          download errors.
 */
export async function downloadFile(
  source: string,
  destination: string,
  headers: any = {},
  maxRetries: number = 5,
): Promise<boolean> {
  const timeoutMs = headers?.timeoutMs || 9000;

  // Strip the pseudo-header from a plain-object header bag without mutating it.
  let requestHeaders = headers ?? undefined;
  if (
    headers &&
    typeof headers === "object" &&
    !(headers instanceof Headers) &&
    !Array.isArray(headers) &&
    "timeoutMs" in headers
  ) {
    const { timeoutMs: _unused, ...rest } = headers;
    requestHeaders = rest;
  }

  try {
    const req = fetch(source, { headers: requestHeaders });
    const response = await Timeout.race([req], timeoutMs);
    if (!response.ok) {
      // Do not save an error page under an image filename.
      console.error(`ERROR while dl'ing ${source}:`, `HTTP ${response.status}`);
      await response.body?.cancel();
      return false;
    }
    const data = new Uint8Array(await response.arrayBuffer());
    // writeFile opens, writes and closes in one call, so no handle can leak.
    await Deno.writeFile(destination, data);
    return true;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`ERROR while dl'ing ${source}:`, message);
    if (err instanceof TimeoutError) {
      console.error(`ERROR Timed out; skipping: ${source}`);
      return false;
    }
    if (maxRetries <= 0) {
      console.error(`ERROR Giving up after repeated failures: ${source}`);
      return false;
    }
    console.error("---RETRYING...");
    await delay(1000);
    return downloadFile(source, destination, headers, maxRetries - 1);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://chriszarate.github.io/bookmarkleter/ | |
async function saveImageToDisk(directoryHandle, imageUrl, filename) { | |
const fileHandle = await directoryHandle.getFileHandle(filename, { create: true }) | |
const outStream = await fileHandle.createWritable() | |
const response = await fetch(imageUrl) | |
const blob = await response.blob() | |
await outStream.write(blob) | |
await outStream.close() | |
} | |
async function saveImagesToDisk(directoryHandle, imageUrls, filenames, root) { | |
try { | |
const subdirectoryHandle = await directoryHandle.getDirectoryHandle(root, { create: false }) | |
const prs = imageUrls.map( | |
async (imageUrl, i) => saveImageToDisk(subdirectoryHandle, imageUrl, filenames[i], root)) | |
await Promise.all(prs) | |
console.log(imageUrls.length + ' Images saved to disk successfully!') | |
} catch (error) { | |
console.error('Error saving images:', error) | |
} | |
} | |
async function crudJobs(directoryHandle, date, newJobs) { | |
let create = true | |
for await (const e of directoryHandle.entries()) { | |
create && (e[0] === date) && (create = false) | |
if (!create) break; | |
} | |
const subdirectoryHandle = await directoryHandle.getDirectoryHandle(date, { create }) | |
const fileHandle = await subdirectoryHandle.getFileHandle(`___jobs.yaml`, { create }) | |
if (newJobs) { | |
const outStream = await fileHandle.createWritable() | |
await outStream.write(newJobs) | |
await outStream.close() | |
return | |
} | |
return fileHandle | |
.getFile() | |
.then(f => f.text()) | |
.then(txt => txt.split('\n').filter(Boolean)) | |
} | |
let directoryHandle | |
async function downloadImages(rows, imageUrls, filenames) { | |
const today = new Date() | |
const yesterday = new Date() | |
yesterday.setDate(today.getDate() - 1) | |
console.log('TODAY:', today) | |
try { | |
directoryHandle = directoryHandle || await window.showDirectoryPicker() | |
const jobs = [ | |
await crudJobs(directoryHandle, today.toJSON().split('T')[0]), | |
await crudJobs(directoryHandle, yesterday.toJSON().split('T')[0]) | |
] | |
for (let i = 0; i < rows.length; i++) { | |
if (jobs.flat().includes(rows[i])) { | |
rows.splice(i, 1) | |
imageUrls.splice(i, 1) | |
filenames.splice(i, 1) | |
i--; | |
} | |
} | |
await saveImagesToDisk(directoryHandle, imageUrls, filenames, today.toJSON().split('T')[0]) | |
const newJobs = [...jobs[0], ...rows].filter(Boolean).join('\n') | |
await crudJobs(directoryHandle, today.toJSON().split('T')[0], newJobs) | |
} catch (error) { | |
console.error('Error requesting file system permission:' + error.message) | |
} | |
} | |
// Extracts the Next.js build id embedded in the showcase page HTML.
const BUILD_REGEX = /"buildId":"([^"]+)"/
// https://www.midjourney.com/_next/data/${buildId}/showcase/recent.json

/**
 * One scrape pass, re-run every hour:
 *  1. make sure the tab is on the showcase page (redirect and stop if not),
 *  2. read the current build id from the page HTML,
 *  3. fetch the "recent" JSON feed and collect jobs that have a seed image,
 *  4. hand rows / image URLs / filenames to `downloadImages`.
 *
 * Any failure is logged and the next pass is still scheduled, so a single
 * bad response no longer ends the hourly loop.
 */
async function main() {
  const showcaseUrl = 'https://legacy.midjourney.com/showcase/recent/'
  if (window.location.href !== showcaseUrl) {
    window.location.href = showcaseUrl
    // Navigation is under way; nothing below can usefully run on this page.
    return
  }

  try {
    const page = await fetch('.') // get latest
    const html = await page.text()

    const match = BUILD_REGEX.exec(html)
    if (!match) {
      // exec() returns null on no match; indexing it used to throw instead.
      console.log('Not Found: buildId')
      return
    }
    const buildId = match[1]

    const feed = await fetch(`https://legacy.midjourney.com/_next/data/${buildId}/showcase/recent.json`)
    const recent = await feed.json()

    const rows = []
    const imageUrls = []
    const filenames = []
    for (const job of recent.pageProps.jobs) {
      const event = job.event
      if (!event?.seedImageURL) continue
      const row = [job.username, job.reference_job_id, event]
      rows.push('- ' + JSON.stringify(row))
      imageUrls.push(event.seedImageURL)
      filenames.push(`${job.username.replace(/[^\w]/g, '-')}__${job.reference_job_id}.png`)
    }

    // Example usage
    // const imageUrls = [
    //   'https://cdn.midjourney.com/34c81180-ae94-4f78-b293-0f65af104168/0_2.png'
    // ]
    await downloadImages(rows, imageUrls, filenames)
  } catch (error) {
    console.error('Error scraping showcase:', error)
  } finally {
    // Runs after success, failure, and the early "Not Found" return alike.
    setTimeout(main, 60 * 60 * 1000)
  }
}
main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { ensureDirSync, existsSync } from 'https://deno.land/std/fs/mod.ts'; | |
import { createDownloadThrottled } from "./dl.deno.ts"; | |
// Polls the showcase feed forever and downloads each image not seen before.
const dl = createDownloadThrottled(4);
const waitMin = 9;
const rootFolder = '___raw-images';
// Filenames already on disk or already queued during this run.
const imgSet = new Set<string>();
// Sleeps waitMin minutes plus up to one extra random minute.
const wait = () => new Promise(r => setTimeout(r, 1000 * 60 * waitMin + (1000 * 60 * Math.random())))

type Job = [username: string, id: string, event: { seedImageURL: string; [key: string]: unknown }];

// How many day-folders (today included) are checked for existing files.
const LOOKBACK_DAYS = 20;
const BUILD_ID_REGEX = /"buildId":"([^"]+)"/;

/** UTC day (YYYY-MM-DD) `daysAgo` days before now; matches the folder names. */
function dayKey(daysAgo = 0): string {
  const d = new Date();
  d.setUTCDate(d.getUTCDate() - daysAgo);
  return d.toJSON().split('T')[0];
}

/** File name under which a job's image is stored. */
function toFilename(username: string, id: string): string {
  return `${username.replace(/[^\w]/g, '-')}__${id}.png`;
}

/**
 * Fetches the showcase page, extracts the build id, then fetches the recent
 * jobs feed. Returns only jobs that carry a seed image URL. Throws on network
 * errors, invalid JSON, or a missing build id.
 */
async function fetchRecentJobs(): Promise<Job[]> {
  const page = await fetch('https://legacy.midjourney.com/showcase/recent/');
  const html = await page.text();
  const buildId = BUILD_ID_REGEX.exec(html)?.[1];
  if (!buildId) throw new Error('buildId not found in showcase page');

  const feed = await fetch(`https://legacy.midjourney.com/_next/data/${buildId}/showcase/recent.json`);
  const data = await feed.json();
  const jobs: any[] = data?.pageProps?.jobs ?? [];
  return jobs
    .filter((j) => j?.event?.seedImageURL)
    .map((j): Job => [j.username, j.reference_job_id, j.event]);
}

while (true) {
  console.log('TRY', new Date().toJSON());

  let jobs: Job[] = [];
  try {
    jobs = await fetchRecentJobs();
  } catch (err) {
    // A bad poll must not end the loop; log it and try again after the wait.
    console.error('ERROR fetching showcase:', err instanceof Error ? err.message : err);
  }

  if (jobs.length) {
    // Mark images that already exist (as .png or .jpg) in any recent day folder.
    for (let daysAgo = 0; daysAgo < LOOKBACK_DAYS; daysAgo++) {
      const folder = `${rootFolder}/${dayKey(daysAgo)}`;
      for (const [username, id] of jobs) {
        const filename = toFilename(username, id);
        if (imgSet.has(filename)) continue; // already known; skip the disk checks
        if (
          existsSync(`${folder}/${filename}`) ||
          existsSync(`${folder}/${filename.replace(/\.png/, '.jpg')}`)
        ) {
          imgSet.add(filename);
        }
      }
    }

    const now = dayKey();
    ensureDirSync(`${rootFolder}/${now}`);
    for (const [username, id, event] of jobs) {
      const filename = toFilename(username, id);
      if (imgSet.has(filename)) continue;
      imgSet.add(filename);
      const file = `${rootFolder}/${now}/${filename}`;
      // Not awaited on purpose: the throttler queues downloads in the background.
      dl(event.seedImageURL, file);
      Deno.writeTextFileSync(
        `${rootFolder}/${now}/___jobs.yaml`,
        '\n- ' + JSON.stringify([username, id, event]),
        { create: true, append: true }
      );
      console.log('DL:', file);
    }
  }

  await wait();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment