Created
December 29, 2021 07:51
-
-
Save albertodeago/cd200a937ac04143f082d53d0639f4b0 to your computer and use it in GitHub Desktop.
Download memes from imgflip (scraping with puppeteer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs");
const path = require("path");
const puppeteer = require("puppeteer");
/** Meme template listing on imgflip; the scraper opens this page first. */
const pageUrl = "https://imgflip.com/memetemplates";

/** Viewport size applied to the puppeteer page before navigating. */
const viewport = {
  width: 1600,
  height: 1200,
};

// CSS selectors for the meme boxes on the listing page. The box, title and
// image selectors are all scoped under the list container selector.
const MEME_LIST_SELECTOR = ".mt-boxes";
const MEME_BOX_SELECTOR = `${MEME_LIST_SELECTOR} .mt-box`;
const MEME_TITLE_SELECTOR = `${MEME_LIST_SELECTOR} .mt-title`;
const MEME_SRC_SELECTOR = `${MEME_LIST_SELECTOR} .mt-img-wrap img`;

// Pager selectors. run() treats a match for the disabled variant as
// "this is the last page" and otherwise clicks the plain "next" control.
const NEXT_PAGE_SELECTOR_DISABLED = ".pager .pager-next.pager-disabled";
const NEXT_PAGE_SELECTOR = ".pager .pager-next";
/**
 * Helper that resolves after a fixed delay.
 *
 * Fixed sleeps are usually best avoided, but this one is deliberate: it
 * throttles the scraper so that imgflip is not bombarded with requests.
 *
 * @param {number} time - Delay in milliseconds (passed straight to setTimeout).
 * @returns {Promise<void>} Resolves with no value once the delay has elapsed.
 */
const wait = (time) => new Promise((resolve) => setTimeout(resolve, time));
/**
 * Given a page, returns all the memes listed on it as title, id and image url.
 *
 * The callback handed to `page.evaluate` is executed in the browser context,
 * so it cannot see module-level names; the selectors are passed in as
 * arguments instead.
 *
 * Boxes that have no title element or no image `src` are skipped rather than
 * aborting the whole extraction with a null dereference.
 *
 * @param {Page} page - Puppeteer page currently showing the meme list.
 * @returns {Promise<Array<{title: string, imageId: string, imageUrl: string}>>}
 */
const getMemeList = async (page) => {
  return page.evaluate(
    (boxSelector, titleSelector, srcSelector) => {
      const memes = [];
      for (const $box of document.querySelectorAll(boxSelector)) {
        const $title = $box.querySelector(titleSelector);
        const $img = $box.querySelector(srcSelector);
        const src = $img ? $img.getAttribute("src") : null;
        if (!$title || !src) {
          continue;
        }

        // Anything that is not a letter or digit becomes "_" so the title can
        // be used as a file name.
        const title = $title.innerText
          .replace(/[^a-z0-9]/gi, "_")
          .toLowerCase();
        // The id is the last path segment of the image src.
        const imageId = src.split("/").pop();

        memes.push({
          title,
          imageId,
          imageUrl: `https://i.imgflip.com/${imageId}`,
        });
      }
      return memes;
    },
    MEME_BOX_SELECTOR,
    MEME_TITLE_SELECTOR,
    MEME_SRC_SELECTOR
  );
};
/**
 * Given a puppeteer response, saves its body to disk as
 * `<outputDir>/<title>.jpg`.
 *
 * The output directory is created first if it does not exist yet, so a fresh
 * checkout does not fail with ENOENT on the first image. The `.jpg` extension
 * is always used, whatever the response's actual content type is.
 *
 * @param {HTTPResponse} source - Response whose body is the image data.
 * @param {string} title - File name to use, without extension.
 * @param {string} [outputDir="./memes"] - Directory the file is written to.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
const downloadImage = async function (source, title, outputDir = "./memes") {
  const buffer = await source.buffer();
  await fs.promises.mkdir(outputDir, { recursive: true });
  await fs.promises.writeFile(path.join(outputDir, `${title}.jpg`), buffer);
};
/**
 * Download all the memes listed on the provided page.
 *
 * For each meme the page navigates to the image url, saves the response if it
 * looks like an image, then navigates back to the list before moving on.
 * A response is skipped when navigation yielded no response object, when its
 * status is not successful, or when its content type mentions "xml" (an xml
 * response means an error page).
 *
 * @param {Page} page - Puppeteer page currently showing a meme list.
 * @returns {Promise<void>}
 */
const downloadMemeInPage = async function (page) {
  const pageMemeList = await getMemeList(page);
  for (const memeObj of pageMemeList) {
    const viewSource = await page.goto(memeObj.imageUrl);
    // Deliberate pause between requests to avoid hammering the server.
    await wait(1000);

    if (viewSource && viewSource.ok()) {
      const responseContentType = viewSource.headers()["content-type"];
      // A missing content type is still treated as an image.
      const isErrorPage =
        Boolean(responseContentType) && responseContentType.includes("xml");
      if (!isErrorPage) {
        await downloadImage(viewSource, memeObj.title);
      }
    }

    // go back to the list so the next image can be visited
    await page.goBack();
  }
};
/**
 * Launches a (non-headless) browser, opens the meme list and downloads the
 * memes of every page, following the pager until its "next" control is
 * disabled.
 *
 * The browser is always closed, even when scraping fails part-way through;
 * the original error is then propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const browser = await puppeteer.launch({
    headless: false,
  });
  try {
    const page = await browser.newPage();
    await page.setViewport(viewport);
    await page.goto(pageUrl);

    let lastPage = false;
    while (!lastPage) {
      await downloadMemeInPage(page);
      lastPage = await page.evaluate((sel) => {
        return document.querySelector(sel) !== null;
      }, NEXT_PAGE_SELECTOR_DISABLED);
      if (!lastPage) {
        // Start waiting for the navigation before clicking so the next list
        // page is loaded before it gets scraped.
        // NOTE(review): assumes the pager control triggers a full page
        // navigation - confirm against the live site.
        await Promise.all([
          page.waitForNavigation(),
          page.click(NEXT_PAGE_SELECTOR),
        ]);
      }
    }
  } finally {
    await browser.close();
  }
}
// Script entry point. Handle a rejected run() explicitly instead of leaving a
// floating promise: log the error and mark the process as failed.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment