Skip to content

Instantly share code, notes, and snippets.

@albertodeago
Created December 29, 2021 07:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save albertodeago/cd200a937ac04143f082d53d0639f4b0 to your computer and use it in GitHub Desktop.
Save albertodeago/cd200a937ac04143f082d53d0639f4b0 to your computer and use it in GitHub Desktop.
Download memes from imgflip (scraping with puppeteer)
const puppeteer = require("puppeteer");
const fs = require("fs");
// First page of imgflip's meme template listing; the scrape starts here.
const pageUrl = "https://imgflip.com/memetemplates";

// Viewport dimensions handed to page.setViewport().
const viewport = { width: 1600, height: 1200 };

// Container of the template boxes; the three selectors below are scoped to it.
const MEME_LIST_SELECTOR = ".mt-boxes";
const MEME_BOX_SELECTOR = [MEME_LIST_SELECTOR, ".mt-box"].join(" ");
const MEME_TITLE_SELECTOR = [MEME_LIST_SELECTOR, ".mt-title"].join(" ");
const MEME_SRC_SELECTOR = [MEME_LIST_SELECTOR, ".mt-img-wrap img"].join(" ");

// Pager "next" control, and the same control when it carries the disabled class
// (run() treats a match of the disabled variant as "this is the last page").
const NEXT_PAGE_SELECTOR = ".pager .pager-next";
const NEXT_PAGE_SELECTOR_DISABLED = [NEXT_PAGE_SELECTOR, "pager-disabled"].join(".");
/**
 * Pause for a fixed amount of time.
 *
 * Fixed delays are normally something to avoid, but this one is deliberate:
 * it throttles the scraper so that imgflip is not bombarded with requests.
 * @param {number} time - delay in milliseconds
 * @returns {Promise<void>} a promise that resolves, with no value, after the delay
 */
const wait = function (time) {
  return new Promise((done) => {
    setTimeout(done, time);
  });
};
/**
 * Given a page, returns all the memes in the page as title, image id and image url.
 *
 * The title is lower-cased and every character other than a-z / 0-9 is replaced
 * by "_". The image id is the last path segment of the thumbnail's `src`, and
 * the image url is that id appended to "https://i.imgflip.com/".
 *
 * Boxes without a title element, without an image element, or whose `src` has
 * no final path segment are skipped instead of throwing, so a single odd box
 * does not abort the whole page.
 * @param {import("puppeteer").Page} page - page whose DOM is inspected
 * @returns {Promise<Array<{title: string, imageId: string, imageUrl: string}>>}
 */
const getMemeList = async (page) => {
  return page.evaluate(
    // This callback is passed to page.evaluate(), so it only relies on its own
    // arguments and on `document`; the selectors are handed in as parameters.
    (boxSelector, titleSelector, srcSelector) => {
      const memes = [];
      for (const $el of document.querySelectorAll(boxSelector)) {
        const $title = $el.querySelector(titleSelector);
        const $img = $el.querySelector(srcSelector);
        const src = $img ? $img.getAttribute("src") : null;
        if (!$title || !src) {
          continue;
        }
        // Only the last path segment identifies the image; taking it directly
        // also handles a src such as "4/abc.png" correctly.
        const imageId = src.split("/").pop();
        if (!imageId) {
          continue;
        }
        memes.push({
          title: $title.innerText.replace(/[^a-z0-9]/gi, "_").toLowerCase(),
          imageId,
          imageUrl: `https://i.imgflip.com/${imageId}`,
        });
      }
      return memes;
    },
    MEME_BOX_SELECTOR,
    MEME_TITLE_SELECTOR,
    MEME_SRC_SELECTOR
  );
};
/**
 * Given a puppeteer response, read its body and save it to disk as
 * `./memes/<title>.jpg` (relative to the current working directory).
 *
 * The `./memes` directory is created when it does not exist yet, so the first
 * download on a fresh checkout does not fail with ENOENT.
 * @param {HTTPResponse} source - response whose `buffer()` yields the image bytes
 * @param {string} title - file name, without extension, to save the image under
 * @returns {Promise<void>} resolves once the file has been written
 */
const downloadImage = async function (source, title) {
  const targetDir = "./memes";
  const buffer = await source.buffer();
  // `recursive: true` makes this a no-op when the directory already exists.
  await fs.promises.mkdir(targetDir, { recursive: true });
  await fs.promises.writeFile(`${targetDir}/${title}.jpg`, buffer);
};
/**
 * Download all the memes in the provided page.
 *
 * For every meme returned by getMemeList() the page is navigated to the image
 * url, the response is saved through downloadImage() unless it looks like an
 * error page, and the page then goes back so the next image can be visited.
 * @param {import("puppeteer").Page} page - page currently showing a meme listing
 * @param {number} [delayMs=1000] - pause after each image navigation, in
 *   milliseconds, to avoid sending requests too quickly
 * @returns {Promise<void>}
 */
const downloadMemeInPage = async function (page, delayMs = 1000) {
  const pageMemeList = await getMemeList(page);
  for (const { imageUrl, title } of pageMemeList) {
    const viewSource = await page.goto(imageUrl);
    await wait(delayMs);
    // page.goto() can resolve to null; in that case there is no response to
    // inspect or save, so the download is skipped for this meme.
    const contentType = viewSource
      ? viewSource.headers()["content-type"]
      : undefined;
    // An xml content type means an error page was served instead of an image.
    const isErrorPage = Boolean(contentType) && contentType.includes("xml");
    if (viewSource && !isErrorPage) {
      await downloadImage(viewSource, title);
    }
    // go back to go to the next image
    await page.goBack();
  }
};
/**
 * Entry point of the scraper: opens a (non-headless) browser, loads the meme
 * template listing and downloads the memes of every listing page, following
 * the pager until its "next" control is disabled.
 *
 * The browser is always closed, even when a step fails; the failure is then
 * re-thrown to the caller.
 * @returns {Promise<void>}
 */
async function run() {
  const browser = await puppeteer.launch({
    headless: false,
  });
  try {
    const page = await browser.newPage();
    await page.setViewport(viewport);
    await page.goto(pageUrl);
    let lastPage = false;
    while (!lastPage) {
      await downloadMemeInPage(page);
      lastPage = await page.evaluate(
        (sel) => document.querySelector(sel) !== null,
        NEXT_PAGE_SELECTOR_DISABLED
      );
      if (!lastPage) {
        // Start waiting for the navigation before clicking, so the next
        // iteration only runs once the following listing page has loaded.
        await Promise.all([
          page.waitForNavigation(),
          page.click(NEXT_PAGE_SELECTOR),
        ]);
      }
    }
  } finally {
    await browser.close();
  }
}
// Start the scrape. A failure is reported and reflected in the exit code
// rather than being left as an unhandled promise rejection.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment