Last active
March 26, 2020 20:32
-
-
Save smokes/f951a219e85058a051bf11ef8e72780d to your computer and use it in GitHub Desktop.
Google Image scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const url = require("url");
const fs = require("fs");
/**
 * Scrapes a Google Images result page with a headless browser (puppeteer)
 * and extracts image URLs, source page URLs and descriptions from the
 * rendered HTML (cheerio).
 */
class GoogleImageScraper {
  /**
   * @param {Object} [options]
   * @param {number} [options.limit=10] Upper bound for the scroll loop; the
   *   loop performs at most `limit - 1` scrolls to the bottom of the page.
   * @param {number} [options.scrollDelay=500] Pause in milliseconds after
   *   each scroll.
   * @param {boolean} [options.exportResults=false] When true, `scrape`
   *   writes the results to `<sanitized keyword>.json` in the working directory.
   * @param {boolean} [options.verbose=false] When true, progress messages
   *   are printed with `console.log`.
   */
  constructor({
    limit = 10,
    scrollDelay = 500,
    exportResults = false,
    verbose = false
  } = {}) {
    this.limit = limit;
    this.scrollDelay = scrollDelay;
    this.exportResults = exportResults;
    this.verbose = verbose;
  }

  /**
   * Builds the Google Images search URL for a keyword.
   *
   * The keyword is percent-encoded so that reserved characters such as
   * `&`, `#` or `+` cannot corrupt the query string; spaces are sent as `+`.
   *
   * @param {string} keyword Search term.
   * @returns {string} Absolute search URL.
   */
  buildSearchUrl(keyword) {
    const encodedQuery = encodeURIComponent(keyword).replace(/%20/g, "+");
    return `https://www.google.com/search?q=${encodedQuery}&tbm=isch`;
  }

  /**
   * Runs a complete scrape for one keyword: launches a browser, loads the
   * search page, scrolls, dispatches right-click events on the thumbnails
   * and parses the resulting HTML.
   *
   * @param {string} keyword Search term.
   * @returns {Promise<Array<{imageurl: string, source: string, description: string}>>}
   * @throws {Error} "Missing keyword." when `keyword` is falsy.
   */
  async scrape(keyword) {
    if (!keyword) {
      throw new Error("Missing keyword.");
    }
    const googleQuery = this.buildSearchUrl(keyword);
    const browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"]
    });
    try {
      this.log(`Started grabbing ${keyword} images`);
      const page = await browser.newPage();
      // setViewport returns a promise; await it so navigation starts with
      // the final viewport and a failure is not left unhandled.
      await page.setViewport({ width: 1920, height: 1080 });
      await page.goto(googleQuery);
      await this.infiniteScroll(page);
      await this.clickAllImages(page);
      this.log("Right-clicked all images");
      const html = await page.content();
      const results = this.parseLinksFromHTML(html);
      if (this.exportResults) {
        const filename = keyword.replace(/[^a-z0-9]/gi, "_").toLowerCase();
        fs.writeFileSync(`${filename}.json`, JSON.stringify(results));
        this.log(`Exported ${results.length} results of '${keyword}'`);
      }
      return results;
    } finally {
      // Always release the browser process, even when a step above throws.
      await browser.close();
    }
  }

  /**
   * Dispatches a right-button `mousedown` event on every `#islrg img`
   * element of the page.
   *
   * NOTE(review): presumably this makes Google fill in the `/imgres` links
   * that `parseLinksFromHTML` reads afterwards — confirm against the page.
   *
   * @param {Object} page puppeteer page.
   * @returns {Promise<undefined>} Resolves once every event was dispatched.
   */
  async clickAllImages(page) {
    this.log("Started right-clicking images");
    return page.evaluate(() => {
      // dispatchEvent is synchronous, so a plain loop guarantees that all
      // events have fired before evaluate() resolves.
      for (const image of document.querySelectorAll("#islrg img")) {
        const { x, y } = image.getBoundingClientRect();
        image.dispatchEvent(
          new MouseEvent("mousedown", {
            bubbles: true,
            cancelable: false,
            view: window,
            button: 2,
            buttons: 2,
            clientX: x,
            clientY: y
          })
        );
      }
    });
  }

  /**
   * Tells whether the `#islmp input[type='button']` element is present and
   * takes up space on the page.
   *
   * @param {Object} page puppeteer page.
   * @returns {Promise<boolean>} False when the button is missing or hidden.
   */
  isButtonVisible(page) {
    return page.evaluate(() => {
      const button = document.querySelector("#islmp input[type='button']");
      if (!button) {
        return false;
      }
      return !!(
        button.offsetWidth ||
        button.offsetHeight ||
        button.getClientRects().length
      );
    });
  }

  /**
   * Reads the text of the last child `div` of
   * `#islmp div[data-endedmessage]`.
   *
   * @param {Object} page puppeteer page.
   * @returns {Promise<string>} The element's text, or "" when the element
   *   does not exist.
   */
  getInfiniteScrollStatus(page) {
    return page.evaluate(() => {
      const statusElement = document.querySelector(
        "#islmp div[data-endedmessage] > div:last-child"
      );
      return statusElement ? statusElement.innerText : "";
    });
  }

  /**
   * Extracts one result per `#islrg a[href^='/imgres']` anchor.
   *
   * @param {string} html Page markup.
   * @returns {Array<{imageurl: (string|undefined), source: (string|undefined), description: string}>}
   *   `imageurl` / `source` come from the `imgurl` / `imgrefurl` query
   *   parameters of the anchor's href and are undefined when absent.
   */
  parseLinksFromHTML(html) {
    const links = [];
    const $ = cheerio.load(html);
    $("#islrg a[href^='/imgres']").each((index, elem) => {
      const anchor = $(elem);
      const description = anchor
        .next()
        .find("div > div:first-child")
        .text();
      // The href is site-relative, so a base is required to parse it.
      const params = new URL(anchor.attr("href"), "https://www.google.com")
        .searchParams;
      const imageurl = params.has("imgurl") ? params.get("imgurl") : undefined;
      const source = params.has("imgrefurl")
        ? params.get("imgrefurl")
        : undefined;
      links.push({ imageurl, source, description });
    });
    return links;
  }

  /**
   * Repeatedly scrolls to the bottom of the page (clicking the
   * `#islmp input[type='button']` element when it is visible) until the
   * scroll limit is hit or the page reports the end of the results.
   *
   * Errors (for example a `waitForFunction` timeout when the page stops
   * growing) end the scrolling but are deliberately not rethrown, so the
   * caller can still parse whatever was loaded.
   *
   * @param {Object} page puppeteer page.
   * @returns {Promise<boolean>} True when the loop finished normally, false
   *   when an error interrupted it.
   */
  async infiniteScroll(page) {
    let scrollIndex = 1;
    try {
      while (scrollIndex < this.limit) {
        const buttonIsVisible = await this.isButtonVisible(page);
        const infiniteScrollStatus = await this.getInfiniteScrollStatus(page);
        if (infiniteScrollStatus === "Looks like you've reached the end") {
          this.log("Looks like I've reached the end of results");
          break;
        }
        scrollIndex += 1;
        const previousHeight = await page.evaluate(
          "document.body.scrollHeight"
        );
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)");
        this.log("Scrolled to bottom, total number of scrolls: " + scrollIndex);
        if (buttonIsVisible) {
          await page.click("#islmp input[type='button']");
          this.log("Clicked on show more results");
        }
        // Wait until new content has extended the document.
        await page.waitForFunction(
          `document.body.scrollHeight > ${previousHeight}`
        );
        // Plain timer instead of the deprecated page.waitFor(), so the
        // delay works regardless of the installed puppeteer version.
        await new Promise(resolve => setTimeout(resolve, this.scrollDelay));
      }
      return true;
    } catch (error) {
      this.log(error);
      return false;
    }
  }

  /**
   * Prints a message with `console.log` when verbose mode is enabled.
   *
   * @param {*} message Value to print.
   */
  log(message) {
    if (this.verbose) {
      console.log(message);
    }
  }
}
// Example run: scrape results for "cats" and, because exportResults is
// enabled, write them to cats.json in the working directory.
const scraper = new GoogleImageScraper({
  // maximum number of scrolls to the bottom of the page.
  // limit: 1 results in 100 images, so 10 is approximately 800 because of Google search limits
  limit: 10,
  scrollDelay: 500,
  verbose: true,
  exportResults: true
});
// scrape() returns a promise; report failures instead of leaving an
// unhandled rejection, and signal the failure through the exit code.
scraper.scrape("cats").catch(error => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment