Skip to content

Instantly share code, notes, and snippets.

@elog08
Created April 17, 2018 02:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save elog08/b3e0a96cf0628bbbf9d8acdc74662ce7 to your computer and use it in GitHub Desktop.
Save elog08/b3e0a96cf0628bbbf9d8acdc74662ce7 to your computer and use it in GitHub Desktop.
ScrapeInfiniteList.js
module.exports = function() {
return new Promise((resolve, reject) => {
// Class for Individual Thread
const C_THREAD = '.pagedlist_item:not(.pagedlist_hidden)';
// Class for threads marked for deletion on subsequent loop
const C_THREAD_TO_REMOVE = '.pagedlist_item:not(.pagedlist_hidden) .TO_REMOVE';
// Class for Title
const C_THREAD_TITLE = '.title';
// Class for Description
const C_THREAD_DESCRIPTION = '.search_result_snippet .search_result_snippet .rendered_qtext ';
// Class for ID
const C_THREAD_ID = '.question_link';
// DOM attribute for link
const A_THREAD_URL = 'href';
// DOM attribute for ID
const A_THREAD_ID = 'id';
const _log = console.info,
_warn = console.warn,
_error = console.error,
_time = console.time,
_timeEnd = console.timeEnd;
_time("Scrape");
let page = 1;
// Global Set to store all entries
let threads = new Set(); // Eliminates dupes
// Pause between pagination
const PAUSE = 4000;
// Accepts a parent DOM element and extracts the title and URL
function scrapeSingleThread(elThread) {
try {
const elTitle = elThread.querySelector(C_THREAD_TITLE),
elLink = elThread.querySelector(C_THREAD_ID),
elDescription = elThread.querySelector(C_THREAD_DESCRIPTION);
if (elTitle) {
const title = elTitle.innerText.trim(),
description = elDescription.innerText.trim(),
id = elLink.getAttribute(A_THREAD_ID),
url = elLink.getAttribute(A_THREAD_URL);
threads.add({
title,
description,
url,
id
});
}
} catch (e) {
_error("Error capturing individual thread", e);
}
}
// Get all threads in the visible context
function scrapeThreads() {
_log("Scraping page %d", page);
const visibleThreads = document.querySelectorAll(C_THREAD);
if (visibleThreads.length > 0) {
_log("Scraping page %d... found %d threads", page, visibleThreads.length);
Array.from(visibleThreads).forEach(scrapeSingleThread);
} else {
_warn("Scraping page %d... found no threads", page);
}
// Return master list of threads;
return visibleThreads.length;
}
// Clears the list between pagination to preserve memory
// Otherwise, browser starts to lag after about 1000 threads
function clearList() {
_log("Clearing list page %d", page);
const toRemove = `${C_THREAD_TO_REMOVE}_${(page-1)}`,
toMark = `${C_THREAD_TO_REMOVE}_${(page)}`;
try {
// Remove threads previously marked for removal
document.querySelectorAll(toRemove)
.forEach(e => e.parentNode.removeChild(e));
// // Mark visible threads for removal on next iteration
document.querySelectorAll(C_THREAD)
.forEach(e => e.className = toMark.replace(/\./g, ''));
} catch (e) {
_error("Unable to remove elements", e.message)
}
}
// Scrolls to the bottom of the viewport
function loadMore() {
_log("Load more... page %d", page);
window.scrollTo(0, document.body.scrollHeight);
}
// Recursive loop that ends when there are no more threads
function loop() {
_log("Looping... %d entries added", threads.size);
if (scrapeThreads()) {
try {
clearList();
loadMore();
page++;
setTimeout(loop, PAUSE)
} catch (e) {
reject(e);
}
} else {
_timeEnd("Scrape");
resolve(Array.from(threads));
}
}
loop();
});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment