@link89 · Created January 2, 2024
A Tampermonkey script to crawl Reddit comments.
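To use it: install the script in Tampermonkey, open a subreddit listing page (e.g. https://www.reddit.com/r/somesub/), and pick "Run Reddit Crawl" from the Tampermonkey menu. The script collects the post links on the page, then navigates through them one by one, scraping comments into localStorage as it goes. When it finishes, use "Download Result" to save the data as a JSONL file, and "Clean Data" to reset the stored state.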
// ==UserScript==
// @name         Reddit Crawler
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Crawl posts and comments from a subreddit and export them as JSONL
// @author       Bing
// @match        https://www.reddit.com/*
// @grant        GM_registerMenuCommand
// ==/UserScript==
(async function () {
    'use strict';

    // Download the collected result from localStorage as a .jsonl file
    function downloadResult() {
        const result = localStorage.getItem('result');
        if (result) {
            // Wrap the text in a blob and trigger a download via a temporary link
            const blob = new Blob([result], { type: 'text/jsonl' });
            const url = URL.createObjectURL(blob);
            const link = document.createElement('a');
            link.href = url;
            link.download = 'reddit-crawler.jsonl';
            document.body.appendChild(link);
            link.click();
            document.body.removeChild(link);
            URL.revokeObjectURL(url);
        } else {
            alert('No result to download');
        }
    }
    // Remove all crawler state from localStorage
    function cleanData() {
        localStorage.removeItem('postUrls');
        localStorage.removeItem('result');
        localStorage.removeItem('running');
        alert('Data cleaned');
    }
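    // Resolve once the DOM is ready. The readyState check handles the case where
    // the userscript is injected after DOMContentLoaded has already fired.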
    function waitForPageLoad() {
        return new Promise(resolve => {
            if (document.readyState === 'complete' || document.readyState === 'interactive') {
                resolve(); // The document has already finished loading
            } else {
                document.addEventListener('DOMContentLoaded', resolve);
            }
        });
    }

    function getUrlPath(url) {
        return new URL(url).pathname;
    }

    function getPostUrls() {
        const data = localStorage.getItem('postUrls');
        return data ? JSON.parse(data) : [];
    }

    function setPostUrls(urls) {
        // Deduplicate before saving
        const data = JSON.stringify(Array.from(new Set(urls)));
        localStorage.setItem('postUrls', data);
    }
    // The crawl is in progress while the 'running' flag is set. A separate flag
    // (rather than checking postUrls.length) is needed so that the last post in
    // the queue is still crawled after the queue itself has been emptied.
    function isRunning() {
        return localStorage.getItem('running') === 'true';
    }

    // Check if the URL path is a sub path, e.g. /r/somesub/
    function isSubPath(path) {
        return path.match(/^\/r\/\w+\/?$/);
    }

    // Check if the URL path is a post path, e.g. /r/somesub/comments/abc123/some_title/
    function isPostPath(path) {
        return path.match(/^\/r\/\w+\/comments\/\w+\/.+\/?$/);
    }

    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }
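    // Keep scrolling to the bottom until the page height stops growing (or the
    // scroll budget is exhausted), so lazily-loaded posts/comments end up in the DOM.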
    async function scrollUntilLoaded(maxScrolls, delay = 1e3) {
        let previousHeight = -1;
        for (let i = 0; i < maxScrolls; i++) {
            previousHeight = document.documentElement.scrollHeight;
            window.scrollBy(0, document.body.scrollHeight + 400);
            await sleep(delay); // Give the new content time to load
            await waitForPageLoad();
            if (previousHeight === document.documentElement.scrollHeight) {
                break; // The scroll height hasn't changed, so assume everything is loaded
            }
        }
    }
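    // Scrape every comment on the current post page and append the records to
    // localStorage['result'] as JSONL. The selectors (div.Comment and the
    // data-testid/data-click-id attributes) target Reddit's desktop web UI and
    // will need updating whenever Reddit changes its markup.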
    async function crawlComments() {
        console.log('start to crawl comments');
        await waitForPageLoad();
        await scrollUntilLoaded(20, 5e3);
        const comments = [];
        for (const c of document.querySelectorAll('div.Comment')) {
            // The author link is rendered asynchronously, so poll for up to 30s
            let author = '';
            for (let i = 0; i < 30; i++) {
                const node = c.querySelector('[data-testid="comment_author_link"]');
                if (node) {
                    author = getUrlPath(node.href);
                    break;
                }
                await sleep(1e3);
            }
            // Deleted or collapsed comments may lack some of these nodes, so fall
            // back to empty strings instead of throwing
            const postId = getUrlPath(window.location.href);
            const timestampNode = c.querySelector('[data-testid="comment_timestamp"]');
            const commentId = timestampNode ? getUrlPath(timestampNode.href) : '';
            const content = c.querySelector('[data-testid="comment"]')?.textContent || '';
            const vote = c.querySelector('[data-click-id="upvote"]')?.parentNode.textContent || '';
            comments.push({ postId, commentId, author, content, vote });
        }
        // Append the new records to any previously collected result
        let result = localStorage.getItem('result') || '';
        result += comments.map(c => JSON.stringify(c) + '\n').join('');
        localStorage.setItem('result', result);
    }
    // Figure out what kind of page we are on
    const url = new URL(window.location.href);
    const path = url.pathname;
    console.log(`path=${path} isPostPath=${!!isPostPath(path)} isSubPath=${!!isSubPath(path)}`);

    // Menu command: collect the post URLs on the current sub page and start crawling
    GM_registerMenuCommand('Run Reddit Crawl', async () => {
        if (!isSubPath(path)) {
            return;
        }
        console.log('start to crawl post');
        // Scroll down the page and wait to make sure all posts are loaded
        await scrollUntilLoaded(10, 5e3);
        await waitForPageLoad();
        const selector = 'div.Post div[data-adclicklocation="title"] a[data-click-id="body"]';
        const postUrls = Array.from(document.querySelectorAll(selector)).map(a => a.href);
        const postUrl = postUrls.pop();
        if (!postUrl) {
            alert('No posts found on this page');
            return;
        }
        setPostUrls(postUrls);
        localStorage.setItem('running', 'true');
        // Open the first post; the crawl continues from there on each page load
        window.open(postUrl, '_self');
    });
    // Menu command: download the collected result as jsonl
    GM_registerMenuCommand('Download Result', downloadResult);

    // Menu command: remove the crawler's localStorage data
    GM_registerMenuCommand('Clean Data', cleanData);
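    // This block runs on every page load. If a crawl is in progress and we are on
    // a post page, scrape its comments, then advance to the next queued post (or
    // finish if the queue is empty).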
    if (isRunning()) {
        if (isPostPath(path)) {
            await sleep(5e3);
            await crawlComments();
        }
        // Pop the next post URL and save the remaining queue
        const postUrls = getPostUrls();
        const postUrl = postUrls.pop();
        setPostUrls(postUrls);
        if (postUrl) {
            window.open(postUrl, '_self');
        } else {
            // Queue exhausted: stop the crawl and notify the user
            localStorage.removeItem('running');
            alert('Reddit Crawl completed');
        }
    }
})();
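Each crawled comment is appended to localStorage['result'] as one JSON object per line. An illustrative record (the field values below are made up) looks like this:

{"postId":"/r/somesub/comments/abc123/some_title/","commentId":"/r/somesub/comments/abc123/comment/xyz789/","author":"/user/someone/","content":"Example comment text","vote":"42"}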