A tampermonkey script to crawl reddit comments.
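To use it: install the script in Tampermonkey, open a subreddit listing page such as https://www.reddit.com/r/AskReddit/ (any sub works; that one is just an example), and pick "Run Reddit Crawl" from the Tampermonkey menu. The script queues the post URLs it finds, then navigates through them one by one, scraping each post's comments into localStorage. Once it reports "Reddit Crawl completed", use "Download Result" to export the data and "Clean Data" to reset the queue and results. Each line of the export is one comment as a JSON object; a line might look like this (values are illustrative):

{"postId": "/r/AskReddit/comments/abc123/some_title/", "commentId": "/r/AskReddit/comments/abc123/some_title/def456/", "author": "/user/example_user/", "content": "Example comment text", "vote": "42"}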
// ==UserScript==
// @name         Reddit Crawler
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Crawl posts and comments on a specific reddit sub and export as jsonl
// @author       Bing
// @match        https://www.reddit.com/*
// @grant        GM_registerMenuCommand
// ==/UserScript==
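// Overall flow: the "Run Reddit Crawl" menu command collects post URLs from a
// subreddit listing into localStorage['postUrls']; because the script matches
// every reddit.com page, it re-runs after each navigation, scrapes the open
// post's comments, appends them to localStorage['result'] as JSONL, and moves
// on to the next queued post until the queue is empty.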
(async function () {
    'use strict';

    // Define a function to download the result as jsonl
    function downloadResult() {
        // Get the result from localStorage
        const result = localStorage.getItem('result');
        // Check if the result is not empty
        if (result) {
            // Create a blob with the jsonl content
            const blob = new Blob([result], { type: 'text/jsonl' });
            // Create a URL for the blob
            const url = URL.createObjectURL(blob);
            // Create a link element with the URL and a filename
            const link = document.createElement('a');
            link.href = url;
            link.download = 'reddit-crawler.jsonl';
            // Append the link to the document body, click it, then clean up
            document.body.appendChild(link);
            link.click();
            document.body.removeChild(link);
            URL.revokeObjectURL(url);
        } else {
            // No result, alert the user
            alert('No result to download');
        }
    }
    // Define a function to remove the localStorage data
    function cleanData() {
        // Remove the postUrls and result from localStorage
        localStorage.removeItem('postUrls');
        localStorage.removeItem('result');
        // Alert the user
        alert('Data cleaned');
    }
    // Define a function that resolves once the document has loaded
    function waitForPageLoad() {
        return new Promise(resolve => {
            if (document.readyState === 'complete' || document.readyState === 'interactive') {
                // If the document has finished loading, resolve the promise
                resolve();
            } else {
                // Otherwise, resolve the promise when the DOMContentLoaded event fires
                document.addEventListener('DOMContentLoaded', resolve);
            }
        });
    }
    // Extract the pathname from a URL, e.g. '/r/sub/comments/abc123/title/'
    function getUrlPath(url) {
        return new URL(url).pathname;
    }

    // Read the queue of pending post URLs from localStorage
    function getPostUrls() {
        const data = localStorage.getItem('postUrls');
        return data ? JSON.parse(data) : [];
    }

    // Save the queue of pending post URLs, deduplicated, to localStorage
    function setPostUrls(urls) {
        const data = JSON.stringify(Array.from(new Set(urls)));
        localStorage.setItem('postUrls', data);
    }
    // Define a function to check if a crawl is in progress
    function isRunning() {
        return getPostUrls().length > 0;
    }

    // Define a function to check if the URL path is a sub path
    function isSubPath(path) {
        // Check if the path matches /r/<sub>/
        return path.match(/^\/r\/\w+\/?$/);
    }

    // Define a function to check if the URL path is a post path
    function isPostPath(path) {
        // Check if the path matches /r/<sub>/comments/<id>/<title>/
        return path.match(/^\/r\/\w+\/comments\/\w+\/.+\/?$/);
    }
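    // Example paths (illustrative): isSubPath('/r/AskReddit/') is truthy;
    // isPostPath('/r/AskReddit/comments/abc123/some_title/') is truthy.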
    // Define a function that sleeps for the given number of milliseconds
    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    // Scroll to the bottom repeatedly until the page stops growing (or maxScrolls is reached)
    async function scrollUntilLoaded(maxScrolls, delay = 1e3) {
        let previousHeight = -1;
        for (let i = 0; i < maxScrolls; i++) {
            previousHeight = document.documentElement.scrollHeight;
            window.scrollBy(0, document.body.scrollHeight + 400);
            await sleep(delay); // Wait for the new content to load
            await waitForPageLoad();
            if (previousHeight === document.documentElement.scrollHeight) {
                break; // If the scroll height hasn't changed, break the loop
            }
        }
    }
    async function crawlComments() {
        console.log('start to crawl comments');
        await waitForPageLoad();
        await scrollUntilLoaded(20, 5e3);
        // Collect all comments (id, author, content, vote count) and append them to localStorage['result']
        const comments = [];
        for (const c of document.querySelectorAll('div.Comment')) {
            // Wait (up to 30 seconds) for the author information to load
            let author = '';
            for (let i = 0; i < 30; i++) {
                const node = c.querySelector('[data-testid="comment_author_link"]');
                if (node) {
                    author = getUrlPath(node.href);
                    break;
                }
                await sleep(1e3);
            }
            const postId = getUrlPath(window.location.href);
            // Guard each lookup: deleted or collapsed comments may be missing these nodes
            const timestampLink = c.querySelector('[data-testid="comment_timestamp"]');
            const commentId = timestampLink ? getUrlPath(timestampLink.href) : '';
            const content = c.querySelector('[data-testid="comment"]')?.textContent ?? '';
            const vote = c.querySelector('[data-click-id="upvote"]')?.parentNode.textContent ?? '';
            // Push an object with the comment data
            comments.push({ postId, commentId, author, content, vote });
        }
        // Get the result from localStorage or initialize it as an empty string
        let result = localStorage.getItem('result') || '';
        // Append the new data as jsonl
        result += comments.map(c => JSON.stringify(c) + '\n').join('');
        // Save the result back to localStorage
        localStorage.setItem('result', result);
    }
    // Get the current URL and its path
    const url = new URL(window.location.href);
    const path = url.pathname;
    console.log(`Is ${path} a post path: ${!!isPostPath(path)}, a sub path: ${!!isSubPath(path)}`);

    // Register a menu command named 'Run Reddit Crawl' with the following callback
    GM_registerMenuCommand('Run Reddit Crawl', async () => {
        if (!isSubPath(path)) {
            return;
        }
        console.log('start to crawl posts');
        // Scroll down the page and wait to ensure all posts are loaded
        await scrollUntilLoaded(10, 5e3);
        await waitForPageLoad();
        // Collect all post URLs on the current page and save them to localStorage['postUrls']
        const selector = 'div.Post div[data-adclicklocation="title"] a[data-click-id="body"]';
        const postUrls = Array.from(document.querySelectorAll(selector)).map(a => a.href);
        // Take one post URL to visit now and queue the rest
        const postUrl = postUrls.pop();
        setPostUrls(postUrls);
        // Open that post URL in the current tab
        window.open(postUrl, '_self');
    });
    // Register a menu command named 'Download Result' with the following callback
    GM_registerMenuCommand('Download Result', () => {
        // Download the result as jsonl
        downloadResult();
    });

    // Register a menu command named 'Clean Data' with the following callback
    GM_registerMenuCommand('Clean Data', () => {
        // Remove the localStorage data
        cleanData();
    });
    // If a crawl is running, scrape this page (when it is a post path) and move on
    if (isRunning()) {
        if (isPostPath(path)) {
            await sleep(5e3);
            await crawlComments();
        }
        // Pop the next post URL from the queue
        const postUrls = getPostUrls();
        const postUrl = postUrls.pop();
        // Save the remaining post URLs to localStorage
        setPostUrls(postUrls);
        // Check if there is any post URL left
        if (postUrl) {
            // Open the next post URL
            window.open(postUrl, '_self');
        } else {
            // No more post URLs, alert the user
            alert('Reddit Crawl completed');
        }
    }
})();
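Once downloaded, the jsonl file is easy to consume outside the browser. Here is a minimal sketch in Node.js, assuming the export was saved as reddit-crawler.jsonl in the working directory (the filename is whatever you saved it as):

// parse-result.js — read the export and count comments per post
const fs = require('fs');

// One JSON object per non-empty line
const comments = fs.readFileSync('reddit-crawler.jsonl', 'utf8')
    .split('\n')
    .filter(line => line.trim() !== '')
    .map(line => JSON.parse(line));

// Tally how many comments were scraped from each post
const perPost = new Map();
for (const { postId } of comments) {
    perPost.set(postId, (perPost.get(postId) || 0) + 1);
}
console.log(`${comments.length} comments across ${perPost.size} posts`);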