A tampermonkey script to crawl reddit comments.
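To use it: install the script in Tampermonkey, open a subreddit listing page such as https://www.reddit.com/r/AskReddit/ (any sub works; that one is just an example), and pick "Run Reddit Crawl" from the Tampermonkey menu. The script queues the post URLs it finds, then navigates through them one by one, scraping each post's comments into localStorage. Once it reports "Reddit Crawl completed", use "Download Result" to export the data and "Clean Data" to reset the queue and results. Each line of the export is one comment as a JSON object; a line might look like this (values are illustrative):

{"postId": "/r/AskReddit/comments/abc123/some_title/", "commentId": "/r/AskReddit/comments/abc123/some_title/def456/", "author": "/user/example_user/", "content": "Example comment text", "vote": "42"}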
// ==UserScript==
// @name         Reddit Crawler
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Crawl posts and comments on a specific reddit sub and export as jsonl
// @author       Bing
// @match        https://www.reddit.com/*
// @grant        GM_registerMenuCommand
// ==/UserScript==
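// Overall flow: the "Run Reddit Crawl" menu command collects post URLs from a
// subreddit listing into localStorage['postUrls']; because the script matches
// every reddit.com page, it re-runs after each navigation, scrapes the open
// post's comments, appends them to localStorage['result'] as JSONL, and moves
// on to the next queued post until the queue is empty.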
(async function () {
    'use strict';

    // Define a function to download the result as jsonl
    function downloadResult() {
        // Get the result from localStorage
        const result = localStorage.getItem('result');
        // Check if the result is not empty
        if (result) {
            // Create a blob with the jsonl content
            const blob = new Blob([result], { type: 'text/jsonl' });
            // Create a URL for the blob
            const url = URL.createObjectURL(blob);
            // Create a link element with the URL and a filename
            const link = document.createElement('a');
            link.href = url;
            link.download = 'reddit-crawler.jsonl';
            // Append the link to the document body, click it, then clean up
            document.body.appendChild(link);
            link.click();
            document.body.removeChild(link);
            URL.revokeObjectURL(url);
        } else {
            // No result, alert the user
            alert('No result to download');
        }
    }
    // Define a function to remove the localStorage data
    function cleanData() {
        // Remove the postUrls and result from localStorage
        localStorage.removeItem('postUrls');
        localStorage.removeItem('result');
        // Alert the user
        alert('Data cleaned');
    }
    // Define a function that resolves once the document has loaded
    function waitForPageLoad() {
        return new Promise(resolve => {
            if (document.readyState === 'complete' || document.readyState === 'interactive') {
                // If the document has finished loading, resolve the promise
                resolve();
            } else {
                // Otherwise, resolve the promise when the DOMContentLoaded event fires
                document.addEventListener('DOMContentLoaded', resolve);
            }
        });
    }
    // Extract the pathname from a URL, e.g. '/r/sub/comments/abc123/title/'
    function getUrlPath(url) {
        return new URL(url).pathname;
    }

    // Read the queue of pending post URLs from localStorage
    function getPostUrls() {
        const data = localStorage.getItem('postUrls');
        return data ? JSON.parse(data) : [];
    }

    // Save the queue of pending post URLs, deduplicated, to localStorage
    function setPostUrls(urls) {
        const data = JSON.stringify(Array.from(new Set(urls)));
        localStorage.setItem('postUrls', data);
    }
    // Define a function to check if a crawl is in progress
    function isRunning() {
        return getPostUrls().length > 0;
    }

    // Define a function to check if the URL path is a sub path
    function isSubPath(path) {
        // Check if the path matches /r/<sub>/
        return path.match(/^\/r\/\w+\/?$/);
    }

    // Define a function to check if the URL path is a post path
    function isPostPath(path) {
        // Check if the path matches /r/<sub>/comments/<id>/<title>/
        return path.match(/^\/r\/\w+\/comments\/\w+\/.+\/?$/);
    }
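    // Example paths (illustrative): isSubPath('/r/AskReddit/') is truthy;
    // isPostPath('/r/AskReddit/comments/abc123/some_title/') is truthy.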
    // Define a function that sleeps for the given number of milliseconds
    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    // Scroll to the bottom repeatedly until the page stops growing (or maxScrolls is reached)
    async function scrollUntilLoaded(maxScrolls, delay = 1e3) {
        let previousHeight = -1;
        for (let i = 0; i < maxScrolls; i++) {
            previousHeight = document.documentElement.scrollHeight;
            window.scrollBy(0, document.body.scrollHeight + 400);
            await sleep(delay); // Wait for the new content to load
            await waitForPageLoad();
            if (previousHeight === document.documentElement.scrollHeight) {
                break; // If the scroll height hasn't changed, break the loop
            }
        }
    }
    async function crawlComments() {
        console.log('start to crawl comments');
        await waitForPageLoad();
        await scrollUntilLoaded(20, 5e3);
        // Collect all comments (id, author, content, vote count) and append them to localStorage['result']
        const comments = [];
        for (const c of document.querySelectorAll('div.Comment')) {
            // Wait (up to 30 seconds) for the author information to load
            let author = '';
            for (let i = 0; i < 30; i++) {
                const node = c.querySelector('[data-testid="comment_author_link"]');
                if (node) {
                    author = getUrlPath(node.href);
                    break;
                }
                await sleep(1e3);
            }
            const postId = getUrlPath(window.location.href);
            // Guard each lookup: deleted or collapsed comments may be missing these nodes
            const timestampLink = c.querySelector('[data-testid="comment_timestamp"]');
            const commentId = timestampLink ? getUrlPath(timestampLink.href) : '';
            const content = c.querySelector('[data-testid="comment"]')?.textContent ?? '';
            const vote = c.querySelector('[data-click-id="upvote"]')?.parentNode.textContent ?? '';
            // Push an object with the comment data
            comments.push({ postId, commentId, author, content, vote });
        }
        // Get the result from localStorage or initialize it as an empty string
        let result = localStorage.getItem('result') || '';
        // Append the new data as jsonl
        result += comments.map(c => JSON.stringify(c) + '\n').join('');
        // Save the result back to localStorage
        localStorage.setItem('result', result);
    }
    // Get the current URL and its path
    const url = new URL(window.location.href);
    const path = url.pathname;
    console.log(`Is ${path} a post path: ${!!isPostPath(path)}, a sub path: ${!!isSubPath(path)}`);

    // Register a menu command named 'Run Reddit Crawl' with the following callback
    GM_registerMenuCommand('Run Reddit Crawl', async () => {
        if (!isSubPath(path)) {
            return;
        }
        console.log('start to crawl posts');
        // Scroll down the page and wait to ensure all posts are loaded
        await scrollUntilLoaded(10, 5e3);
        await waitForPageLoad();
        // Collect all post URLs on the current page and save them to localStorage['postUrls']
        const selector = 'div.Post div[data-adclicklocation="title"] a[data-click-id="body"]';
        const postUrls = Array.from(document.querySelectorAll(selector)).map(a => a.href);
        // Take one post URL to visit now and queue the rest
        const postUrl = postUrls.pop();
        setPostUrls(postUrls);
        // Open that post URL in the current tab
        window.open(postUrl, '_self');
    });
    // Register a menu command named 'Download Result' with the following callback
    GM_registerMenuCommand('Download Result', () => {
        // Download the result as jsonl
        downloadResult();
    });

    // Register a menu command named 'Clean Data' with the following callback
    GM_registerMenuCommand('Clean Data', () => {
        // Remove the localStorage data
        cleanData();
    });
    // If a crawl is running, scrape this page (when it is a post path) and move on
    if (isRunning()) {
        if (isPostPath(path)) {
            await sleep(5e3);
            await crawlComments();
        }
        // Pop the next post URL from the queue
        const postUrls = getPostUrls();
        const postUrl = postUrls.pop();
        // Save the remaining post URLs to localStorage
        setPostUrls(postUrls);
        // Check if there is any post URL left
        if (postUrl) {
            // Open the next post URL
            window.open(postUrl, '_self');
        } else {
            // No more post URLs, alert the user
            alert('Reddit Crawl completed');
        }
    }
})();
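Once downloaded, the jsonl file is easy to consume outside the browser. Here is a minimal sketch in Node.js, assuming the export was saved as reddit-crawler.jsonl in the working directory (the filename is whatever you saved it as):

// parse-result.js — read the export and count comments per post
const fs = require('fs');

// One JSON object per non-empty line
const comments = fs.readFileSync('reddit-crawler.jsonl', 'utf8')
    .split('\n')
    .filter(line => line.trim() !== '')
    .map(line => JSON.parse(line));

// Tally how many comments were scraped from each post
const perPost = new Map();
for (const { postId } of comments) {
    perPost.set(postId, (perPost.get(postId) || 0) + 1);
}
console.log(`${comments.length} comments across ${perPost.size} posts`);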