
@josephwegner
Created August 10, 2019 17:15
Scrape /r/Hobbies comments for subreddit links

Simple enough to run... just use the following commands, in order (count.js reads the subreddits.list file that scrape.js creates)

node scrape.js #creates the file subreddits.list
node count.js #outputs markdown-formatted data.

Please be respectful with this script. Pushshift is a free service, and serving these API requests costs them money.
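
For reference, scrape.js writes one subreddit mention per line to subreddits.list, so the file ends up looking roughly like this (the names below are made-up examples, and duplicates are expected):

/r/woodworking
/r/knitting
/r/woodworking
/r/gardening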

// count.js — tallies how often each subreddit appears in subreddits.list
// and prints the results as markdown table rows, most-mentioned first.
const readline = require('readline');
const fs = require('fs');

const readInterface = readline.createInterface({
  input: fs.createReadStream('subreddits.list'),
});

var subreddits = {}

// Each line of subreddits.list is a single /r/<name> mention; count them.
readInterface.on('line', function(line) {
  if (!subreddits[line]) {
    subreddits[line] = 1
  } else {
    subreddits[line]++
  }
})

readInterface.on('close', () => {
  // Convert the tally into [name, count] pairs and sort by count, descending.
  var counts = []
  Object.keys(subreddits).forEach((sr) => {
    counts.push([sr, subreddits[sr]])
  })
  counts.sort((a, b) => b[1] - a[1])

  // Print one markdown table row per subreddit: name|count
  counts.forEach((subreddit) => {
    console.log(`${subreddit[0]}|${subreddit[1]}`)
  })
})
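
Run against a file like the one above, count.js prints one pipe-delimited row per subreddit (the counts here are hypothetical); to render it as a complete markdown table, prepend a header such as Subreddit|Count and ---|---:

/r/woodworking|2
/r/knitting|1
/r/gardening|1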
// scrape.js — pages backwards through /r/Hobbies comments on the Pushshift API
// and appends every /r/<subreddit> mention it finds to subreddits.list.
const https = require('https')
const fs = require('fs')

const BASE_OPTS = {
  host: 'api.pushshift.io',
  port: 443,
  method: 'GET'
}
const BASE_URL = 'https://api.pushshift.io/reddit/search/comment/?subreddit=hobbies&sort=desc&sort_type=created_utc&size=500&fields=body,created_utc'
const MATCHER = /\/r\/[A-Za-z0-9_]+/g

// Truncate subreddits.list and stream subreddit names into it as they are found.
const DFILE = fs.createWriteStream('subreddits.list', { flags: 'w' })

function getComments(before) {
  // Page through results by requesting comments older than the last one seen.
  var url = before ? `${BASE_URL}&before=${before}` : BASE_URL

  const req = https.request(url, BASE_OPTS, (res) => {
    var data = ''
    res.on('data', (d) => {
      data += d
    })
    res.on('end', () => {
      console.log('request finished')
      var comments = []
      try {
        comments = JSON.parse(data).data
        console.log(`${comments.length} new comments found`)
      } catch (e) {
        console.log('error parsing', data)
      }

      // Collect every /r/<name> mention, de-duplicated within each comment.
      var subreddits = []
      comments.forEach((c) => {
        var matches = c.body.match(MATCHER)
        if (matches != null) {
          matches = matches.filter((sr, index) => {
            return matches.indexOf(sr) === index
          })
          subreddits = subreddits.concat(matches)
        }
      })

      console.log(`adding ${subreddits.length} subreddits`)
      if (subreddits.length > 0) {
        // The trailing newline keeps batches from running together on one line.
        DFILE.write(subreddits.join('\n') + '\n')
      }

      if (comments.length > 0) {
        // Wait a second between requests to stay gentle on Pushshift,
        // then fetch the next (older) page of comments.
        setTimeout(() => {
          getComments(comments[comments.length - 1].created_utc)
        }, 1000)
      } else {
        DFILE.end()
      }
    })
  })
  req.end()
  console.log('made request', before)
}

getComments()
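
For context, scrape.js assumes the Pushshift comment-search endpoint returns JSON shaped roughly like the sketch below: an object with a data array whose entries carry the two requested fields, body and created_utc. The comment text and timestamps shown here are invented for illustration.

{
  "data": [
    { "body": "You might like /r/woodworking for that.", "created_utc": 1565457300 },
    { "body": "Check out /r/knitting and /r/gardening.", "created_utc": 1565457100 }
  ]
}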