Skip to content

Instantly share code, notes, and snippets.

@n4n0GH
Last active September 1, 2019 09:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save n4n0GH/a050f027344e0060814fb67d46d54ac6 to your computer and use it in GitHub Desktop.
Save n4n0GH/a050f027344e0060814fb67d46d54ac6 to your computer and use it in GitHub Desktop.
Crawls page 0 (the first page) of a given 4channel board and saves its threads, including replies, to <board>.json
/* general setup */
const rp = require('request-promise')
const cheerio = require('cheerio')
const fs = require('fs')
/* base address of the site and the board that gets crawled */
const url = 'https://boards.4channel.org/'
const board = 'a'
/* ids of the threads found on the board's first page, filled in by the crawl below */
const threadList = []
console.log('Loading config..')
/*
 * request options for the board index page; `transform` hands the response
 * body to cheerio so the promise yields a loaded document instead of raw text
 */
const options = {
  uri: `${url}${board}`,
  transform: (body) => cheerio.load(body)
}
/* collected thread records that are eventually written to disk */
const thread = []
/*
 * Crawl pipeline:
 *   1. load the board index and collect the id of every thread on it,
 *   2. fetch all of those threads and turn them into plain objects,
 *   3. serialize the collected records with JSON.stringify and write the
 *      file exactly once, after every thread request has settled.
 *
 * Records are built as objects instead of hand-concatenated JSON text so that
 * quotes, backslashes, newlines and control characters in post content cannot
 * produce an invalid file.
 */

/**
 * Extracts the fields shared by opening posts and replies.
 *
 * @param {object} container cheerio selection of a `div.opContainer` or
 *   `div.replyContainer` element
 * @returns {{id: string, timestamp: string, name: string, text: string,
 *   file: {originalName: string, src: string}}} the post fields; `text` is the
 *   inner HTML of the message, and both `file` values are '' when the post has
 *   no attachment
 */
function parsePost(container) {
  const fileLink = container.find('div.post > div.file > div.fileText > a')
  // cheerio always returns a (possibly empty) selection, never undefined,
  // so the presence of an attachment has to be decided by its length
  const hasFile = fileLink.length > 0
  return {
    // container ids carry a two character prefix in front of the post number
    id: (container.attr('id') || '').slice(2),
    timestamp: container.find('div.post > div.postInfo > span.dateTime').text().trim(),
    name: container.find('div.post > div.postInfo > span.nameBlock > span.name').text().trim(),
    // .html() yields null when the message element is missing
    text: container.find('div.post > blockquote.postMessage').html() || '',
    file: {
      originalName: hasFile ? fileLink.text().trim() : '',
      src: hasFile ? (fileLink.attr('href') || '') : ''
    }
  }
}

/**
 * Builds the record for one `div.thread` element.
 *
 * @param {Function} $ cheerio document the element belongs to
 * @param {object} threadEl the `div.thread` element
 * @returns {object|null} the thread record including its replies, or null
 *   when the element contains no opening post
 */
function parseThread($, threadEl) {
  const opContainer = $(threadEl).find('div.opContainer')
  if (opContainer.length === 0) {
    return null
  }
  const op = parsePost(opContainer)
  const replies = []
  $(threadEl).find('div.replyContainer').each(function(index, replyEl) {
    const post = parsePost($(replyEl))
    replies.push({
      id: post.id,
      thread: op.id,
      timestamp: post.timestamp,
      file: post.file,
      name: post.name,
      text: post.text
    })
  })
  return {
    board: board,
    thread: op.id,
    timestamp: op.timestamp,
    subject: opContainer.find('div.post > div.postInfo > span.subject').text().trim(),
    name: op.name,
    text: op.text,
    file: op.file,
    replies: replies,
    hidden: []
  }
}

/**
 * Downloads a single thread page and parses it.
 *
 * @param {string} threadId id of the thread to fetch
 * @returns {Promise<object[]>} resolves with the records parsed from the page;
 *   a failed request is logged and resolves with an empty list so that one
 *   broken thread does not abort the rest of the crawl
 */
function fetchThread(threadId) {
  const threadOptions = {
    uri: url + board + '/thread/' + threadId,
    transform: function(body) {
      return cheerio.load(body)
    }
  }
  return rp(threadOptions)
    .then(function($) {
      console.log(threadOptions.uri)
      console.log(threadId)
      const records = []
      $('div.board > div.thread').each(function(index, threadEl) {
        const record = parseThread($, threadEl)
        if (record !== null) {
          records.push(record)
        }
      })
      return records
    })
    .catch(function(err) {
      console.log(err)
      return []
    })
}

rp(options)
  .then(function($) {
    console.log('Board:' + board)
    // find all threads on first page
    $('div.board > div.thread').each(function(index, threadEl) {
      const containerId = $(threadEl).find('div.opContainer').attr('id')
      if (containerId) {
        threadList.push(containerId.slice(2))
      }
    })
    console.log(threadList)
    // returning the combined promise keeps the chain waiting for every thread
    return Promise.all(threadList.map(function(threadId) {
      return fetchThread(threadId)
    }))
  })
  .then(function(results) {
    // results follow the order of threadList, so the output order is stable
    results.forEach(function(records) {
      records.forEach(function(record) {
        thread.push(record)
      })
    })
    const fName = board + '.json'
    return new Promise(function(resolve, reject) {
      fs.writeFile(fName, JSON.stringify(thread), function(error) {
        if (error) {
          // reject instead of throwing: a throw inside this callback would
          // escape the promise chain as an uncaught exception
          reject(error)
          return
        }
        console.log('Saved file ' + fName + '!')
        resolve()
      })
    })
  })
  .catch(function(err) {
    console.log(err)
  })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment