Skip to content

Instantly share code, notes, and snippets.

@Asbra
Created December 13, 2014 17:29
Show Gist options
  • Save Asbra/002a93d8249f045a2adf to your computer and use it in GitHub Desktop.
Save Asbra/002a93d8249f045a2adf to your computer and use it in GitHub Desktop.
4chan downloader
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author: johan
# @date: 2014-12-12
# @modified_by: johan
# @modified_at: 2014-12-12
import re # Regular expressions
import requests # To make HTTP requests
import json # To parse 4chan's JSON
import shutil # Used when downloading file
import os # For creating folders
class fchan(object):
    """Scraper for a 4chan board catalog and the images linked from a thread.

    The class keeps no state. `catalog` returns the thread ids of a board and
    `thread` returns the protocol-relative image URLs found in one thread.
    Both methods report progress on stdout and return an empty list when the
    page cannot be fetched or parsed.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout requests.get may block indefinitely.
    _TIMEOUT = 30

    # Locates the start of the JSON object assigned to `catalog` in the page.
    _CATALOG_RE = re.compile(r'var catalog\s*=\s*')

    # Captures protocol-relative image URLs inside href attributes. The dots
    # are escaped so they only match a literal '.', and the board part accepts
    # digits so board names such as "3" or "s4s" are matched as well.
    _IMAGE_RE = re.compile(
        r'href="[^"]*?(//i\.4cdn\.org/[a-z0-9]+/[0-9]+\.(?:jpg|jpeg|png|gif|webm))"')

    def _fetch(self, url):
        """Return the body of `url` as text, or None if it cannot be read."""
        try:
            r = requests.get(url, timeout=self._TIMEOUT)
        except requests.RequestException:
            return None
        if r.status_code != 200 or not r.content:
            return None
        # r.text is decoded text on both Python 2 and 3, so the str regexes
        # above can be applied to it (r.content is bytes on Python 3).
        return r.text

    def _parse_catalog(self, html):
        """Return the thread ids in catalog page `html`, or None if absent."""
        marker = self._CATALOG_RE.search(html)
        if marker is None:
            return None
        try:
            # raw_decode parses exactly one JSON value and ignores whatever
            # follows it, so trailing script code cannot corrupt the result.
            data, _ = json.JSONDecoder().raw_decode(html, marker.end())
        except ValueError:
            return None
        if not isinstance(data, dict) or not isinstance(data.get('threads'), dict):
            return None
        return list(data['threads'])

    def _parse_images(self, html):
        """Return the distinct image URLs in thread page `html`, in order."""
        return self.uniq(self._IMAGE_RE.findall(html))

    def catalog(self, section):
        """Return the list of thread ids (strings) listed in /section/.

        Returns an empty list if the catalog page cannot be downloaded or
        does not contain the expected catalog data.
        """
        print('Grabbing catalog for /' + section + '/ ..')
        # Build Url
        url = 'https://boards.4chan.org/' + section + '/catalog'
        # Download page
        html = self._fetch(url)
        # Error handling
        if html is None:
            print('Failed to read catalog. Are you sure that there is a /' + section + '/ section?')
            return []
        # Find all threads in the JSON data
        threads = self._parse_catalog(html)
        if threads is None:
            print('Failed to parse catalog for /' + section + '/')
            return []
        print('Found ' + str(len(threads)) + ' threads')
        return threads

    def thread(self, section, id):
        """Return the distinct image URLs linked from thread `id` of /section/.

        The URLs are protocol-relative ("//i.4cdn.org/..."). Returns an empty
        list if the thread page cannot be downloaded.
        """
        print('Grabbing thread /' + section + '/thread/' + id + '/')
        # Build Url
        url = 'https://boards.4chan.org/' + section + '/thread/' + id + '/'
        # Download page
        html = self._fetch(url)
        # Error handling
        if html is None:
            print('Failed to read thread /' + section + '/thread/' + id + '/')
            return []
        # Find all images in thread; count them after de-duplication so the
        # reported number equals the number of URLs returned.
        images = self._parse_images(html)
        print('Found ' + str(len(images)) + ' images in thread /' + section + '/thread/' + id + '/')
        return images

    # Remove duplicate elements in list
    def uniq(self, seq):
        """Return the items of `seq` with duplicates removed, order preserved."""
        seen = set()
        result = []
        for item in seq:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result
if __name__ == '__main__':
    # Command-line entry point: download every image of every thread listed
    # in the catalog of the given section into ./<section>/<thread id>/.
    import sys

    # Input validation. Surrounding slashes are stripped so that "/b/" and
    # "b" name the same section.
    section = sys.argv[1].strip('/') if len(sys.argv) > 1 else ''
    if not section:
        print('4chan.py <section>')
        print('No section given')
        # Non-zero status so callers can tell the usage error from success.
        sys.exit(1)

    chan = fchan()
    # Get all threads from the catalog
    threads = chan.catalog(section)
    # Create section folder if it doesn't exist
    if not os.path.exists(section):
        os.makedirs(section)

    # File name at the end of an image URL; the dot before the extension is
    # escaped so it only matches a literal '.'.
    image_name_re = re.compile(r'/([0-9]+\.(?:jpg|jpeg|png|gif|webm))$')

    # Iterate all threads and download images
    for thread in threads:
        # Get images
        images = chan.thread(section, thread)
        # Create image folder if it doesn't exist
        thread_dir = section + '/' + thread
        if not os.path.exists(thread_dir):
            os.makedirs(thread_dir)
        # Iterate images list and download them
        for image in images:
            match = image_name_re.search(image)
            if not match:
                # URL does not end in a recognised image file name; skip it
                # instead of failing on an empty match list.
                continue
            filename = thread_dir + '/' + match.group(1)
            print('Downloading /' + filename)
            # Download image
            try:
                q = requests.get('https:' + image, stream=True)
            except requests.RequestException:
                # One unreachable image must not abort the whole run.
                print('Failed to download /' + filename)
                continue
            try:
                if q.status_code != 200:
                    # Do not save an error page under an image file name.
                    print('Failed to download /' + filename)
                    continue
                with open(filename, 'wb') as f:
                    # Let urllib3 undo any gzip/deflate transfer encoding
                    # while the raw stream is copied to disk.
                    q.raw.decode_content = True
                    shutil.copyfileobj(q.raw, f)
            finally:
                # Release the streamed connection back to the pool.
                q.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment