Created
December 13, 2014 17:29
-
-
Save Asbra/002a93d8249f045a2adf to your computer and use it in GitHub Desktop.
4chan downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author: johan
# @date: 2014-12-12
# @modified_by: johan
# @modified_at: 2014-12-12
import re # Regular expressions | |
import requests # To make HTTP requests | |
import json # To parse 4chan's JSON | |
import shutil # Used when downloading file | |
import os # For creating folders | |
class fchan(object):
    """Scraper for 4chan board catalogs and the images linked from threads.

    The code sticks to constructs that are valid on both Python 2 and
    Python 3 (single-argument ``print(...)``, no ``iteritems``), and works on
    the decoded response text so the regular expressions never see bytes.
    """

    # Seconds to wait for the server before a request is abandoned; without
    # a timeout a stalled connection would hang the whole run.
    _TIMEOUT = 30

    # The catalog page embeds its data in a script as ``var catalog = {...};``.
    _CATALOG_START = re.compile(r'var catalog\s*=\s*')

    # Full-size image links. Dots are escaped so they only match a literal
    # '.', and board names may contain digits (e.g. /3/ or /s4s/).
    _IMAGE_LINK = re.compile(
        r'href="[^"]*?(//i\.4cdn\.org/[a-z0-9]+/[0-9]+'
        r'\.(?:jpg|jpeg|png|gif|webm))"')

    def _fetch(self, url):
        """Return the decoded body of ``url``, or None if it cannot be read.

        None is returned for connection errors, non-200 responses and empty
        bodies, so callers have a single failure case to handle.
        """
        try:
            r = requests.get(url, timeout=self._TIMEOUT)
        except requests.RequestException:
            return None
        if r.status_code != 200 or not r.content:
            return None
        return r.text

    @staticmethod
    def _parse_catalog(html):
        """Extract the thread ids from a catalog page.

        Returns a list of the keys of the embedded ``threads`` object, or
        None when the page holds no parseable ``var catalog = ...`` data.
        """
        start = fchan._CATALOG_START.search(html)
        if start is None:
            return None
        try:
            # raw_decode reads exactly one JSON value and ignores whatever
            # script code follows it, so a '};' inside a string or a second
            # variable on the same line cannot confuse the extraction.
            data, _ = json.JSONDecoder().raw_decode(html[start.end():])
        except ValueError:
            return None
        if not isinstance(data, dict) or not isinstance(
                data.get('threads'), dict):
            return None
        return list(data['threads'])

    @staticmethod
    def _parse_images(html):
        """Return every image URL linked in ``html``, duplicates included.

        URLs are returned as they appear in the page, i.e. protocol-relative
        (``//i.4cdn.org/<board>/<number>.<ext>``).
        """
        return fchan._IMAGE_LINK.findall(html)

    def catalog(self, section):
        """Return the ids of all threads listed in the catalog of ``section``.

        ``section`` is the board name without slashes (e.g. ``'g'``). An
        empty list is returned when the page cannot be fetched or parsed.
        """
        print('Grabbing catalog for /' + section + '/ ..')

        html = self._fetch('https://boards.4chan.org/' + section + '/catalog')
        if html is None:
            print('Failed to read catalog. Are you sure that there is a /'
                  + section + '/ section?')
            return []

        threads = self._parse_catalog(html)
        if threads is None:
            print('Failed to parse catalog for /' + section + '/')
            return []

        print('Found ' + str(len(threads)) + ' threads')
        return threads

    def thread(self, section, id):
        """Return the unique image URLs linked in one thread.

        ``id`` is the thread id (the name shadows the builtin but is kept
        because it is part of the public signature); it is converted with
        ``str`` so numeric ids are accepted as well. URLs are returned in
        page order without a scheme. An empty list is returned when the
        thread cannot be fetched.
        """
        label = '/' + section + '/thread/' + str(id) + '/'
        print('Grabbing thread ' + label)

        html = self._fetch('https://boards.4chan.org' + label)
        if html is None:
            print('Failed to read thread ' + label)
            return []

        # Each file is linked more than once per post, so de-duplicate
        # before reporting the count.
        images = self.uniq(self._parse_images(html))
        print('Found ' + str(len(images)) + ' images in thread ' + label)
        return images

    def uniq(self, seq):
        """Return the items of ``seq`` without duplicates, in first-seen order.

        Items must be hashable.
        """
        seen = set()
        unique = []
        for item in seq:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique
if __name__ == '__main__': | |
import sys | |
# Input validation | |
if len(sys.argv) < 2 or not sys.argv[1] : | |
print '4chan.py <section>' | |
print 'No section given' | |
sys.exit() | |
section = sys.argv[1] | |
chan = fchan() | |
# Get all threads from the catalog | |
threads = chan.catalog(section) | |
# Create section folder if it doesn't exist | |
if not os.path.exists(section) : | |
os.makedirs(section) | |
# Iterate all threads and download images | |
for thread in threads : | |
# Get images | |
images = chan.thread(section, thread) | |
# Create image folder if it doesn't exist | |
if not os.path.exists(section+'/'+thread) : | |
os.makedirs(section+'/'+thread) | |
# Iterate images list and download them | |
for image in images : | |
match = re.findall(r'\/([0-9]+.(jpg|jpeg|png|gif|webm))$', image) | |
if match[0] : | |
filename = section+'/'+thread+'/'+match[0][0] | |
print 'Downloading /'+filename | |
# Download image | |
q = requests.get('https:'+image, stream=True) | |
with open(filename, 'wb') as f : | |
q.raw.decode_content = True | |
shutil.copyfileobj(q.raw, f) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment