Asbra/4chan.py

## 4chan.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author:      johan
# @date:        2014-12-12
# @modified_by: johan
# @modified_at: 2014-12-12

import re        # Regular expressions
import requests  # To make HTTP requests
import json      # To parse 4chan's JSON
import shutil    # Used when downloading file
import os        # For creating folders

class fchan(object) :
	"""Small scraper for 4chan board catalogs and thread pages.

	`catalog` and `thread` download a page and pass it to `parse_catalog`
	and `parse_thread`, which do the extraction and can also be called on
	HTML that was obtained some other way. Progress and failures are
	reported with print; failures yield an empty list instead of raising.
	"""

	# Seconds to wait for a response before the request counts as failed
	TIMEOUT = 30

	def __init__(self) :
		return

	def _fetch(self, url) :
		"""Download `url` and return its decoded body, or None on failure.

		A failure is a connection error or timeout, a status code other
		than 200, or an empty body.
		"""
		try :
			r = requests.get(url, timeout=self.TIMEOUT)
		except requests.RequestException :
			return None

		if r.status_code != 200 or not r.content :
			return None

		# Decoded text, so the str patterns below work on Python 2 and 3
		return r.text

	def catalog(self, section) :
		"""Return the ids (strings) of all threads listed in a board's catalog.

		section -- board name without slashes, e.g. 'g'

		Returns an empty list if the page cannot be downloaded or parsed.
		"""
		print('Grabbing catalog for /'+section+'/ ..')

		# Build Url and download page
		page = self._fetch('https://boards.4chan.org/'+section+'/catalog')

		# Error handling
		if page is None :
			print('Failed to read catalog. Are you sure that there is a /'+section+'/ section?')
			return []

		threads = self.parse_catalog(page)

		print('Found '+str(len(threads))+' threads')

		return threads

	def parse_catalog(self, html) :
		"""Extract thread ids from the HTML of a catalog page.

		The page embeds its data as `var catalog = {...};`. The keys of
		the 'threads' object inside that JSON are the thread ids.

		Returns an empty list when the variable is missing or its value
		is not the expected JSON.
		"""
		m = re.findall(r'var catalog = (.*)?};', html)

		if not m :
			return []

		try :
			# The pattern consumes the closing brace, so put it back
			d = json.loads(m[0]+'}')
		except ValueError :
			return []

		if not isinstance(d, dict) or not isinstance(d.get('threads'), dict) :
			return []

		return list(d['threads'])

	def thread(self, section, id) :
		"""Return the image urls found in one thread, without duplicates.

		section -- board name without slashes, e.g. 'g'
		id      -- thread id as a string, as returned by `catalog`

		The urls are protocol-relative ('//i.4cdn.org/...'). Returns an
		empty list if the page cannot be downloaded.
		"""
		name = '/'+section+'/thread/'+id+'/'

		print('Grabbing thread '+name)

		# Build Url and download page
		page = self._fetch('https://boards.4chan.org'+name)

		# Error handling
		if page is None :
			print('Failed to read thread '+name)
			return []

		images = self.parse_thread(page)

		# Counted after removing duplicates, a page can link an image more than once
		print('Found '+str(len(images))+' images in thread '+name)

		return images

	def parse_thread(self, html) :
		"""Extract the unique image urls from the HTML of a thread page.

		Matches links to i.4cdn.org files named <digits>.<ext> where ext
		is jpg, jpeg, png, gif or webm. Board names may contain digits
		(e.g. 'r9k'). Order of first appearance is kept.
		"""
		m = re.findall(r'href=".*?(\/\/i\.4cdn\.org\/[a-z0-9]+\/[0-9]+\.(jpg|jpeg|png|gif|webm))"', html)

		# findall returns (url, extension) tuples, only the url is wanted
		return self.uniq([i[0] for i in m])

	# Remove duplicate elements in list
	def uniq(self, seq) :
		"""Return the elements of `seq` in order, keeping only the first
		occurrence of each. Elements must be hashable."""
		seen = set()
		result = []

		for x in seq :
			if x not in seen :
				seen.add(x)
				result.append(x)

		return result

# Command line entry point: 4chan.py <section>
# Downloads every image of every thread in the section's catalog into
# <section>/<thread id>/<file name>, relative to the working directory.
if __name__ == '__main__':
	import sys

	# Input validation
	if len(sys.argv) < 2 or not sys.argv[1].strip('/') :
		print('4chan.py <section>')
		print('No section given')
		# Non-zero status so callers can tell the run did nothing
		sys.exit(1)

	# Accept both 'g' and '/g/'
	section = sys.argv[1].strip('/')

	chan = fchan()

	# Get all threads from the catalog
	threads = chan.catalog(section)

	# Create section folder if it doesn't exist
	if not os.path.isdir(section) :
		os.makedirs(section)

	# Iterate all threads and download images
	for thread in threads :
		# Get images
		images = chan.thread(section, thread)

		# Create image folder if it doesn't exist
		folder = section+'/'+thread

		if not os.path.isdir(folder) :
			os.makedirs(folder)

		# Iterate images list and download them
		for image in images :
			match = re.search(r'\/([0-9]+\.(jpg|jpeg|png|gif|webm))$', image)

			# Skip anything that does not end in a recognised file name
			if not match :
				continue

			filename = folder+'/'+match.group(1)

			print('Downloading /'+filename)

			# Download image, one failure must not abort the whole run
			try :
				q = requests.get('https:'+image, stream=True, timeout=30)
			except requests.RequestException :
				print('Failed to download /'+filename)
				continue

			try :
				if q.status_code == 200 :
					with open(filename, 'wb') as f :
						# Let urllib3 undo gzip/deflate while copying
						q.raw.decode_content = True
						shutil.copyfileobj(q.raw, f)
				else :
					# Do not save an error page under an image name
					print('Failed to download /'+filename)
			except requests.RequestException :
				print('Failed to download /'+filename)
			finally :
				# Streamed responses hold their connection until closed
				q.close()
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# @author: johan
	# @date: 2014-12-12
	# @modified_by: johan
	# @modified_at: 2014-12-12

	import re # Regular expressions
	import requests # To make HTTP requests
	import json # To parse 4chan's JSON
	import shutil # Used when downloading file
	import os # For creating folders

	class fchan(object) :
		"""Small scraper for 4chan board catalogs and thread pages.

		`catalog` and `thread` download a page and pass it to `parse_catalog`
		and `parse_thread`, which do the extraction and can also be called on
		HTML that was obtained some other way. Progress and failures are
		reported with print; failures yield an empty list instead of raising.
		"""

		# Seconds to wait for a response before the request counts as failed
		TIMEOUT = 30

		def __init__(self) :
			return

		def _fetch(self, url) :
			"""Download `url` and return its decoded body, or None on failure.

			A failure is a connection error or timeout, a status code other
			than 200, or an empty body.
			"""
			try :
				r = requests.get(url, timeout=self.TIMEOUT)
			except requests.RequestException :
				return None

			if r.status_code != 200 or not r.content :
				return None

			# Decoded text, so the str patterns below work on Python 2 and 3
			return r.text

		def catalog(self, section) :
			"""Return the ids (strings) of all threads listed in a board's catalog.

			section -- board name without slashes, e.g. 'g'

			Returns an empty list if the page cannot be downloaded or parsed.
			"""
			print('Grabbing catalog for /'+section+'/ ..')

			# Build Url and download page
			page = self._fetch('https://boards.4chan.org/'+section+'/catalog')

			# Error handling
			if page is None :
				print('Failed to read catalog. Are you sure that there is a /'+section+'/ section?')
				return []

			threads = self.parse_catalog(page)

			print('Found '+str(len(threads))+' threads')

			return threads

		def parse_catalog(self, html) :
			"""Extract thread ids from the HTML of a catalog page.

			The page embeds its data as `var catalog = {...};`. The keys of
			the 'threads' object inside that JSON are the thread ids.

			Returns an empty list when the variable is missing or its value
			is not the expected JSON.
			"""
			m = re.findall(r'var catalog = (.*)?};', html)

			if not m :
				return []

			try :
				# The pattern consumes the closing brace, so put it back
				d = json.loads(m[0]+'}')
			except ValueError :
				return []

			if not isinstance(d, dict) or not isinstance(d.get('threads'), dict) :
				return []

			return list(d['threads'])

		def thread(self, section, id) :
			"""Return the image urls found in one thread, without duplicates.

			section -- board name without slashes, e.g. 'g'
			id      -- thread id as a string, as returned by `catalog`

			The urls are protocol-relative ('//i.4cdn.org/...'). Returns an
			empty list if the page cannot be downloaded.
			"""
			name = '/'+section+'/thread/'+id+'/'

			print('Grabbing thread '+name)

			# Build Url and download page
			page = self._fetch('https://boards.4chan.org'+name)

			# Error handling
			if page is None :
				print('Failed to read thread '+name)
				return []

			images = self.parse_thread(page)

			# Counted after removing duplicates, a page can link an image more than once
			print('Found '+str(len(images))+' images in thread '+name)

			return images

		def parse_thread(self, html) :
			"""Extract the unique image urls from the HTML of a thread page.

			Matches links to i.4cdn.org files named <digits>.<ext> where ext
			is jpg, jpeg, png, gif or webm. Board names may contain digits
			(e.g. 'r9k'). Order of first appearance is kept.
			"""
			# Plain | alternation: an escaped \| would only match a literal pipe
			m = re.findall(r'href=".*?(\/\/i\.4cdn\.org\/[a-z0-9]+\/[0-9]+\.(jpg|jpeg|png|gif|webm))"', html)

			# findall returns (url, extension) tuples, only the url is wanted
			return self.uniq([i[0] for i in m])

		# Remove duplicate elements in list
		def uniq(self, seq) :
			"""Return the elements of `seq` in order, keeping only the first
			occurrence of each. Elements must be hashable."""
			seen = set()
			result = []

			for x in seq :
				if x not in seen :
					seen.add(x)
					result.append(x)

			return result

	# Command line entry point: 4chan.py <section>
	# Downloads every image of every thread in the section's catalog into
	# <section>/<thread id>/<file name>, relative to the working directory.
	# NOTE(review): this repeats the entry point that appears earlier in the
	# file; confirm whether both copies are meant to stay.
	if __name__ == '__main__':
		import sys

		# Input validation
		if len(sys.argv) < 2 or not sys.argv[1].strip('/') :
			print('4chan.py <section>')
			print('No section given')
			# Non-zero status so callers can tell the run did nothing
			sys.exit(1)

		# Accept both 'g' and '/g/'
		section = sys.argv[1].strip('/')

		chan = fchan()

		# Get all threads from the catalog
		threads = chan.catalog(section)

		# Create section folder if it doesn't exist
		if not os.path.isdir(section) :
			os.makedirs(section)

		# Iterate all threads and download images
		for thread in threads :
			# Get images
			images = chan.thread(section, thread)

			# Create image folder if it doesn't exist
			folder = section+'/'+thread

			if not os.path.isdir(folder) :
				os.makedirs(folder)

			# Iterate images list and download them
			for image in images :
				# Plain | alternation: an escaped \| would only match a literal pipe
				match = re.search(r'\/([0-9]+\.(jpg|jpeg|png|gif|webm))$', image)

				# Skip anything that does not end in a recognised file name
				if not match :
					continue

				filename = folder+'/'+match.group(1)

				print('Downloading /'+filename)

				# Download image, one failure must not abort the whole run
				try :
					q = requests.get('https:'+image, stream=True, timeout=30)
				except requests.RequestException :
					print('Failed to download /'+filename)
					continue

				try :
					if q.status_code == 200 :
						with open(filename, 'wb') as f :
							# Let urllib3 undo gzip/deflate while copying
							q.raw.decode_content = True
							shutil.copyfileobj(q.raw, f)
					else :
						# Do not save an error page under an image name
						print('Failed to download /'+filename)
				except requests.RequestException :
					print('Failed to download /'+filename)
				finally :
					# Streamed responses hold their connection until closed
					q.close()