# Last active March 2, 2019 19:48
# This script downloads all the files from a specified thread on 4chan.
#!/usr/bin/env python3
"""Image board downloader for 4chan.

Downloads all the files from a specified thread.
Now multithreaded!
The number of threads is cpu_count * 4 by default, but if the thread has fewer
posts than that, correspondingly fewer threads are used.
"""
# Module metadata; printed by print_version().
__version__ = "1.1.0"
__author__ = "Arav"
__email__ = ""
__copyright__ = "Copyright © 2017,2018 Arav <>"
# Standard WTFPL notice. The string must be closed before the imports that
# follow, otherwise the rest of the module is swallowed into the literal.
__license__ = """
This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://www.wtfpl.net/ for more details."""
from argparse import ArgumentParser
from json import loads
from os import makedirs, cpu_count
from os.path import exists, join
from re import findall
from sys import argv
from urllib.request import urlopen, urlretrieve
from threading import Thread, current_thread
# Matches a thread URL and captures (board short name, thread number).
URL_REGEXP = r"https?:\/\/boards\.4chan\.org\/(.+)\/thread\/(\d+)"
# JSON endpoint of a thread. An absolute URL is required: urlopen() rejects
# a template without scheme and host with "unknown url type".
URL_THREAD_JSON = "https://a.4cdn.org/{board}/thread/{thread}.json"
# Location of an attached file; filename is the post's `tim` + `ext`.
URL_FILE_CDN = "https://i.4cdn.org/{board}/{filename}"
# Upper bound on worker threads. cpu_count() may return None when the number
# of CPUs cannot be determined, so fall back to 1 instead of raising TypeError.
THREAD_COUNT = (cpu_count() or 1) * 4
# Maps colon, question mark, slash and backslash to an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys(':?/\\', '_'))


def fix_filename(filename):
    """Return *filename* with path-unsafe characters replaced by ``_``.

    Colon, question mark, forward slash and backslash are each replaced by a
    single underscore; every other character is kept as is.

    Args:
        filename: the file name (without directory) to sanitise.

    Returns:
        The sanitised name, the same length as the input.
    """
    return filename.translate(_UNSAFE_FILENAME_CHARS)
def process_posts(posts, board, save_directory):
    """Download the file attached to each post in *posts*.

    Posts without a ``tim`` key have no attachment and are skipped. A file is
    saved under the name given in the post's ``filename`` key (sanitised with
    ``fix_filename``) plus ``ext`` when that key is present, otherwise under
    the name it is fetched by (``tim`` + ``ext``). Files that already exist in
    *save_directory* are not downloaded again.

    A download that fails with an ``OSError`` (which includes urllib's
    ``URLError``/``HTTPError``) is reported and skipped so that the remaining
    posts are still processed.

    Args:
        posts: sequence of post dicts as found in the thread JSON.
        board: board short name, used to build the file URL.
        save_directory: existing directory the files are written to.
    """
    worker_name = current_thread().name
    total = len(posts)
    print(worker_name, "is processing", total, "posts.")
    for i, post in enumerate(posts, start=1):
        if 'tim' not in post:
            continue
        # Name of the file on the CDN.
        filename = str(post['tim']) + post['ext']
        if 'filename' in post:
            actual_filename = str(fix_filename(post['filename'])) + post['ext']
        else:
            actual_filename = filename
        destination = join(save_directory, actual_filename)
        if exists(destination):
            continue
        print(worker_name, "[" + str(i) + "/" + str(total) + "] Downloading", actual_filename)
        try:
            urlretrieve(URL_FILE_CDN.format(board=board, filename=filename), destination)
        except OSError as err:
            # Keep going: one unavailable file must not abort the whole chunk.
            print(worker_name, "failed to download", actual_filename + ":", err)
    print(worker_name, "is done its work.")
def print_version():
    """Print the program title, version, copyright and license to stdout."""
    print("Image board downloader for",
          "Version " + __version__,
          __copyright__,
          __license__,
          sep="\n")
def main():
    """Parse the command line and download all files of the requested thread.

    The thread is given either as a single thread URL or as two arguments,
    ``<board short name> <thread number>``. Files go to
    ``4chan/<board>/<thread number>`` below the current directory, or below
    the directory given with ``-D``. The subject and comment of the first
    post are written to ``! OP.txt`` in that directory.

    The posts are split round-robin into at most ``THREAD_COUNT`` chunks and
    each non-empty chunk is handled by its own thread running
    ``process_posts``; the function returns once all threads have finished.

    Raises:
        SystemExit: with status 2 on invalid arguments (via argparse), with
            status 1 when the thread JSON cannot be fetched or parsed.
    """
    parser = ArgumentParser(
        description="Downloads all the files from a specified thread.",
        epilog="It stores content in directory 4chan/<board>/<thread number> "
               "within current by default. With -D option "
               "you may change destination directory.")
    parser.add_argument("-v", "--version", action="store_true",
                        help="show version of the program and exit")
    parser.add_argument("-D", "--dest-dir",
                        help="specifies destination directory, uses current otherwise")
    # '*' instead of '+' so that "-v" can be used on its own; a missing
    # thread is rejected explicitly below.
    parser.add_argument("thread", nargs='*',
                        help="thread's URL or <board short name> <thread number> (e.g. wg 1234567)")
    args = parser.parse_args()

    if args.version:
        print_version()
        return

    if not args.thread:
        parser.error("the following arguments are required: thread")
    if len(args.thread) == 1:
        # findall() needs the URL string itself, not the argument list.
        matches = findall(URL_REGEXP, args.thread[0])
        if not matches:
            parser.error("not a valid thread URL: " + args.thread[0])
        board, thread_number = matches[0]
    else:
        board, thread_number = args.thread[0], args.thread[1]

    try:
        with urlopen(URL_THREAD_JSON.format(board=board, thread=thread_number)) as url:
            thread = loads(url.read().decode('utf-8'))['posts']
    except (OSError, ValueError, KeyError):
        # OSError covers URLError/HTTPError; ValueError covers malformed JSON.
        print("Thread is no longer exist or never existed.")
        raise SystemExit(1) from None

    print("There is", len(thread), "posts to process. Working on it.")

    if args.dest_dir:
        save_directory = join(args.dest_dir, '4chan', board, thread_number)
    else:
        save_directory = join('4chan', board, thread_number)
    makedirs(save_directory, exist_ok=True)

    # Keep the opening post's subject and comment next to the files.
    op_filename = join(save_directory, '! OP.txt')
    with open(op_filename, 'w', encoding='utf-8') as t:
        if 'sub' in thread[0]:
            t.write(thread[0]['sub'])
        if 'com' in thread[0]:
            t.write('\n' + thread[0]['com'])

    workers = []
    for i in range(THREAD_COUNT):
        chunk = thread[i::THREAD_COUNT]
        if not chunk:
            # Fewer posts than threads: no work left for this slot.
            continue
        worker = Thread(target=process_posts, args=(chunk, board, save_directory))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
