Skip to content

Instantly share code, notes, and snippets.

@arav97531
Last active March 2, 2019 19:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arav97531/678270d111680671c43073a1434a87e3 to your computer and use it in GitHub Desktop.
Save arav97531/678270d111680671c43073a1434a87e3 to your computer and use it in GitHub Desktop.
This script downloads all the files from a specified thread on 4chan.org.
#!/usr/bin/env python3
"""Image board downloader for 4chan.org.

Downloads all the files attached to the posts of a specified thread.

Downloads run in several worker threads. At most ``cpu_count * 4`` workers
are started; the posts are dealt out among them, so a thread with fewer
posts than that starts correspondingly fewer workers.
"""
__version__ = "1.1.0"
__author__ = "Arav"
__email__ = "me@arav.icu"
# NOTE(review): the address below differs from __email__ -- confirm which one
# is current. The string is printed by print_version(), so it is left as is.
__copyright__ = "Copyright © 2017,2018 Arav <me@aravsr.tk>"
# Printed verbatim by print_version().
__license__ = """
This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://www.wtfpl.net/ for more details.
"""
from argparse import ArgumentParser
from json import loads
from os import makedirs, cpu_count
from os.path import exists, join
from re import findall
from sys import argv
from urllib.request import urlopen, urlretrieve
from threading import Thread, current_thread
# Matches a thread URL; the two groups capture the board name and the thread
# number.
URL_REGEXP = r"https?:\/\/boards\.4chan\.org\/(.+)\/thread\/(\d+)"
# Template of the JSON document describing every post of a thread.
URL_THREAD_JSON = "https://a.4cdn.org/{board}/thread/{thread}.json"
# Template of the URL a post's attached file is served from.
URL_FILE_CDN = "https://i.4cdn.org/{board}/{filename}"
# Upper bound on the number of download threads. os.cpu_count() returns None
# when the number of CPUs cannot be determined, so fall back to a single CPU
# instead of failing with a TypeError at import time.
THREAD_COUNT = (cpu_count() or 1) * 4
def fix_filename(filename):
    """Return *filename* with characters unsafe in file names replaced by '_'.

    The replaced set is every character Windows reserves in file names
    (``< > : " / \\ | ? *``); '/' is also the path separator on POSIX systems.
    All other characters are kept unchanged.

    Args:
        filename: The name to sanitise (without any directory part).

    Returns:
        The sanitised name, of the same length as the input.
    """
    unsafe = frozenset('<>:"/\\|?*')
    return ''.join('_' if c in unsafe else c for c in filename)
def process_posts(posts, board, save_directory):
    """Download the file attached to every post in *posts*.

    Used as the target of the worker threads started by main(). Progress is
    printed to stdout, prefixed with the name of the current thread.

    A file is saved under the uploader's original name (the post's
    'filename' passed through fix_filename(), plus 'ext') when the post has
    one, otherwise under its CDN name ('tim' plus 'ext'). A file that already
    exists in *save_directory* is not downloaded again.

    Args:
        posts: Post dictionaries taken from the thread JSON. Posts without a
            'tim' key have no attached file and are skipped.
        board: Short board name, used to build the CDN URL.
        save_directory: Existing directory the files are written to.
    """
    from os import remove, replace

    worker_name = current_thread().name
    total = len(posts)
    print(worker_name, "is processing", total, "posts.")
    for i, post in enumerate(posts, start=1):
        if 'tim' not in post:
            continue
        filename = str(post['tim']) + post['ext']
        if 'filename' in post:
            actual_filename = str(fix_filename(post['filename'])) + post['ext']
        else:
            actual_filename = filename
        destination = join(save_directory, actual_filename)
        # NOTE: two posts sharing the same original file name map to the same
        # destination, so only the first one to be saved is kept.
        if exists(destination):
            continue
        print(worker_name, "[" + str(i) + "/" + str(total) + "] Downloading", actual_filename)
        # Download under a temporary name that is unique per post and rename
        # on success, so an interrupted transfer never leaves a truncated
        # file that later runs would mistake for a finished download.
        partial = join(save_directory, filename + ".part")
        try:
            urlretrieve(URL_FILE_CDN.format(board=board, filename=filename), partial)
            replace(partial, destination)
        except OSError as err:
            # URLError, HTTPError and ContentTooShortError all derive from
            # OSError. One bad file must not stop the rest of this worker.
            print(worker_name, "failed to download", actual_filename + ":", err)
            try:
                remove(partial)
            except OSError:
                pass  # best-effort cleanup; nothing was written
    print(worker_name, "is done its work.")
def print_version():
    """Print the program name, version, copyright and license, then exit.

    Raises:
        SystemExit: Always, with exit status 0, after the text is printed.
    """
    print("Image board downloader for 4chan.org\n"
          + "Version " + __version__ + "\n"
          + __copyright__ + "\n"
          + __license__)
    # The built-in exit() is injected by the site module for interactive use
    # and is missing under `python -S` or in frozen builds; raising
    # SystemExit is what it does underneath and is always available.
    raise SystemExit(0)
def main():
    """Parse the command line and download every file of the given thread.

    The thread is given either as a single thread URL or as a board short
    name followed by a thread number. Files are stored in
    ``4chan/<board>/<thread number>`` below the directory given with ``-D``
    (the current directory by default). The subject and comment of the
    opening post are written to ``! OP.txt`` in the same directory.

    Raises:
        SystemExit: After printing the version or the help text (status 0),
            on a malformed thread URL (status 2, via the parser), or when
            the thread cannot be fetched (status 1).
    """
    from http.client import HTTPException

    parser = ArgumentParser(
        prog="ibdl-4chan",
        description="Downloads all the files from a specified thread.",
        epilog="It stores content in directory 4chan/<board>/<thread number> "
               "within current by default. With -D option "
               "you may change destination directory.")
    parser.add_argument("-v", "--version", action="store_true",
                        help="show version of the program and exit")
    parser.add_argument("-D", "--dest-dir",
                        help="specifies destination directory, uses current otherwise")
    # '*' instead of '+': with a mandatory positional argparse rejected a
    # bare `-v` before the version flag was ever looked at, and the help
    # branch below could never be reached.
    parser.add_argument("thread", nargs='*',
                        help="thread's URL or <board short name> <thread number> (e.g. wg 1234567)")
    args = parser.parse_args()

    if args.version:
        print_version()
    if not args.thread:
        parser.print_help()
        raise SystemExit(0)

    if len(args.thread) == 1:
        # args.thread is a list; the pattern has to be applied to its item.
        matches = findall(URL_REGEXP, args.thread[0])
        if not matches:
            parser.error("not a 4chan thread URL: " + args.thread[0])
        board, thread_number = matches[0]
    else:
        board, thread_number = args.thread[:2]

    posts = None
    try:
        with urlopen(URL_THREAD_JSON.format(board=board, thread=thread_number)) as response:
            posts = loads(response.read().decode('utf-8'))['posts']
    except (OSError, HTTPException, ValueError, KeyError):
        # Network/HTTP failure, a body that is not valid UTF-8 JSON, or a
        # document without a 'posts' key.
        posts = None
    if not posts:
        print("Thread is no longer exist or never existed.")
        raise SystemExit(1)

    print("There is", len(posts), "posts to process. Working on it.")

    if args.dest_dir:
        save_directory = join(args.dest_dir, '4chan', board, thread_number)
    else:
        save_directory = join('4chan', board, thread_number)
    makedirs(save_directory, exist_ok=True)

    # Keep the opening post's subject and comment next to the files.
    op_post = posts[0]
    with open(join(save_directory, '! OP.txt'), 'w', encoding='utf-8') as op_file:
        if 'sub' in op_post:
            op_file.write(op_post['sub'])
        if 'com' in op_post:
            op_file.write('\n' + op_post['com'])

    # Deal the posts out round-robin, one slice per worker thread.
    workers = []
    for i in range(THREAD_COUNT):
        chunk = posts[i::THREAD_COUNT]
        if not chunk:
            # i is past the last post, so every later slice is empty too.
            break
        worker = Thread(target=process_posts, args=(chunk, board, save_directory))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
# Start the downloader only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment