Last active
March 2, 2019 19:48
-
-
Save arav97531/678270d111680671c43073a1434a87e3 to your computer and use it in GitHub Desktop.
This script downloads all the files from a specified thread on 4chan.org.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Image board downloader for 4chan.org. | |
Downloads all the files from a specified thread. | |
Now multithreaded! | |
The number of threads is cpu_count * 4 by default, but if the number of files
is smaller than that, the number of threads is reduced accordingly.
""" | |
__version__ = "1.1.0" | |
__author__ = "Arav" | |
__email__ = "me@arav.icu" | |
__copyright__ = "Copyright © 2017,2018 Arav <me@aravsr.tk>" | |
__license__ = """ | |
This program is free software. It comes without any warranty, to | |
the extent permitted by applicable law. You can redistribute it | |
and/or modify it under the terms of the Do What The Fuck You Want | |
To Public License, Version 2, as published by Sam Hocevar. See | |
http://www.wtfpl.net/ for more details. | |
""" | |
from argparse import ArgumentParser | |
from json import loads | |
from os import makedirs, cpu_count | |
from os.path import exists, join | |
from re import findall | |
from sys import argv | |
from urllib.request import urlopen, urlretrieve | |
from threading import Thread, current_thread | |
# Matches a thread URL such as https://boards.4chan.org/wg/thread/1234567 and
# captures the board short name and the thread number.
URL_REGEXP = r"https?:\/\/boards\.4chan\.org\/(.+)\/thread\/(\d+)"
# Template of the JSON endpoint that lists the posts of a thread.
URL_THREAD_JSON = "https://a.4cdn.org/{board}/thread/{thread}.json"
# Template of the URL a post's attached file is downloaded from.
URL_FILE_CDN = "https://i.4cdn.org/{board}/{filename}"
# Upper bound on the number of worker threads. cpu_count() returns None when
# the number of CPUs cannot be determined, so fall back to one CPU instead of
# failing with a TypeError at import time.
THREAD_COUNT = (cpu_count() or 1) * 4
# Translation table mapping every character that is a path separator or is
# rejected in file names on Windows (< > : " / \ | ? *) to an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans({c: '_' for c in ':?/\\*"<>|'})


def fix_filename(filename):
    """Return *filename* with characters unsafe in file names replaced by '_'.

    Besides the path separators and ':' / '?', the characters '*', '"', '<',
    '>' and '|' are replaced as well, because a name containing any of them
    cannot be created on Windows.

    Args:
        filename: the file name (without directory) to sanitize.

    Returns:
        A string of the same length with each unsafe character replaced
        by an underscore.
    """
    return filename.translate(_UNSAFE_FILENAME_CHARS)
def process_posts(posts, board, save_directory):
    """Download the files attached to *posts* into *save_directory*.

    A post is skipped when it has no 'tim' key (the value the CDN file name
    is built from) or when its target file already exists. A download that
    fails is reported and skipped, so one broken file does not stop the
    worker from processing the remaining posts.

    Args:
        posts: sequence of post dicts taken from the thread JSON.
        board: board short name, used to build the file URL.
        save_directory: directory the files are written to.
    """
    # Local import: only needed to clean up after a failed download.
    from os import remove

    worker_name = current_thread().name
    total = len(posts)
    print(worker_name, "is processing", total, "posts.")
    for i, post in enumerate(posts, start=1):
        if 'tim' not in post:
            continue
        # Name of the file on the CDN.
        filename = str(post['tim']) + post['ext']
        # Prefer the original upload name for the local copy when present.
        if 'filename' in post:
            actual_filename = str(fix_filename(post['filename'])) + post['ext']
        else:
            actual_filename = filename
        target = join(save_directory, actual_filename)
        if exists(target):
            continue
        print(worker_name, "[" + str(i) + "/" + str(total) + "] Downloading", actual_filename)
        try:
            urlretrieve(URL_FILE_CDN.format(board=board, filename=filename), target)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError, so this covers
            # both network and file system failures.
            print(worker_name, "failed to download", actual_filename + ":", err)
            # Remove any partially written file; otherwise the exists() check
            # above would skip it on the next run.
            try:
                remove(target)
            except OSError:
                pass
    print(worker_name, "is done its work.")
def print_version():
    """Print the program name, version, copyright and license, then exit.

    Raises:
        SystemExit: always, once the information has been printed.
    """
    print("Image board downloader for 4chan.org\n"
          + "Version " + __version__ + "\n"
          + __copyright__ + "\n"
          + __license__)
    # Raise SystemExit directly instead of calling exit(): that helper is
    # installed by the site module for interactive sessions and is missing
    # when the interpreter runs without it (e.g. python -S).
    raise SystemExit
def main():
    """Parse the command line, fetch the thread and download its files.

    The thread is given either as a single URL or as a board short name
    followed by a thread number. Files are stored in
    [<dest-dir>/]4chan/<board>/<thread number>, together with a '! OP.txt'
    file holding the subject and comment of the first post. Posts are split
    across at most THREAD_COUNT worker threads running process_posts().

    Raises:
        SystemExit: after printing the version or the help text, on an
            invalid thread URL, or when the thread cannot be fetched.
    """
    parser = ArgumentParser(
        prog="ibdl-4chan",
        description="Downloads all the files from a specified thread.",
        epilog="It stores content in directory 4chan/<board>/<thread number> "
               "within current by default. With -D option "
               "you may change destination directory.")
    parser.add_argument("-v", "--version", action="store_true",
                        help="show version of the program and exit")
    parser.add_argument("-D", "--dest-dir",
                        help="specifies destination directory, uses current otherwise")
    # nargs='*' rather than '+': with '+' argparse rejects a bare "-v" and
    # the help fallback below could never be reached.
    parser.add_argument("thread", nargs='*',
                        help="thread's URL or <board short name> <thread number> (e.g. wg 1234567)")
    args = parser.parse_args()

    if args.version:
        print_version()

    if not args.thread:
        parser.print_help()
        raise SystemExit

    if len(args.thread) == 1:
        # args.thread is a list; the URL is its only element.
        matches = findall(URL_REGEXP, args.thread[0])
        if not matches:
            # parser.error() prints the usage and exits with status 2.
            parser.error("not a 4chan thread URL: " + args.thread[0])
        board, thread_number = matches[0]
    else:
        board, thread_number = args.thread[0], args.thread[1]

    try:
        with urlopen(URL_THREAD_JSON.format(board=board, thread=thread_number)) as url:
            thread = loads(url.read().decode('utf-8'))['posts']
    except (OSError, ValueError, KeyError) as err:
        # OSError covers urllib's URLError/HTTPError, ValueError covers
        # undecodable or malformed JSON, KeyError a reply without 'posts'.
        # Ctrl-C and SystemExit are deliberately not caught.
        print("Thread is no longer exist or never existed.")
        print("Reason:", err)
        raise SystemExit(1)

    print("There is", len(thread), "posts to process. Working on it.")

    if args.dest_dir:
        save_directory = join(args.dest_dir, '4chan', board, thread_number)
    else:
        save_directory = join('4chan', board, thread_number)
    makedirs(save_directory, exist_ok=True)

    # Keep the subject and comment of the opening post next to the files.
    op_post = thread[0]
    op_filename = join(save_directory, '! OP.txt')
    with open(op_filename, 'w', encoding='utf-8') as t:
        if 'sub' in op_post:
            t.write(op_post['sub'])
        if 'com' in op_post:
            t.write('\n' + op_post['com'])

    # Deal the posts out round-robin; a worker is started only for a
    # non-empty share, so short threads use fewer than THREAD_COUNT workers.
    workers = []
    for i in range(THREAD_COUNT):
        chunk = thread[i::THREAD_COUNT]
        if not chunk:
            continue
        worker = Thread(target=process_posts, args=(chunk, board, save_directory))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()
# Run the downloader only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment