arav97531/ibdl-4chan.py

## ibdl-4chan.py
#!/usr/bin/env python3
"""Image board downloader for 4chan.org.
Downloads all the files from a specified thread.
Now multithreaded!
The number of threads is cpu_count * 4 by default; when the thread has fewer
posts than that, correspondingly fewer threads are started.
"""

__version__ = "1.1.0"
__author__ = "Arav"
__email__ = "me@arav.icu"
__copyright__ = "Copyright © 2017,2018 Arav <me@aravsr.tk>"
__license__ = """
This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://www.wtfpl.net/ for more details.
"""


from argparse import ArgumentParser
from json import loads
from os import makedirs, cpu_count
from os.path import exists, join
from re import findall
from sys import argv
from urllib.request import urlopen, urlretrieve
from threading import Thread, current_thread


# Matches a thread URL; the two groups capture the board short name and the
# thread number.
URL_REGEXP = r"https?:\/\/boards\.4chan\.org\/(.+)\/thread\/(\d+)"
# URL templates for a thread's JSON description and for an attached file.
URL_THREAD_JSON = "https://a.4cdn.org/{board}/thread/{thread}.json"
URL_FILE_CDN = "https://i.4cdn.org/{board}/{filename}"

# Upper bound on the number of download threads.  os.cpu_count() returns None
# when the number of CPUs cannot be determined, so fall back to one CPU
# instead of failing with a TypeError at import time.
THREAD_COUNT = (cpu_count() or 1) * 4


def fix_filename(filename):
    """Return *filename* with each ':', '?', '/' and backslash replaced by '_'."""
    replacements = str.maketrans(":?/\\", "____")
    return filename.translate(replacements)

def process_posts(posts, board, save_directory):
    """Download the file attached to each post in *posts*.

    Posts without a 'tim' key carry no file and are skipped, as are files
    whose destination already exists in *save_directory*.  A download that
    fails is reported and skipped so that the remaining posts are still
    processed.

    Args:
        posts: post dictionaries taken from the thread's JSON description.
        board: short name of the board, used to build the file URL.
        save_directory: existing directory the files are written to.
    """
    # Imported here so the block does not depend on the file-level imports.
    from os import remove, replace

    worker_name = current_thread().name
    total = len(posts)
    print(worker_name, "is processing", total, "posts.")

    for i, post in enumerate(posts, start=1):
        if 'tim' not in post:
            continue

        # 'tim' + 'ext' is the name on the CDN; 'filename' is the name the
        # file is stored under locally.
        filename = str(post['tim']) + post['ext']
        if 'filename' in post:
            actual_filename = str(fix_filename(post['filename'])) + post['ext']
        else:
            actual_filename = filename

        destination = join(save_directory, actual_filename)
        if exists(destination):
            continue

        print(worker_name, "[" + str(i) + "/" + str(total) + "] Downloading", actual_filename)

        # Download under a temporary, per-post name and rename afterwards, so
        # an interrupted transfer never leaves a truncated file that a later
        # run would mistake for a finished download.
        partial_path = join(save_directory, filename + ".part")
        try:
            urlretrieve(URL_FILE_CDN.format(board=board, filename=filename),
                        partial_path)
            replace(partial_path, destination)
        except OSError as err:
            # URLError, HTTPError and ContentTooShortError are OSError
            # subclasses.  One bad file must not stop this worker.
            print(worker_name, "failed to download", actual_filename + ":", err)
            try:
                remove(partial_path)
            except OSError:
                pass  # nothing was written, so there is nothing to clean up

    print(worker_name, "is done its work.")


def print_version():
    """Print the program name, version, copyright and license, then exit.

    Raises:
        SystemExit: always, once the text has been printed.
    """
    print("Image board downloader for 4chan.org",
          "Version " + __version__,
          __copyright__,
          __license__,
          sep="\n")
    # `exit` is injected by the `site` module for interactive sessions and is
    # not guaranteed to exist (e.g. under `python -S`); raising SystemExit
    # ends the program the same way without that dependency.
    raise SystemExit


def main():
    """Parse the command line and download every file of the requested thread.

    The thread is given either as a single URL or as a board short name
    followed by a thread number.  Files are stored in
    ``<dest-dir>/4chan/<board>/<thread number>`` (relative to the current
    directory when -D is not given), together with a ``! OP.txt`` file
    holding the subject and comment of the first post.  The posts are spread
    over at most ``THREAD_COUNT`` worker threads.

    Raises:
        SystemExit: after printing the version (-v), or with status 2 when a
            single argument is given that is not a recognisable thread URL.
    """
    parser = ArgumentParser(
        prog="ibdl-4chan",
        description="Downloads all the files from a specified thread.",
        epilog="It stores content in directory 4chan/<board>/<thread number> "
               "within current by default. With -D option "
               "you may change destination directory.")
    parser.add_argument("-v", "--version", action="store_true",
                        help="show version of the program and exit")
    parser.add_argument("-D", "--dest-dir",
                        help="specifies destination directory, uses current otherwise")
    # nargs='*' rather than '+': with '+' argparse rejects a bare `-v`, and
    # the "no thread given" branch below could never be reached.
    parser.add_argument("thread", nargs='*',
                        help="thread's URL or <board short name> <thread number> (e.g. wg 1234567)")
    args = parser.parse_args()

    if args.version:
        print_version()

    if not args.thread:
        parser.print_help()
        return

    if len(args.thread) == 1:
        # findall() needs the URL string itself, not the one-element list.
        matches = findall(URL_REGEXP, args.thread[0])
        if not matches:
            parser.error("'" + args.thread[0] + "' is not a thread URL")
        board, thread_number = matches[0]
    else:
        board, thread_number = args.thread[0], args.thread[1]

    try:
        with urlopen(URL_THREAD_JSON.format(board=board, thread=thread_number)) as url:
            thread = loads(url.read().decode('utf-8'))['posts']
    except Exception:
        # Any failure to fetch or decode the thread is reported the same way,
        # but KeyboardInterrupt and SystemExit are no longer swallowed.
        thread = None

    if not thread:
        print("Thread is no longer exist or never existed.")
        return

    print("There is", len(thread), "posts to process. Working on it.")

    if args.dest_dir:
        save_directory = join(args.dest_dir, '4chan', board, thread_number)
    else:
        save_directory = join('4chan', board, thread_number)

    makedirs(save_directory, exist_ok=True)

    # Keep the subject and comment of the first post next to the files.
    opening_post = thread[0]
    op_filename = join(save_directory, '! OP.txt')
    with open(op_filename, 'w', encoding='utf-8') as t:
        if 'sub' in opening_post:
            t.write(opening_post['sub'])
        if 'com' in opening_post:
            t.write('\n' + opening_post['com'])

    # Worker i handles posts i, i + THREAD_COUNT, i + 2 * THREAD_COUNT, ...
    workers = []
    for i in range(THREAD_COUNT):
        chunk = thread[i::THREAD_COUNT]
        if not chunk:
            # Fewer posts than threads: every later slice is empty as well.
            break
        worker = Thread(target=process_posts, args=(chunk, board, save_directory))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()


# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""Image board downloader for 4chan.org.
	Downloads all the files from a specified thread.
	Now multithreaded!
	The number of threads is cpu_count * 4 by default; when the thread has fewer
	posts than that, correspondingly fewer threads are started.
	"""

	__version__ = "1.1.0"
	__author__ = "Arav"
	__email__ = "me@arav.icu"
	__copyright__ = "Copyright © 2017,2018 Arav <me@aravsr.tk>"
	__license__ = """
	This program is free software. It comes without any warranty, to
	the extent permitted by applicable law. You can redistribute it
	and/or modify it under the terms of the Do What The Fuck You Want
	To Public License, Version 2, as published by Sam Hocevar. See
	http://www.wtfpl.net/ for more details.
	"""


	from argparse import ArgumentParser
	from json import loads
	from os import makedirs, cpu_count
	from os.path import exists, join
	from re import findall
	from sys import argv
	from urllib.request import urlopen, urlretrieve
	from threading import Thread, current_thread


	# Matches a thread URL; the two groups capture the board short name and the
	# thread number.
	URL_REGEXP = r"https?:\/\/boards\.4chan\.org\/(.+)\/thread\/(\d+)"
	# URL templates for a thread's JSON description and for an attached file.
	URL_THREAD_JSON = "https://a.4cdn.org/{board}/thread/{thread}.json"
	URL_FILE_CDN = "https://i.4cdn.org/{board}/{filename}"

	# Upper bound on the number of download threads.  os.cpu_count() returns None
	# when the number of CPUs cannot be determined, so fall back to one CPU
	# instead of failing with a TypeError at import time.
	THREAD_COUNT = (cpu_count() or 1) * 4


	def fix_filename(filename):
	    """Return *filename* with each ':', '?', '/' and backslash replaced by '_'."""
	    # The body had lost its indentation, which made the definition a
	    # syntax error; the structure is restored here.
	    replacements = str.maketrans(":?/\\", "____")
	    return filename.translate(replacements)

	def process_posts(posts, board, save_directory):
	    """Download the file attached to each post in *posts*.

	    Posts without a 'tim' key carry no file and are skipped, as are files
	    whose destination already exists in *save_directory*.  A download that
	    fails is reported and skipped so that the remaining posts are still
	    processed.

	    Args:
	        posts: post dictionaries taken from the thread's JSON description.
	        board: short name of the board, used to build the file URL.
	        save_directory: existing directory the files are written to.
	    """
	    # Imported here so the block does not depend on the file-level imports.
	    from os import remove, replace

	    worker_name = current_thread().name
	    total = len(posts)
	    print(worker_name, "is processing", total, "posts.")

	    for i, post in enumerate(posts, start=1):
	        if 'tim' not in post:
	            continue

	        # 'tim' + 'ext' is the name on the CDN; 'filename' is the name the
	        # file is stored under locally.
	        filename = str(post['tim']) + post['ext']
	        if 'filename' in post:
	            actual_filename = str(fix_filename(post['filename'])) + post['ext']
	        else:
	            actual_filename = filename

	        destination = join(save_directory, actual_filename)
	        if exists(destination):
	            continue

	        print(worker_name, "[" + str(i) + "/" + str(total) + "] Downloading", actual_filename)

	        # Download under a temporary, per-post name and rename afterwards, so
	        # an interrupted transfer never leaves a truncated file that a later
	        # run would mistake for a finished download.
	        partial_path = join(save_directory, filename + ".part")
	        try:
	            urlretrieve(URL_FILE_CDN.format(board=board, filename=filename),
	                        partial_path)
	            replace(partial_path, destination)
	        except OSError as err:
	            # URLError, HTTPError and ContentTooShortError are OSError
	            # subclasses.  One bad file must not stop this worker.
	            print(worker_name, "failed to download", actual_filename + ":", err)
	            try:
	                remove(partial_path)
	            except OSError:
	                pass  # nothing was written, so there is nothing to clean up

	    print(worker_name, "is done its work.")


	def print_version():
	    """Print the program name, version, copyright and license, then exit.

	    Raises:
	        SystemExit: always, once the text has been printed.
	    """
	    print("Image board downloader for 4chan.org",
	          "Version " + __version__,
	          __copyright__,
	          __license__,
	          sep="\n")
	    # `exit` is injected by the `site` module for interactive sessions and is
	    # not guaranteed to exist (e.g. under `python -S`); raising SystemExit
	    # ends the program the same way without that dependency.
	    raise SystemExit


	def main():
	    """Parse the command line and download every file of the requested thread.

	    The thread is given either as a single URL or as a board short name
	    followed by a thread number.  Files are stored in
	    ``<dest-dir>/4chan/<board>/<thread number>`` (relative to the current
	    directory when -D is not given), together with a ``! OP.txt`` file
	    holding the subject and comment of the first post.  The posts are spread
	    over at most ``THREAD_COUNT`` worker threads.

	    Raises:
	        SystemExit: after printing the version (-v), or with status 2 when a
	            single argument is given that is not a recognisable thread URL.
	    """
	    parser = ArgumentParser(
	        prog="ibdl-4chan",
	        description="Downloads all the files from a specified thread.",
	        epilog="It stores content in directory 4chan/<board>/<thread number> "
	               "within current by default. With -D option "
	               "you may change destination directory.")
	    parser.add_argument("-v", "--version", action="store_true",
	                        help="show version of the program and exit")
	    parser.add_argument("-D", "--dest-dir",
	                        help="specifies destination directory, uses current otherwise")
	    # nargs='*' rather than '+': with '+' argparse rejects a bare `-v`, and
	    # the "no thread given" branch below could never be reached.
	    parser.add_argument("thread", nargs='*',
	                        help="thread's URL or <board short name> <thread number> (e.g. wg 1234567)")
	    args = parser.parse_args()

	    if args.version:
	        print_version()

	    if not args.thread:
	        parser.print_help()
	        return

	    if len(args.thread) == 1:
	        # findall() needs the URL string itself, not the one-element list.
	        matches = findall(URL_REGEXP, args.thread[0])
	        if not matches:
	            parser.error("'" + args.thread[0] + "' is not a thread URL")
	        board, thread_number = matches[0]
	    else:
	        board, thread_number = args.thread[0], args.thread[1]

	    try:
	        with urlopen(URL_THREAD_JSON.format(board=board, thread=thread_number)) as url:
	            thread = loads(url.read().decode('utf-8'))['posts']
	    except Exception:
	        # Any failure to fetch or decode the thread is reported the same way,
	        # but KeyboardInterrupt and SystemExit are no longer swallowed.
	        thread = None

	    if not thread:
	        print("Thread is no longer exist or never existed.")
	        return

	    print("There is", len(thread), "posts to process. Working on it.")

	    if args.dest_dir:
	        save_directory = join(args.dest_dir, '4chan', board, thread_number)
	    else:
	        save_directory = join('4chan', board, thread_number)

	    makedirs(save_directory, exist_ok=True)

	    # Keep the subject and comment of the first post next to the files.
	    opening_post = thread[0]
	    op_filename = join(save_directory, '! OP.txt')
	    with open(op_filename, 'w', encoding='utf-8') as t:
	        if 'sub' in opening_post:
	            t.write(opening_post['sub'])
	        if 'com' in opening_post:
	            t.write('\n' + opening_post['com'])

	    # Worker i handles posts i, i + THREAD_COUNT, i + 2 * THREAD_COUNT, ...
	    workers = []
	    for i in range(THREAD_COUNT):
	        chunk = thread[i::THREAD_COUNT]
	        if not chunk:
	            # Fewer posts than threads: every later slice is empty as well.
	            break
	        worker = Thread(target=process_posts, args=(chunk, board, save_directory))
	        worker.start()
	        workers.append(worker)

	    for worker in workers:
	        worker.join()


	# Run the downloader only when this file is executed as a script.
	if __name__ == "__main__":
	    main()