Last active
August 4, 2019 16:06
-
-
Save m3philis/7649961 to your computer and use it in GitHub Desktop.
Little downloader for konachan.com. You can download every image on that site, or set tags to download only the pics matching those tags. I appreciate ideas and bug reports :)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python | |
from __future__ import print_function | |
import re | |
import os | |
import os.path | |
import sys | |
import http.client | |
import urllib.request | |
import time | |
# regexes
# FIX: use raw strings — "\." in a plain string literal is an invalid
# escape sequence (DeprecationWarning, later SyntaxWarning). Patterns
# themselves are unchanged.
url_regex = re.compile(r"konachan.com/image/.+?/.+?\.(?:png|jpg)")
name_regex = re.compile(r"image/.*?/(.*)")
# variables
# counter1/counter2 label the current "Pics X - Y" bucket directory;
# a new bucket is started every 15000 downloads (see directory_size()).
counter1, counter2 = 0, 15000
tag_filter = None  # currently unused; reserved for future filtering
# shared HTTP connection for the result-page requests (connecting is lazy,
# so constructing this does not touch the network yet)
domain = http.client.HTTPConnection("konachan.com")
# little function to calculate the last page of search results | |
def page_count(): | |
# open connection to konachan.com | |
domain = http.client.HTTPConnection("konachan.com") | |
domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+")) | |
while True: | |
try: | |
first_page = domain.getresponse() | |
break | |
except http.client.BadStatusLine: | |
time.sleep(1) | |
domain.close() | |
domain = http.client.HTTPConnection("konachan.com") | |
domain.request("GET", "/post?page=1&tags=" + tags.replace(" ", "+")) | |
# we got our response, now it's time to find that number | |
first_page_source = str(first_page.read()) | |
page_list = first_page_source.split("Next Page") | |
number = 0 | |
for line in page_list: | |
if re.search("(?<=\/post\?page\=)\d+", line): | |
number = re.search("(?<=\/post\?page\=)\d+", line).group(0) | |
else: | |
number = 2 | |
return int(number) | |
# we don't want to save every picture in one directory. | |
# so we create a new directory when we donwloaded 15k pics | |
# we don't want to save every picture in one directory.
# so we create a new directory when we downloaded 15k pics
def directory_size(directory_intern):
    """Roll over to a fresh "Pics X - Y" directory once the current one
    holds 15000 files.

    Args:
        directory_intern (str): path of the pics directory to check.

    BUG FIX: the original mutated `counter1`/`counter2` and rebound
    `directory` without a `global` statement — the `+=` raised
    UnboundLocalError the first time a directory filled up, and the
    module-level `directory` used by the download loop was never updated.
    """
    global counter1, counter2, directory
    if len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skip creation")
        else:
            os.makedirs(directory, 0o755, False)
            # NOTE(review): this chdir("..") leaves the tags directory
            # whenever a new pics directory is created — looks unintended,
            # but kept to preserve the original control flow. TODO confirm.
            os.chdir("..")
# now we start
# user has to set path for pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline()
# set tags, if user want to download specific pictures
print("Set Tags (seperate multiple tags with a whitespace;" +
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")
# chdir in $path and create directory if it not exists
if not os.path.isdir(path.rstrip()):
    os.makedirs(path.rstrip(), 0o755, True)
os.chdir(path.rstrip())
if not os.path.isdir("Tags: " + tags):
    os.makedirs("Tags: " + tags, 0o755, True)
os.chdir("Tags: " + tags)
# creating directory for pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
if not os.path.isdir(directory):
    os.makedirs(directory, 0o755, True)
# let's start with downloading
# BUG FIX: range(1, page_count()) skipped the last result page; the +1
# makes the range inclusive of the final page number.
for page_number in range(1, page_count() + 1):
    print("Starting download in page " + str(page_number))
    domain.request("GET", "/post?page=" + str(page_number) +
                   "&tags=" + tags.replace(" ", "+"))
    # retry until the server answers with a parseable status line,
    # rebuilding the shared connection on each failure
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", "/post?page=" + str(page_number) +
                           "&tags=" + tags.replace(" ", "+"))
            time.sleep(1)
    # after we got the response from konachan we need the source code
    index_page_source = str(index_page.read())
    # and now we need save every link on this page in a list
    pics_list = index_page_source.split("Post.register")
    directory_size(directory)
    # now we can search every line for the pic link
    for pic in pics_list:
        # the scraped source contains doubled backslashes; strip them
        # before matching the image URL
        pic_url = url_regex.search(re.sub("\\\\\\\\", "", pic))
        # if we found the url we download the pic
        # but with whitespaces instead of "%20"
        if pic_url:
            name = name_regex.search(pic_url.group(0)).group(1)
            print(" Downloading pic: " + name.replace("%20", " ") +
                  " in directory: " + directory)
            # a little check if pic already exists in any pics directory
            existance = False
            for subdir in os.listdir():  # renamed: `dir` shadowed a builtin
                # ROBUSTNESS FIX: skip plain files — the original
                # os.chdir() crashed on any non-directory entry in cwd
                if not os.path.isdir(subdir):
                    continue
                os.chdir(subdir)
                if os.path.isfile(name.replace("%20", " ")):
                    print(" Pic is already on your pc! Skip!")
                    existance = True
                os.chdir("..")
            if not existance:
                os.chdir(directory)
                # FIX: urllib.request.URLopener has been deprecated since
                # Python 3.3 (and later removed); urlretrieve performs the
                # same download-to-file operation
                urllib.request.urlretrieve(
                    "http://" + pic_url.group(0),
                    urllib.request.url2pathname(name))
                print(" Download finished")
                os.chdir("..")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment