Skip to content

Instantly share code, notes, and snippets.

@m3philis
Last active August 4, 2019 16:06
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m3philis/7649961 to your computer and use it in GitHub Desktop.
Save m3philis/7649961 to your computer and use it in GitHub Desktop.
A small downloader for konachan.com. You can download every image on the site, or set tags to download only the pictures that have those tags. Ideas and bug reports are appreciated :)
#! /usr/bin/python
from __future__ import print_function
import re
import os
import os.path
import sys
import http.client
import urllib.request
import time
# regexes
# Raw strings are used so that regex escapes such as "\." reach the regex
# engine unchanged instead of being (invalid) string escapes.
# url_regex finds a picture link of the form
#   konachan.com/image/<something>/<something>.png|jpg
url_regex = re.compile(r"konachan\.com/image/.+?/.+?\.(?:png|jpg)")
# name_regex captures everything after "image/<something>/" (the file name)
name_regex = re.compile(r"image/.*?/(.*)")
# variables
# counter1/counter2 form the numeric range in the picture directory name
# ("Pics <counter1> - <counter2>")
counter1, counter2 = 0, 15000
# NOTE(review): tag_filter is not referenced anywhere in the visible script
tag_filter = None
# connection object used for fetching the search result pages
domain = http.client.HTTPConnection("konachan.com")
# little function to calculate the last page of search results
def page_count():
    """Return the page number used as the end of the download loop.

    Requests the first search result page for the module-level ``tags``
    string (spaces replaced by "+"), splits the page text at every
    "Next Page" marker and looks for a "/post?page=<n>" link in each piece.
    The last piece decides the result: its first page number if it contains
    such a link, otherwise 2.

    NOTE(review): the script iterates ``range(1, page_count())``, so the
    returned page itself is not downloaded -- confirm that this is intended.

    Returns:
        int: the page number described above.

    Raises:
        Whatever ``http.client`` raises for connection problems other than
        ``BadStatusLine`` (which is retried once per second, without limit).
    """
    request_path = "/post?page=1&tags=" + tags.replace(" ", "+")
    # raw string: same pattern as before, without invalid string escapes
    page_link = re.compile(r"(?<=/post\?page=)\d+")

    # open connection to konachan.com
    connection = http.client.HTTPConnection("konachan.com")
    try:
        connection.request("GET", request_path)
        while True:
            try:
                first_page = connection.getresponse()
                break
            except http.client.BadStatusLine:
                # wait a moment, then retry on a fresh connection
                time.sleep(1)
                connection.close()
                connection = http.client.HTTPConnection("konachan.com")
                connection.request("GET", request_path)
        # decode the body instead of parsing the repr of a bytes object
        first_page_source = first_page.read().decode("utf-8", errors="replace")
    finally:
        # the connection is only needed for this single request
        connection.close()

    # we got our response, now it's time to find that number
    number = 0
    for chunk in first_page_source.split("Next Page"):
        match = page_link.search(chunk)
        number = match.group(0) if match else 2
    return int(number)
# we don't want to save every picture in one directory.
# so we create a new directory when we downloaded 15k pics
def directory_size(directory_intern):
    """Switch to a fresh picture directory once the current one is full.

    While ``directory_intern`` holds 15000 or more entries, the module-level
    ``counter1``/``counter2`` are advanced by 15000, the module-level
    ``directory`` is set to "Pics <counter1> - <counter2>" and that directory
    is created if it does not exist yet. The check is repeated on the new
    directory so that already-full directories are skipped.

    The working directory is left untouched: the directory names are
    relative, so changing it here would invalidate them for the caller.

    Args:
        directory_intern: path of the directory whose entries are counted.

    Returns:
        None. The result is published through the module-level ``directory``.
    """
    # the counters and the active directory are module state; without this
    # declaration the "+=" below raises UnboundLocalError
    global counter1, counter2, directory
    while len(os.listdir(directory_intern)) >= 15000:
        print("Directory " + directory_intern + " full")
        counter1 += 15000
        counter2 += 15000
        directory = "Pics " + str(counter1) + " - " + str(counter2)
        if os.path.isdir(directory):
            print("Directory already exists; skip creation")
        else:
            os.makedirs(directory, 0o755)
        # re-check the directory we just switched to
        directory_intern = directory
# now we start
# user has to set path for pictures
print("Please set download location (full path required): ")
path = sys.stdin.readline()
# the line read from stdin still ends with a newline; use a cleaned copy
download_root = path.rstrip()
if not download_root:
    # an empty path would otherwise fail inside os.makedirs("")
    sys.exit("No download location given")

# set tags, if user wants to download specific pictures
print("Set Tags (seperate multiple tags with a whitespace;" +
      " connect tags with more than one word with an underscore): ")
tags = sys.stdin.readline().strip("\n")

# chdir into the download location and create it if it does not exist
os.makedirs(download_root, 0o755, exist_ok=True)
os.chdir(download_root)
tag_directory = "Tags: " + tags
os.makedirs(tag_directory, 0o755, exist_ok=True)
os.chdir(tag_directory)

# creating directory for pics
directory = "Pics " + str(counter1) + " - " + str(counter2)
os.makedirs(directory, 0o755, exist_ok=True)

# the tag part of the request is the same for every page
tag_query = tags.replace(" ", "+")

# let's start with downloading
# NOTE(review): range() stops before page_count(); confirm that page_count()
# returns one past the last page, otherwise the final page is skipped
for page_number in range(1, page_count()):
    print("Starting download in page " + str(page_number))
    page_path = "/post?page=" + str(page_number) + "&tags=" + tag_query
    domain.request("GET", page_path)
    while True:
        try:
            index_page = domain.getresponse()
            break
        except http.client.BadStatusLine:
            # retry on a fresh connection after a short pause
            domain.close()
            domain = http.client.HTTPConnection("konachan.com")
            domain.request("GET", page_path)
            time.sleep(1)

    # after we got the response from konachan we need the source code;
    # decode the body instead of working on the repr of a bytes object
    index_page_source = index_page.read().decode("utf-8", errors="replace")

    # give directory_size() the chance to move on to a new directory
    # before this page is processed
    directory_size(directory)

    # split the page at every "Post.register" marker and search each
    # piece for a picture link
    for pic in index_page_source.split("Post.register"):
        # backslashes are removed before matching (presumably escaped
        # slashes inside the embedded data -- TODO confirm)
        pic_url = url_regex.search(pic.replace("\\", ""))
        if not pic_url:
            continue

        name = name_regex.search(pic_url.group(0)).group(1)
        # one local file name for both the existence check and the
        # download, so the two can never disagree about percent-escapes
        filename = urllib.request.url2pathname(name)
        print(" Downloading pic: " + filename +
              " in directory: " + directory)

        # a little check if pic already exists in any directory here;
        # joining paths avoids chdir-ing into entries that are not
        # directories
        already_there = any(
            os.path.isfile(os.path.join(entry, filename))
            for entry in os.listdir()
        )
        if already_there:
            print(" Pic is already on your pc! Skip!")
            continue

        urllib.request.urlretrieve("http://" + pic_url.group(0),
                                   os.path.join(directory, filename))
        print(" Download finished")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment