Skip to content

Instantly share code, notes, and snippets.

Created October 13, 2012 16:19
Show Gist options
  • Select an option

  • Save anonymous/3885201 to your computer and use it in GitHub Desktop.

Select an option

Save anonymous/3885201 to your computer and use it in GitHub Desktop.
_booru scraper
#! /usr/local/bin/python3
# Just run it. Supports python2 and python3.
# Only tested on unix, but should work everywhere; nothing complicated going on.
# Saves files to 'current directory / out [site id] [tags searched for, in alphabetic order]'.
# You'll need an account for danbooru, and you can only search 2 tags at a go (with a basic account)
import os
import sys
import xml.etree.ElementTree as ET
from hashlib import sha1
from math import ceil

# urllib was reorganised in python3; alias the two functions we use.
if sys.version_info[0] >= 3:
    from urllib.request import urlopen, urlretrieve
else:
    from urllib import urlopen, urlretrieve  # can we all move on from python2, please?

# py3 input() == py2 raw_input(); normalise under one name.
if sys.version_info[0] >= 3:
    minput = input
else:
    minput = raw_input

site = minput('Enter \'d\' for danbooru, \'g\' for gelbooru: ')
if site == 'd':
    uname = minput('Username: ')
    # danbooru's legacy password_hash auth: sha1 of the salted password.
    passw = 'choujin-steiner--' + minput('Password (don\'t worry, it\'s not saved): ') + '--'
    hpassw = sha1(passw.encode('utf-8')).hexdigest()
    baseURL = 'http://danbooru.donmai.us/post/index.xml?login=' + uname + '&password_hash=' + hpassw
    pagelabel = 'page'  # danbooru page numbers are 1-based
    pageoff = 1
elif site == 'g':
    baseURL = 'http://gelbooru.com/index.php?page=dapi&s=post&q=index'
    pagelabel = 'pid'   # gelbooru page ids are 0-based
    pageoff = 0
else:
    print('I DON\'T UNDERSTAND')
    exit(1)
def sanitize(s):
    """Return *s* with every character outside a filename-safe whitelist replaced by '_'."""
    allowed = ('-_.() abcdefghijklmnopqrstuvwxyz'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    cleaned = []
    for ch in s:
        if ch in allowed:
            cleaned.append(ch)
        else:
            cleaned.append('_')
    return ''.join(cleaned)
tags = minput('Enter space-delimited tags to search for: ').split()
tags.sort()  # sorted so the output directory name is deterministic for a tag set
perpage = 20   # posts requested per API page
barwidth = 40  # progress-bar width in characters
dir = sanitize('out ' + site + ' ' + ' '.join(tags))
tags = '+'.join(tags)
if not os.path.exists(dir):
    os.makedirs(dir)
# Just get one so we know the count.
req = urlopen(baseURL + '&tags={}&limit=1'.format(tags))
root = ET.parse(req).getroot()
total = int(root.attrib['count'])
# Always precisely width 4, suffix width 2.
def humanize_bytes(num_bytes):
    """Format a byte count as (4-char number string, divisor, 2-char suffix).

    e.g. 1536 -> ('1.50', 1024, 'kB').  Non-positive input yields
    ('0.00', 1, ' B') instead of the original's silent None (which made
    the caller crash unpacking the result for zero-length files).
    """
    abbrevs = (
        (1 << 50, 'PB'),
        (1 << 40, 'TB'),
        (1 << 30, 'GB'),
        (1 << 20, 'MB'),
        (1 << 10, 'kB'),
        (1, ' B'),
    )
    for factor, suffix in abbrevs:
        if num_bytes >= factor:
            # float() forces true division under python2 as well.
            k = '{:f}'.format(float(num_bytes) / factor)[:4]
            return k, factor, suffix
    # num_bytes < 1: fall through to the smallest unit.
    return '0.00', 1, ' B'
def dlProgress(count, blockSize, totalSize):
    """urlretrieve reporthook: redraw an in-place progress bar on stdout.

    count/blockSize/totalSize arrive straight from urlretrieve.  The bar
    occupies barwidth columns plus a 13-column ' nnnn/nnnnSS' byte tally.
    """
    if totalSize <= 0:
        # Server sent no Content-Length (urlretrieve passes -1); avoid the
        # ZeroDivisionError / nonsense bar the original produced.
        return
    done = min(count * blockSize, totalSize)  # last block may overshoot
    ticks = int(done * barwidth / totalSize)
    # Backspace over the previous rendering, then redraw bar + tally.
    outstr = ('\b' * (barwidth + 13)) + ('-' * ticks) + (' ' * (barwidth - ticks)) + ']'
    t, factor, suffix = humanize_bytes(totalSize)
    # float() keeps true division under python2 too.
    k = '{:f}'.format(float(done) / factor)[:4]
    outstr += ' {0}/{1}{2}'.format(k, t, suffix)
    sys.stdout.write(outstr)
    sys.stdout.flush()
def dopage(root):
    """Download every post element under *root*, oldest first, skipping files already on disk."""
    kids = list(root)
    kids.reverse()  # the API returns newest first; we want oldest first
    for child in kids:
        url = child.attrib['file_url']
        extn = url[url.rfind('.'):]
        mtags = child.attrib['tags'].lstrip()
        if len(mtags) > 200:
            # Keep filenames within filesystem limits: cut at the last space
            # inside the first 201 chars.  (The original ran rfind on the
            # UNtruncated string, so a last space past index 201 made the
            # second slice a no-op and nothing was trimmed.)
            clipped = mtags[:201]
            mtags = clipped[:clipped.rfind(' ')]
        filename = dir + '/' + sanitize(child.attrib['id'] + ' ' + mtags + extn)
        if os.path.exists(filename):
            print('Skipping ' + '{0:7s}'.format(child.attrib['id']) + ': already exists.')
        else:
            # Print the label and a blank bar; dlProgress backspaces over it.
            sys.stdout.write('Fetching ' + '{0:7s}'.format(child.attrib['id']) + ': [' + (' ' * (barwidth + 13)))
            sys.stdout.flush()
            urlretrieve(url, filename, reporthook=dlProgress)
            print('')
# Walk result pages from last to first so downloads proceed oldest -> newest.
pages = int(ceil(float(total) / perpage))
for pg in range(pages - 1, -1, -1):  # eg 3 pages: pg 2,1,0
    print('Page {} of {}: ({} images total)'.format(pages - pg, pages, total))
    rurl = baseURL + ('&tags={}&limit={}&' + pagelabel + '={}').format(tags, perpage, pg + pageoff)
    req = urlopen(rurl)
    root = ET.parse(req).getroot()
    dopage(root)
    print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment