Created
October 13, 2012 16:19
-
-
Save anonymous/3885201 to your computer and use it in GitHub Desktop.
_booru scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/local/bin/python3
# Just run it. Supports python2 and python3.
# Only tested on unix, but should work everywhere; nothing complicated going on.
# Saves files to 'current directory / out [site id] [tags searched for, in alphabetic order]'.
# You'll need an account for danbooru, and you can only search 2 tags at a go (with a basic account)
import os
import sys
import xml.etree.ElementTree as ET
from hashlib import sha1
from math import ceil

# Python 2/3 compatibility shims: one version check picks both the right
# urllib entry points and a unified name for the interactive-input builtin.
if sys.version_info[0] >= 3:
    from urllib.request import urlopen, urlretrieve
    minput = input
else:
    from urllib import urlopen, urlretrieve  # can we all move on from python2, please?
    minput = raw_input
# Interactive site selection.  Danbooru's legacy XML API authenticates with a
# salted SHA1 of the password ('choujin-steiner--<pw>--'); gelbooru's dapi
# endpoint needs no login.
# NOTE(review): everything goes out over plain http, so the password hash is
# as good as a password to anyone on the wire -- confirm whether these
# endpoints now require https.
site = minput('Enter \'d\' for danbooru, \'g\' for gelbooru: ')
if site == 'd':
    uname = minput('Username: ')
    passw = 'choujin-steiner--' + minput('Password (don\'t worry, it\'s not saved): ') + '--'
    hpassw = sha1(passw.encode('utf-8')).hexdigest()
    baseURL = 'http://danbooru.donmai.us/post/index.xml?login=' + uname + '&password_hash=' + hpassw
    pagelabel = 'page'  # danbooru page numbers are 1-based
    pageoff = 1
elif site == 'g':
    baseURL = 'http://gelbooru.com/index.php?page=dapi&s=post&q=index'
    pagelabel = 'pid'   # gelbooru page ids are 0-based
    pageoff = 0
else:
    print('I DON\'T UNDERSTAND')
    # fix: bare exit() is injected by the 'site' module and is not guaranteed
    # to exist in every interpreter; sys.exit() is the reliable spelling.
    sys.exit(1)
def sanitize(s):
    """Return *s* with every character outside a filename-safe whitelist replaced by '_'."""
    safe = set('-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    cleaned = []
    for ch in s:
        cleaned.append(ch if ch in safe else '_')
    return ''.join(cleaned)
# Ask for the search tags; sorting makes the output directory name
# deterministic no matter what order the user typed the tags in.
tags = minput('Enter space-delimited tags to search for: ').split()
tags.sort()
perpage = 20    # posts requested per API call
barwidth = 40   # width in characters of the download progress bar
# NOTE(review): 'dir' shadows the builtin of the same name; kept because the
# download code below refers to it by this name.
dir = sanitize('out ' + site + ' ' + ' '.join(tags))
tags = '+'.join(tags)  # rebound: now the URL-ready '+'-joined query string
if not os.path.exists(dir):
    os.makedirs(dir)
# Just get one so we know the count.
req = urlopen(baseURL + '&tags={}&limit=1'.format(tags))
root = ET.parse(req).getroot()
# Both boorus report the total number of matches in the root element's
# 'count' attribute.
total = int(root.attrib['count'])
# Always precisely width 4, suffix width 2.
def humanize_bytes(nbytes):
    """Format a byte count as a (4-char number, factor, 2-char suffix) triple.

    Returns (text, factor, suffix): *text* is nbytes/factor truncated to four
    characters, and *factor* is exposed so the caller can render another value
    (e.g. bytes downloaded so far) on the same scale.
    """
    abbrevs = (
        (1 << 50, 'PB'),
        (1 << 40, 'TB'),
        (1 << 30, 'GB'),
        (1 << 20, 'MB'),
        (1 << 10, 'kB'),
        (1, ' B'),
    )
    for factor, suffix in abbrevs:
        if nbytes >= factor:
            # float() forces true division under python2 as well; without it
            # e.g. 1536/1024 formatted as '1.00' instead of '1.50'.
            k = '{:f}'.format(float(nbytes) / factor)[:4]
            return k, factor, suffix
    # fix: 0 (or a negative count) previously fell off the loop and returned
    # None, which crashed the caller's three-way tuple unpack.
    return '0.00', 1, ' B'
def dlProgress(count, blockSize, totalSize):
    """urlretrieve reporthook: repaint an in-place progress bar on stdout.

    *count*, *blockSize*, *totalSize* are the standard reporthook arguments.
    The caller has already printed barwidth+13 placeholder characters, which
    this backs over with '\\b' before redrawing.
    """
    if totalSize <= 0:
        # fix: when the server sends no Content-Length, urlretrieve reports
        # totalSize as -1 (or 0), which previously raised ZeroDivisionError.
        # Best effort: skip the bar rather than kill the download.
        return
    done = min(count * blockSize, totalSize)
    ticks = int(done * barwidth / totalSize)
    outstr = ('\b' * (barwidth + 13)) + ('-' * ticks) + (' ' * (barwidth - ticks)) + ']'
    t, factor, suffix = humanize_bytes(totalSize)
    # Reuse the total's factor so both numbers share one unit; float() keeps
    # this true division under python2.
    k = '{:f}'.format(float(done) / factor)[:4]
    outstr += ' {0}/{1}{2}'.format(k, t, suffix)
    sys.stdout.write(outstr)
    sys.stdout.flush()
def dopage(root):
    """Download every post element in one XML result page, oldest-first.

    Files are saved as '<id> <tags><ext>' (tags clipped to about 200 chars at
    a word boundary) inside the module-level output directory 'dir'; files
    that already exist are skipped, so interrupted runs are resumable.
    """
    kids = list(root)
    kids.reverse()  # the API returns newest-first; reverse to fetch oldest-first
    for child in kids:
        url = child.attrib['file_url']
        extn = url[url.rfind('.'):]
        mtags = child.attrib['tags'].lstrip()
        if len(mtags) > 200:
            # fix: clip first, then cut back to the last space WITHIN the
            # clipped text.  The old code ran rfind on the full string, whose
            # last space usually lies past index 200, so the word-boundary
            # cut was a no-op and 201-char names slipped through.
            clipped = mtags[:201]
            mtags = clipped[:clipped.rfind(' ')]
        filename = dir + '/' + sanitize(child.attrib['id'] + ' ' + mtags + extn)
        if os.path.exists(filename):
            print('Skipping ' + '{0:7s}'.format(child.attrib['id']) + ': already exists.')
        else:
            sys.stdout.write('Fetching ' + '{0:7s}'.format(child.attrib['id']) + ': [' + (' ' * (barwidth + 13)))
            sys.stdout.flush()
            urlretrieve(url, filename, reporthook=dlProgress)
            print('')
# Walk the result pages from last to first so the overall download order is
# oldest-first, matching dopage()'s within-page ordering.
pages = int(ceil(float(total) / perpage))
for pg in reversed(range(pages)):  # eg 3 pages: pg 2,1,0
    print('Page {} of {}: ({} images total)'.format(pages - pg, pages, total))
    pageurl = baseURL + ('&tags={}&limit={}&' + pagelabel + '={}').format(tags, perpage, pg + pageoff)
    response = urlopen(pageurl)
    dopage(ET.parse(response).getroot())
    print('')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment