Skip to content

Instantly share code, notes, and snippets.

@HakurouKen
Created April 7, 2016 11:52
Show Gist options
  • Save HakurouKen/e036160d54ed1420e7a2c6b114ae4d16 to your computer and use it in GitHub Desktop.
Save HakurouKen/e036160d54ed1420e7a2c6b114ae4d16 to your computer and use it in GitHub Desktop.
Get the original pixiv picture(s) for an illustration id without logging in.
import requests
import re
from pyquery import PyQuery
from urllib2 import HTTPError
import os
class Picture(object):
    '''
    Get picture(s) from image page.

    The "medium" illustration page is fetched with plain HTTP requests and
    the thumbnail URLs found in it are rewritten into "img-original" URLs.
    @Note: HTML changed after login.
    '''
    PAGE_URL = 'http://www.pixiv.net/member_illust.php?mode={mode}&illust_id={illust_id}'
    MEDIUM_PAGE_URL = PAGE_URL.format(mode='medium', illust_id='{}')
    MANGA_PAGE_URL = PAGE_URL.format(mode='manga', illust_id='{}')
    FULL_PIC_URL = 'http://{domain}/img-original/img/{time}/{id}_p{index}.{suffix}'
    # Groups: 1 = domain, 2 = timestamp path, 3 = illust id, 4 = page index,
    # 5 = file suffix of the thumbnail.
    SMALL_PIC_PATTERN = re.compile(r'https?:\/\/(\w+\.pixiv\.net)\/.*\/(\d{4}\/\d{2}\/\d{2}\/\d{2}\/\d{2}\/\d{2})\/(\d+)_p(\d+)\w+\.(.*)$')
    # Suffixes probed, in this order, when looking for the original file.
    _SUFFIXES = ('png', 'jpg', 'gif')

    def __init__(self, illust_id, multiple=None):
        '''
        @param illust_id: id of the illustration.
        @param multiple: True/False to force multi/single picture handling,
            None to detect it from the page on first use.
        '''
        self.illust_id = illust_id
        self._content = None
        self._headers = {'Referer': 'http://www.pixiv.net/'}
        self.multiple = multiple

    @property
    def images(self):
        '''
        List of original picture urls.
        Detects (and remembers) whether the work has several pictures when
        `multiple` was not given.
        '''
        if self.multiple is None:
            q = self._get_content()
            self.multiple = bool(q('.img-container ._work').hasClass('multiple'))
        if self.multiple:
            return self._get_multiple()
        return self._get_single()

    @property
    def author(self):
        '''Text of the `.userdata .name` element of the page.'''
        q = self._get_content()
        return q('.userdata .name').text()

    @property
    def title(self):
        '''Text of the `.userdata .title` element of the page.'''
        q = self._get_content()
        return q('.userdata .title').text()

    def info(self):
        '''Return a dict with the author, title and picture urls.'''
        return {
            'author': self.author,
            'title': self.title,
            'images': self.images
        }

    def download(self, folder='.'):
        '''
        Save every picture into `folder` as "<author> - <title>[n].<suffix>".
        A picture whose request does not answer 200 is skipped; no file is
        created for it.
        '''
        images = self.images
        author = self.author
        title = self.title
        for i, image in enumerate(images):
            _, suffix = os.path.splitext(image)
            stem = author + ' - ' + title
            if i:
                stem += str(i + 1)
            filename = self._safe_filename(stem + suffix)
            # stream=True so the body is read chunk by chunk by iter_content.
            resp = requests.get(image, headers=self._headers, stream=True)
            try:
                # Check the status first so a failed request leaves no empty file.
                if resp.status_code == 200:
                    with open(os.path.join(folder, filename), 'wb') as f:
                        for chunk in resp.iter_content(1024 * 1024):
                            f.write(chunk)
            finally:
                resp.close()

    @staticmethod
    def _safe_filename(name):
        '''
        Replace path separators so a scraped author/title cannot point the
        output outside of the target folder.
        '''
        return re.sub(r'[\\/]', '_', name)

    def _build_error(self, resp=None, status=None, msg=None):
        '''
        Build (not raise) the exception describing a failure.

        @param resp: response object; its url/status_code/reason are used.
        @param status: status code to use when there is no response.
        @param msg: message overriding the default one.
        @return: HTTPError when a positive status is known, otherwise a
            RuntimeError.
        '''
        if resp is not None:
            url = resp.url
            status = resp.status_code
            reason = resp.reason
        else:
            url = ''
            # No response to take a reason from; fall back to the bare status.
            reason = status
        if not status or status <= 0:
            return RuntimeError("Unknown error happend at ID {}".format(self.illust_id))
        if not msg:
            msg = 'Error "{}" happend at ID {}'.format(reason, self.illust_id)
        return HTTPError(url, status, msg, None, None)

    def _fetch_page(self, url):
        '''
        Request `url` and return it parsed with PyQuery.
        Raises the error built by `_build_error` on any non-200 answer.
        '''
        resp = requests.get(url, headers=self._headers)
        if resp.status_code == 200:
            return PyQuery(resp.content)
        if resp.status_code == 404:
            raise self._build_error(resp, msg='ID {} does not exists'.format(self.illust_id))
        raise self._build_error(resp)

    def _get_content(self):
        '''Return the parsed medium page, fetching it only once.'''
        if self._content is None:
            url = self.MEDIUM_PAGE_URL.format(self.illust_id)
            self._content = self._fetch_page(url)
        return self._content

    def _split_small_url(self, small_url):
        '''
        Turn a thumbnail url into the original picture url.

        @return: tuple (url with a '{}' placeholder instead of the suffix,
            suffix of the thumbnail).
        Raises a 404 HTTPError when the url is missing or not recognised.
        '''
        m = self.SMALL_PIC_PATTERN.match(small_url) if small_url else None
        if m is None:
            raise self._build_error(status=404, msg='Cannot find picture of ID {}'.format(self.illust_id))
        domain, time, pic_id, index, suffix = m.groups()
        # The suffix stays a placeholder: the original may use another one.
        url_part = self.FULL_PIC_URL.format(
            domain=domain, time=time, id=pic_id, index=index, suffix='{}')
        return url_part, suffix

    def _get_single(self):
        '''
        Get picture url from page which has only one picture.
        '''
        q = self._get_content()
        # the small img url is something like:
        # http://{domain}/c/{size}/img-master/img/{date-with-slash}/{id}_p{num-of-img}_master{xxx}.{suffix}
        small = q('.img-container img').attr('src')
        url_part, small_suffix = self._split_small_url(small)
        suffix = self._get_img_suffix(url_part, default_suffix=small_suffix)
        return [url_part.format(suffix)]

    def _get_multiple_content(self):
        '''
        Get the parsed manga page of a work which has more than one picture.
        '''
        return self._fetch_page(self.MANGA_PAGE_URL.format(self.illust_id))

    def _get_multiple(self):
        '''
        Get picture urls from page which has more than one picture.
        The suffix is probed on the first picture only and reused for all.
        '''
        q = self._get_multiple_content()
        urls = [img.attr('data-src') for img in q('.item-container img').items()]
        if not urls:
            raise self._build_error(status=404, msg='ID {} do not have picture.'.format(self.illust_id))
        parts = [self._split_small_url(url) for url in urls]
        url_parts = [url_part for url_part, _ in parts]
        # First non-empty thumbnail suffix is the preferred guess.
        default_suffix = next((s for _, s in parts if s), None)
        suffix = self._get_img_suffix(url_parts[0], default_suffix=default_suffix)
        return [url_part.format(suffix) for url_part in url_parts]

    def _check_img_suffix(self, url_part, suffix):
        '''Return `suffix` if a HEAD request on the url answers 200, else None.'''
        url = url_part.format(suffix)
        resp = requests.head(url, headers=self._headers)
        if resp.status_code == 200:
            return suffix
        return None

    def _get_img_suffix(self, url_part, default_suffix=None):
        '''
        Guess the image suffix.
        Original picture's suffix may vary from small one.
        `default_suffix` is tried first, then the remaining known suffixes.
        Raises a 404 HTTPError when none of them exists.
        '''
        candidates = []
        if default_suffix:
            candidates.append(default_suffix)
        candidates.extend(s for s in self._SUFFIXES if s != default_suffix)
        for candidate in candidates:
            if self._check_img_suffix(url_part, candidate):
                return candidate
        raise self._build_error(status=404, msg='Cannot find picture of ID {}'.format(self.illust_id))
if __name__ == '__main__':
    # Command-line entry: the first argument is the illust id to look up.
    import sys
    cli_args = sys.argv[1:]
    if not cli_args:
        raise ValueError('Need an illust id.')
    illust_id = int(cli_args[0])
    print(Picture(illust_id).info())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment