Skip to content

Instantly share code, notes, and snippets.

@jayzhan211
Last active March 31, 2019 05:27
Show Gist options
  • Save jayzhan211/0161a906ca55f6d0dede96cc80f4b755 to your computer and use it in GitHub Desktop.
Save jayzhan211/0161a906ca55f6d0dede96cc80f4b755 to your computer and use it in GitHub Desktop.
####################################################################
# 1. fill in self.pixiv_id = '' and self.password = '' to login
# 2. fill in search_keyword = '' to find what you want
#
# Crawl images matching the given search keyword.
####################################################################
import os
import re
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup
class Pixiv_Crawler():
    """Crawl illustrations from pixiv.net that match a search keyword.

    Usage: fill in ``self.pixiv_id`` and ``self.password``, then call
    :meth:`run`.  Results are saved under ``self.load_path``, one
    sub-directory per search-result page.

    NOTE(review): pixiv's login flow has changed repeatedly since 2019;
    the scraped ``post_key`` login below may no longer work — confirm
    against the live site before relying on it.
    """

    @staticmethod
    def _encode_keyword(keyword):
        """Return *keyword* percent-encoded (UTF-8) for use in the search URL.

        Replaces the original's fragile ``str(bytes)`` slicing trick, which
        left characters such as spaces unencoded.
        """
        return urllib.parse.quote(keyword)

    def __init__(self, nums_of_pages=10, search_keyword=None):
        """Prepare a session and the search URL.

        :param nums_of_pages: how many search-result pages to crawl.
        :param search_keyword: the pixiv search term (required).
        :raises ValueError: if no search keyword is given.
        """
        if search_keyword is None:
            raise ValueError('search_keyword is required')
        self.session = requests.Session()
        self.login_url = 'https://accounts.pixiv.net/login'
        self.post_url = 'https://accounts.pixiv.net/api/login?lang=zh_tw'
        self.main_url = 'http://www.pixiv.net'
        self.target_url = ('https://www.pixiv.net/search.php?word='
                           + self._encode_keyword(search_keyword)
                           + '&order=date_d&p=')
        # BUG FIX: the original had a duplicated `self.headers = self.headers = {...}`.
        self.headers = {
            'Referer': 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }
        self.pixiv_id = ''      # fill in: pixiv account id
        self.password = ''      # fill in: pixiv account password
        self.post_key = ''      # scraped CSRF token, set by login()
        self.return_to = 'https://www.pixiv.net/'
        self.load_path = r'C:\Users\owner\Pixiv_Img'
        self.nums_of_pages = nums_of_pages

    def login(self):
        """Scrape the hidden ``post_key`` CSRF token, then POST the credentials."""
        login_html = self.session.get(self.login_url, headers=self.headers).text
        soup = BeautifulSoup(login_html, 'lxml')
        # ROBUSTNESS: look up the field by name instead of blindly taking the
        # first <input> on the page; fall back to the original behaviour.
        key_input = soup.find('input', attrs={'name': 'post_key'}) or soup.find('input')
        self.post_key = key_input['value']
        data = {
            'pixiv_id': self.pixiv_id,
            'password': self.password,
            'return_to': self.return_to,
            'post_key': self.post_key
        }
        res = self.session.post(self.post_url, data=data, headers=self.headers)
        print('Login Message: ' + str(res.json()))

    def get_html(self, url):
        """GET *url* with the crawler's session and headers; return the response."""
        return self.session.get(url, headers=self.headers)

    def get_img(self, html, page_num):
        """Parse one search-result page and download every illustration on it.

        :param html: HTML text of a search-result page.
        :param page_num: 1-based page index, used as the save sub-directory.
        """
        soup = BeautifulSoup(html, 'lxml')
        # BUG FIX: attrs must be a dict mapping attribute -> value; the
        # original passed a set, which BeautifulSoup cannot filter on.
        for item in soup.find_all('li', attrs={'class': 'image-item'}):
            detail_url = self.main_url + item.find('a')['href']
            detail_html = self.session.get(detail_url, headers=self.headers).text
            src_match = re.search('"regular":"(.+?)",', detail_html)
            title_match = re.search('title>「(.+?)」', detail_html)
            # ROBUSTNESS: the original called .group(1) unconditionally and
            # crashed with AttributeError when the page layout differed.
            if src_match is None or title_match is None:
                print('Skip (no image data found): ' + detail_url)
                continue
            img_src = src_match.group(1).replace('\\', '')
            self.download_img(img_src, title_match.group(1), detail_url, page_num)

    def download_img(self, src, title, href, page_num):
        """Fetch the image at *src* and save it as ``<title>.jpg`` in the cwd.

        Assumes :meth:`mkdir` already chdir'd into the page directory.
        :param src: direct image URL.
        :param title: illustration title, used as the file name.
        :param href: illustration detail-page URL, sent as the Referer
                     (pixiv's image host rejects requests without it).
        :param page_num: page index, used for the duplicate-name check.
        """
        # BUG FIX: copy the headers — the original aliased self.headers and
        # leaked the per-image Referer into every later request.
        src_headers = dict(self.headers)
        src_headers['Referer'] = href
        print('URL: ' + href)
        print('[{}]: {}'.format(title, src))
        try:
            img = requests.get(src, headers=src_headers, timeout=30).content
        except requests.RequestException:
            # BUG FIX: the original fell through after the except, so `img`
            # was undefined and f.write(img) raised NameError.
            print('Fail to get image')
            return
        # Avoid overwriting an existing file by appending a numeric suffix.
        page_dir = os.path.join(self.load_path, str(page_num))
        if os.path.exists(os.path.join(page_dir, title + '.jpg')):
            suffix = 1
            while os.path.exists(os.path.join(page_dir, title + str(suffix) + '.jpg')):
                suffix += 1
            title = title + str(suffix)
        # BUG FIX: write mode ('wb'), not append ('ab') — re-runs previously
        # appended a second copy of the bytes to the same file.
        with open(title + '.jpg', 'wb') as f:
            f.write(img)
        print('Save Successfully')

    def mkdir(self, path):
        """Create ``load_path/<path>`` if needed and chdir into it."""
        path = path.strip()
        target = os.path.join(self.load_path, path)
        if not os.path.exists(target):
            os.makedirs(target)
        os.chdir(target)

    def run(self):
        """Log in, then crawl pages 1..nums_of_pages of the search results."""
        self.login()
        for page_num in range(1, self.nums_of_pages + 1):
            self.mkdir(str(page_num))
            html = self.session.get(self.target_url + str(page_num), headers=self.headers)
            self.get_img(html.text, page_num)
            print('Page {} finished'.format(page_num))
            time.sleep(2)  # be polite to the server between pages
# IDIOM FIX: guard the script entry point so importing this module does not
# immediately log in and start crawling.
if __name__ == '__main__':
    search_keyword = '女の子'
    pixiv = Pixiv_Crawler(nums_of_pages=1, search_keyword=search_keyword)
    pixiv.run()
@jayzhan211
Copy link
Author

This gist is DEPRECATED

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment