Last active
March 31, 2019 05:27
-
-
Save jayzhan211/0161a906ca55f6d0dede96cc80f4b755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################
# 1. fill in self.pixiv_id = '' and self.password = '' to login
# 2. fill in search_keyword = '' to find what you want
#
# Enter a keyword and crawl the matching images
####################################################################
import os
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
class Pixiv_Crawler():
    """Crawl and download illustration images from pixiv.net search results.

    Usage: fill in ``self.pixiv_id`` and ``self.password``, construct with a
    search keyword, then call :meth:`run`.  Images are saved beneath
    ``self.load_path/<page_number>/``.
    """

    def __init__(self, nums_of_pages=10, search_keyword=None):
        """Set up the HTTP session, URLs and download configuration.

        :param nums_of_pages: how many search-result pages to crawl.
        :param search_keyword: query string to search for (required).
        :raises ValueError: if ``search_keyword`` is not given.
        """
        if search_keyword is None:
            # Fail fast with a clear message instead of the original
            # AttributeError on ``None.encode``.
            raise ValueError('search_keyword is required')
        self.session = requests.Session()
        self.login_url = 'https://accounts.pixiv.net/login'
        self.post_url = 'https://accounts.pixiv.net/api/login?lang=zh_tw'
        self.main_url = 'http://www.pixiv.net'
        # Percent-encode the keyword; quote() performs exactly the
        # UTF-8 bytes -> %XX conversion the original hand-rolled via
        # str(bytes)[2:-1].replace('\\x', '%').
        self.target_url = ('https://www.pixiv.net/search.php?word='
                           + quote(search_keyword, safe='')
                           + '&order=date_d&p=')
        # BUG FIX: original had a duplicated ``self.headers = self.headers =``.
        self.headers = {
            'Referer': 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }
        self.pixiv_id = ''   # pixiv account id -- fill in before run()
        self.password = ''   # pixiv password  -- fill in before run()
        self.post_key = ''   # hidden CSRF token scraped from the login form
        self.return_to = 'https://www.pixiv.net/'
        self.load_path = r'C:\Users\owner\Pixiv_Img'
        self.nums_of_pages = nums_of_pages

    def login(self):
        """Log in to pixiv using ``self.pixiv_id`` / ``self.password``."""
        post_key_html = self.session.get(self.login_url, headers=self.headers).text
        post_key_soup = BeautifulSoup(post_key_html, 'lxml')
        # The login form's first <input> carries the hidden post_key token.
        self.post_key = post_key_soup.find('input')['value']
        data = {
            'pixiv_id': self.pixiv_id,
            'password': self.password,
            'return_to': self.return_to,
            'post_key': self.post_key,
        }
        res = self.session.post(self.post_url, data=data, headers=self.headers)
        print('Login Message: ' + str(res.json()))

    def get_html(self, url):
        """Return the session's response for *url* (shares login cookies)."""
        return self.session.get(url, headers=self.headers)

    def get_img(self, html, page_num):
        """Parse one search-result page and download every illustration found.

        :param html: HTML text of a search-result page.
        :param page_num: 1-based page number, used as the save subdirectory.
        """
        li_soup = BeautifulSoup(html, 'lxml')
        # BUG FIX: attrs must be a dict; the original passed the *set*
        # {'class', 'image-item'}, so the class filter did not work.
        li_list = li_soup.find_all('li', attrs={'class': 'image-item'})
        for li in li_list:
            href = li.find('a')['href']
            url = self.main_url + href
            detail_html = self.session.get(url, headers=self.headers).text
            src_match = re.search('"regular":"(.+?)",', detail_html)
            title_match = re.search('title>「(.+?)」', detail_html)
            if src_match is None or title_match is None:
                # Layout changed or the work is restricted -- skip instead
                # of crashing on ``None.group`` as the original did.
                print('Fail to locate image on ' + url)
                continue
            img_src = src_match.group(1).replace('\\', '')
            self.download_img(img_src, title_match.group(1), url, page_num)

    def download_img(self, src, title, href, page_num):
        """Download one image and save it as ``<title>.jpg``, de-duplicating names.

        :param src: direct image URL.
        :param title: artwork title, used as the base filename.
        :param href: artwork page URL, sent as Referer (required by pixiv's CDN).
        :param page_num: page number, selects the target subdirectory.
        """
        # Copy so we don't mutate the shared self.headers dict (the
        # original aliased it and leaked the Referer into every request).
        src_headers = dict(self.headers)
        print('URL: ' + href)
        print('[{}]: {}'.format(title, src))
        src_headers['Referer'] = href
        try:
            img = requests.get(src, headers=src_headers).content
        except requests.RequestException:
            # BUG FIX: the original used a bare except and then fell
            # through to use ``img`` unbound, raising NameError.
            print('Fail to get image')
            return
        target_dir = os.path.join(self.load_path, str(page_num))
        # Append a numeric suffix until the filename is unused.
        if os.path.exists(os.path.join(target_dir, title + '.jpg')):
            page_id = 1
            while os.path.exists(os.path.join(target_dir, title + str(page_id) + '.jpg')):
                page_id += 1
            title = title + str(page_id)
        # BUG FIX: write to the page directory explicitly instead of
        # relying on the cwd set by mkdir(); 'wb' is correct since the
        # de-dup loop above guarantees a fresh file.
        with open(os.path.join(target_dir, title + '.jpg'), 'wb') as f:
            f.write(img)
        print('Save Successfully')

    def mkdir(self, path):
        """Create ``self.load_path/<path>`` if needed and chdir into it."""
        path = path.strip()
        full_path = os.path.join(self.load_path, path)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        os.chdir(full_path)

    def run(self):
        """Log in, then crawl and save images from each result page."""
        self.login()
        for page_num in range(1, self.nums_of_pages + 1):
            self.mkdir(str(page_num))
            html = self.session.get(self.target_url + str(page_num), headers=self.headers)
            self.get_img(html.text, page_num)
            print('Page {} finished'.format(page_num))
            time.sleep(2)  # be polite to the server between pages
# Query string to search for (Japanese: "girl").
search_keyword = '女の子'

if __name__ == '__main__':
    # Guarded so importing this module does not trigger a login and crawl.
    pixiv = Pixiv_Crawler(nums_of_pages=1, search_keyword=search_keyword)
    pixiv.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist is DEPRECATED