Last active
March 31, 2019 05:27
-
-
Save jayzhan211/0161a906ca55f6d0dede96cc80f4b755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################
# 1. fill in self.pixiv_id = '' and self.password = '' to login
# 2. fill in search_keyword = '' to find what you want
#
# Enter a keyword and crawl the matching images
####################################################################
import os
import re
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
class Pixiv_Crawler():
    """Crawl and download illustration images from pixiv.net search results.

    Usage: fill in ``self.pixiv_id`` and ``self.password``, construct with a
    search keyword, then call :meth:`run`.  Images are saved beneath
    ``self.load_path/<page_number>/``.
    """

    def __init__(self, nums_of_pages=10, search_keyword=None):
        """Set up the HTTP session, URLs and download configuration.

        :param nums_of_pages: how many search-result pages to crawl.
        :param search_keyword: query string to search for (required).
        :raises ValueError: if ``search_keyword`` is not given.
        """
        if search_keyword is None:
            # Fail fast with a clear message instead of the original
            # AttributeError on ``None.encode``.
            raise ValueError('search_keyword is required')
        self.session = requests.Session()
        self.login_url = 'https://accounts.pixiv.net/login'
        self.post_url = 'https://accounts.pixiv.net/api/login?lang=zh_tw'
        self.main_url = 'http://www.pixiv.net'
        # Percent-encode the keyword; quote() performs exactly the
        # UTF-8 bytes -> %XX conversion the original hand-rolled via
        # str(bytes)[2:-1].replace('\\x', '%').
        self.target_url = ('https://www.pixiv.net/search.php?word='
                           + quote(search_keyword, safe='')
                           + '&order=date_d&p=')
        # BUG FIX: original had a duplicated ``self.headers = self.headers =``.
        self.headers = {
            'Referer': 'https://accounts.pixiv.net/login?lang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }
        self.pixiv_id = ''   # pixiv account id -- fill in before run()
        self.password = ''   # pixiv password  -- fill in before run()
        self.post_key = ''   # hidden CSRF token scraped from the login form
        self.return_to = 'https://www.pixiv.net/'
        self.load_path = r'C:\Users\owner\Pixiv_Img'
        self.nums_of_pages = nums_of_pages

    def login(self):
        """Log in to pixiv using ``self.pixiv_id`` / ``self.password``."""
        post_key_html = self.session.get(self.login_url, headers=self.headers).text
        post_key_soup = BeautifulSoup(post_key_html, 'lxml')
        # The login form's first <input> carries the hidden post_key token.
        self.post_key = post_key_soup.find('input')['value']
        data = {
            'pixiv_id': self.pixiv_id,
            'password': self.password,
            'return_to': self.return_to,
            'post_key': self.post_key,
        }
        res = self.session.post(self.post_url, data=data, headers=self.headers)
        print('Login Message: ' + str(res.json()))

    def get_html(self, url):
        """Return the session's response for *url* (shares login cookies)."""
        return self.session.get(url, headers=self.headers)

    def get_img(self, html, page_num):
        """Parse one search-result page and download every illustration found.

        :param html: HTML text of a search-result page.
        :param page_num: 1-based page number, used as the save subdirectory.
        """
        li_soup = BeautifulSoup(html, 'lxml')
        # BUG FIX: attrs must be a dict; the original passed the *set*
        # {'class', 'image-item'}, so the class filter did not work.
        li_list = li_soup.find_all('li', attrs={'class': 'image-item'})
        for li in li_list:
            href = li.find('a')['href']
            url = self.main_url + href
            detail_html = self.session.get(url, headers=self.headers).text
            src_match = re.search('"regular":"(.+?)",', detail_html)
            title_match = re.search('title>「(.+?)」', detail_html)
            if src_match is None or title_match is None:
                # Layout changed or the work is restricted -- skip instead
                # of crashing on ``None.group`` as the original did.
                print('Fail to locate image on ' + url)
                continue
            img_src = src_match.group(1).replace('\\', '')
            self.download_img(img_src, title_match.group(1), url, page_num)

    def download_img(self, src, title, href, page_num):
        """Download one image and save it as ``<title>.jpg``, de-duplicating names.

        :param src: direct image URL.
        :param title: artwork title, used as the base filename.
        :param href: artwork page URL, sent as Referer (required by pixiv's CDN).
        :param page_num: page number, selects the target subdirectory.
        """
        # Copy so we don't mutate the shared self.headers dict (the
        # original aliased it and leaked the Referer into every request).
        src_headers = dict(self.headers)
        print('URL: ' + href)
        print('[{}]: {}'.format(title, src))
        src_headers['Referer'] = href
        try:
            img = requests.get(src, headers=src_headers).content
        except requests.RequestException:
            # BUG FIX: the original used a bare except and then fell
            # through to use ``img`` unbound, raising NameError.
            print('Fail to get image')
            return
        target_dir = os.path.join(self.load_path, str(page_num))
        # Append a numeric suffix until the filename is unused.
        if os.path.exists(os.path.join(target_dir, title + '.jpg')):
            page_id = 1
            while os.path.exists(os.path.join(target_dir, title + str(page_id) + '.jpg')):
                page_id += 1
            title = title + str(page_id)
        # BUG FIX: write to the page directory explicitly instead of
        # relying on the cwd set by mkdir(); 'wb' is correct since the
        # de-dup loop above guarantees a fresh file.
        with open(os.path.join(target_dir, title + '.jpg'), 'wb') as f:
            f.write(img)
        print('Save Successfully')

    def mkdir(self, path):
        """Create ``self.load_path/<path>`` if needed and chdir into it."""
        path = path.strip()
        full_path = os.path.join(self.load_path, path)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        os.chdir(full_path)

    def run(self):
        """Log in, then crawl and save images from each result page."""
        self.login()
        for page_num in range(1, self.nums_of_pages + 1):
            self.mkdir(str(page_num))
            html = self.session.get(self.target_url + str(page_num), headers=self.headers)
            self.get_img(html.text, page_num)
            print('Page {} finished'.format(page_num))
            time.sleep(2)  # be polite to the server between pages
# Query string to search for (Japanese: "girl").
search_keyword = '女の子'

if __name__ == '__main__':
    # Guarded so importing this module does not trigger a login and crawl.
    pixiv = Pixiv_Crawler(nums_of_pages=1, search_keyword=search_keyword)
    pixiv.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist is DEPRECATED