Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Created December 17, 2017 01:48
Show Gist options
  • Save ikegami-yukino/f5296b2ac19cf431aa90193359b18865 to your computer and use it in GitHub Desktop.
Save ikegami-yukino/f5296b2ac19cf431aa90193359b18865 to your computer and use it in GitHub Desktop.
Pixiv小説のクロール
# -*- coding: utf-8 -*-
import re
from robobrowser import RoboBrowser
# Crawl pixiv novels for a single tag and save each novel's body text to
# "<title>.txt" in the current working directory.
PIXIV_BASE_URL = 'https://www.pixiv.net'
TAG = '巴マミ'  # tag whose novel listing is crawled
MAX_PAGE = 190  # upper bound on the number of listing pages requested

# Compiled once, outside the loops; the pattern is applied to every novel body
# to strip the characters it matches before the text is written out.
STRIP_PATTERN = re.compile(r'\s|\n| ')

# Log in: open the login page, fill in the first form matched by
# get_forms('form', class_='') and submit it. Replace the placeholder
# credentials before running.
browser = RoboBrowser(parser='lxml', history=True)
browser.open('https://accounts.pixiv.net/login')
form = browser.get_forms('form', class_='')[0]
form['pixiv_id'] = 'USERNAME'
form['password'] = 'PASSWORD'
browser.submit_form(form)

# NOTE(review): listing pages are requested starting at p=0; confirm whether
# the site numbers its pages from 1 (p=0 could duplicate the first page).
for i in range(MAX_PAGE):
    print(i)
    browser.open(PIXIV_BASE_URL + '/novel/tags.php?tag={}&order=date&p='.format(TAG) + str(i))
    novel_items = browser.find(class_='novel-items')
    if novel_items is None:
        # No novel list on this page: stop crawling.
        break

    # Loop over the novels listed on this page. `novel_items` was parsed before
    # the browser navigates away below, so iterating it stays valid.
    for novel in novel_items.find_all(class_='_novel-item'):
        link = novel.find('h1').find('a')
        novel_url = PIXIV_BASE_URL + link['href']
        browser.open(novel_url)

        # Novel information. '/' is replaced because the title is used as a
        # file name.
        title = link.text.replace('/', '_')
        body = browser.find('textarea', id='novel_text')
        if body is None:
            # The novel page has no text area to read; skip this novel
            # instead of aborting the whole crawl with an AttributeError.
            print('skipped (novel_text not found): ' + title)
            continue
        text = STRIP_PATTERN.sub('', body.text)
        print(title)

        # Explicit UTF-8 so the output does not depend on the platform's
        # locale encoding (which may be unable to encode the text).
        with open(title + '.txt', 'w', encoding='utf-8') as fd:
            fd.write(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment