Skip to content

Instantly share code, notes, and snippets.

@utgwkk
Created December 12, 2015 09:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save utgwkk/1d59fe99a513ef9b38dc to your computer and use it in GitHub Desktop.
Save utgwkk/1d59fe99a513ef9b38dc to your computer and use it in GitHub Desktop.
ニジエスクレイピング野郎
#!/usr/bin/env python
# coding=utf-8
import re
import requests
import lxml.html
class Nijie(object):
    """Logged-in HTTP client for scraping pages on nijie.info.

    Creating an instance performs the login request immediately; the
    cookies it yields live on ``self.session`` and are reused by
    ``download``.
    """

    # Picture links in the page are protocol-relative ("//pic0N..."), so a
    # scheme is prepended before they are requested.
    _scheme = 'http:'

    # Captures the href of every anchor pointing at a picture host.
    # Compiled once for the class rather than once per instance.
    _np = re.compile(r'<[Aa] href="(//pic0[0-9]\.nijie\.info/nijie_picture/[a-zA-Z0-9_/\.]+?)"\s?\S*?>+')

    def __init__(self, params):
        """Open a session and log in to nijie.info.

        Args:
            params: Mapping of form fields posted to ``login_int.php``
                (presumably the site's login form fields such as the
                account e-mail and password -- confirm against the site).

        Raises:
            Exception: If the login request ends with a status code other
                than 200.
        """
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        # The top page is fetched before logging in; presumably this lets
        # the session pick up the site's initial cookies.
        self.session.get('http://nijie.info/')
        login = self.session.post('http://nijie.info/login_int.php', data=params)
        if login.status_code != 200:
            raise Exception('failed to login to nijie.info')

    def download(self, target_url, target_dir='./'):
        """Collect the title, description and picture links of a page.

        Despite the name, nothing is written to disk: every linked picture
        is requested once to confirm it is reachable and only its link is
        kept.

        Args:
            target_url: URL of the page to scrape.
            target_dir: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            A ``(title, description, urls)`` tuple. ``title`` and
            ``description`` are read from the page's ``twitter:title`` and
            ``twitter:description`` meta tags and are ``''`` when the tag
            is absent. ``urls`` holds the protocol-relative picture links
            whose request answered with status 200. When the page itself
            does not answer with 200 the result is ``('', '', [])``.
        """
        page = self.session.get(target_url)
        if page.status_code != 200:
            return '', '', []

        html = page.text
        dom = lxml.html.fromstring(html)
        title = self._meta_content(
            dom.xpath('/html/head/meta[@name="twitter:title"]'))
        description = self._meta_content(
            dom.xpath('//head/meta[@name="twitter:description"]'))

        urls = []
        for url in self._np.findall(html):
            # Only links that can actually be fetched are reported.
            picture = self.session.get(self._scheme + url)
            if picture.status_code == 200:
                urls.append(url)
        return title, description, urls

    @staticmethod
    def _meta_content(nodes):
        """Return the ``content`` attribute of the first node, or ``''``.

        ``xpath`` yields an empty list when nothing matches, so indexing
        the result directly would raise ``IndexError`` on pages that lack
        the meta tag.
        """
        if not nodes:
            return ''
        return nodes[0].get('content')
if __name__ == '__main__':
    # Form fields sent with the login request. The credentials are left
    # blank here and must be filled in before running the script.
    params = {
        'email': '',
        'password': '',
        'save': 'on',
        'ticket': '',
    }
    client = Nijie(params)
    # The ``id`` query parameter is left blank; supply the id of the page
    # to scrape.
    result = client.download('http://nijie.info/view.php?id=')
    print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment