@623637646
Created July 4, 2020 08:00
xiuren image spider
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-03-12 15:56:36
# Project: Girls
from pyspider.libs.base_handler import *
import os

class Handler(BaseHandler):
    crawl_config = {
        'root_urls': [
            'http://www.xiuren.org/category/TuiGirl.html',
            # 'http://www.xiuren.org/category/XiuRen.html'
        ],
        'download_path': os.path.expanduser('~/Downloads/') + 'Girls/',
        # CSS selectors for the links and titles this spider extracts
        'index_page_detail_page_link': '#main > div > div > a',
        'detail_image_link': '#post > div.post > p > span > a',
        'detail_page_title': '#title > h1:nth-child(1)',
        'index_page_next_link': '#page > a.next'
    }
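    # Note: crawl_config is normally where pyspider keeps default crawl
    # parameters (headers, proxy, itag, ...). This script also piggybacks its
    # own settings (root_urls, download_path, selectors) on it; pyspider
    # appears to ignore keys it does not recognize, so they simply ride along
    # as class-level configuration.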

    def on_start(self):
        # Seed the crawl with every category root URL.
        for each in self.crawl_config['root_urls']:
            self.crawl(each, callback=self.index_page)

    def index_page(self, response):
        # e.g. 'http://www.xiuren.org/category/TuiGirl.html' -> 'TuiGirl'
        category = response.url.split(".")[-2].split("/")[-1]
        for each in response.doc(self.crawl_config['index_page_detail_page_link']).items():
            self.crawl(each.attr.href, callback=self.detail_page, save={'category': category})
        # Follow pagination until the "next" link disappears.
        next_link = response.doc(self.crawl_config['index_page_next_link']).attr.href
        if next_link:
            self.crawl(next_link, callback=self.index_page)
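    # By default pyspider deduplicates tasks by URL (the taskid is the md5 of
    # the URL), so a page queued more than once is only fetched again after
    # the project's re-crawl age expires; the pagination loop above cannot
    # spin forever on the same "next" link.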

    def detail_page(self, response):
        category = response.save['category']
        detail_page_title = response.doc(self.crawl_config['detail_page_title']).text()
        for each in response.doc(self.crawl_config['detail_image_link']).items():
            file_url = each.attr.href
            file_name = detail_page_title + file_url.split("/")[-1]
            directory = self.crawl_config['download_path'] + category
            if os.path.exists(directory + '/' + file_name):
                print("file \"" + directory + '/' + file_name + "\" already exists, skipping download")
                continue
            self.crawl(file_url, callback=self.save_img,
                       save={'file_name': file_name, 'directory': directory})
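    # An image URL is crawled like any other page: pyspider hands the raw
    # bytes to the callback as response.content, which save_img writes to
    # disk under download_path/category/.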

    def save_img(self, response):
        directory = response.save["directory"]
        if not os.path.exists(directory):
            os.makedirs(directory)
        file_path = directory + '/' + response.save["file_name"]
        with open(file_path, "wb") as f:
            f.write(response.content)
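
A minimal sketch of how to run this, assuming pyspider is installed and its
webui is on the default port (your setup may differ):

    pip install pyspider
    pyspider all
    # then open http://localhost:5000, create a new project, paste this
    # script into the editor, and click "run"

Because pyspider keeps per-project crawl state and the script skips files
that already exist on disk, re-running the project resumes where it left off
rather than re-downloading everything.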