@and2long
Created November 25, 2018 11:34
Download all images from the galleries listed on the home page of mzitu.com.
# -*- coding: utf-8 -*-
import os
import re
import time

import scrapy


class DetailSpider(scrapy.Spider):
    name = 'detail'
    allowed_domains = ['mzitu.com', 'i.meizitu.net']
    start_urls = ['http://www.mzitu.com']

    def parse(self, response):
        # Gallery list on the first page of the site.
        lis = response.xpath('//*[@id="pins"]/li')
        for li in lis:
            url = li.xpath('./a/@href').extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)

    def parse_detail(self, response):
        print(response.url)
        if response.url.endswith('.jpg'):
            # Image response: save it to the local ./img folder.
            folder_path = './img'
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            file_name = '{}_{}.jpg'.format(response.meta['prefix'], int(time.time() * 1000))
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'wb') as f:
                f.write(response.body)
        else:
            # Gallery detail page: request the image itself, then follow pagination.
            image_url = response.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img/@src').extract_first()
            image_prefix = re.match(r'^.*/(\d{4,}).*', response.url).group(1)
            yield scrapy.Request(url=image_url, callback=self.parse_detail, meta={'prefix': image_prefix})
            # Keep following the "next page" link while the last pagination button reads "下一页»".
            pagenavi = response.xpath('//div[@class="pagenavi"]/a[last()]/span/text()').extract_first()
            if pagenavi == "下一页»":
                next_url = response.xpath('//span[text()="下一页»"]/../@href').extract_first()
                print(next_url)
                yield scrapy.Request(url=next_url, callback=self.parse_detail)
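
For reference, a spider like this can be run without a full Scrapy project (for example with "scrapy runspider detail_spider.py"), or driven from a small standalone script. The sketch below assumes the class above is defined as DetailSpider in the same file; the user agent and download delay values are illustrative assumptions, not part of the original gist.

# Minimal sketch: run the spider as a plain Python script (assumptions noted above).
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/5.0',  # assumed value; many image hosts reject the default Scrapy UA
        'DOWNLOAD_DELAY': 1,          # assumed value; throttles requests to be polite
    })
    process.crawl(DetailSpider)
    process.start()                   # blocks until the crawl finishes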