@vimerzhao
Last active May 26, 2021 23:30
A crawler for 妹子图 (http://www.mzitu.com) — simple and crude, but useful as a first introduction to web scraping.
#!/usr/bin/env python3
import os

import requests
from bs4 import BeautifulSoup

# Crawl target
url = 'http://www.mzitu.com/page/'
parser = 'html.parser'
cur_path = os.getcwd() + '/'

# HTTP request headers
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'}

def update_header(referer):
    # The image server checks the Referer, so set it before downloading
    header['Referer'] = referer

# Number of listing pages to crawl
preview_page_cnt = 2
for cur_page in range(1, preview_page_cnt + 1):
    cur_url = url + str(cur_page)
    resp = requests.get(cur_url, headers=header)
    # Parse the listing page
    soup = BeautifulSoup(resp.text, parser)
    # Each album has an image link and a text link; keeping one of the two is enough
    preview_link_list = soup.find(id='pins').find_all('a', target='_blank')[1::2]
    for link in preview_link_list:
        dir_name = link.get_text().strip().replace('?', '')
        link = link['href']
        soup = BeautifulSoup(requests.get(link, headers=header).text, parser)
        # Number of pictures in this album
        pic_cnt = soup.find('div', class_='pagenavi').find_all('a')[4].get_text()
        # Create a directory for the album
        pic_path = cur_path + dir_name
        if os.path.exists(pic_path):
            print('directory exists!')
        else:
            os.mkdir(pic_path)
        os.chdir(pic_path)  # enter the directory and start downloading
        print('Downloading ' + dir_name + '...')
        # Visit each picture page and extract the image URL
        for pic_index in range(1, int(pic_cnt) + 1):
            pic_link = link + '/' + str(pic_index)
            resp = requests.get(pic_link, headers=header)
            soup = BeautifulSoup(resp.text, parser)
            pic_src = soup.find('div', 'main-image').find('img')['src']
            pic_name = pic_src.split('/')[-1]
            update_header(pic_src)
            with open(pic_name, 'wb') as f:
                f.write(requests.get(pic_src, headers=header).content)
        os.chdir(cur_path)  # done with this album, go back to the working directory
        print('Download finished')
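The step that tends to trip up beginners is the Referer header: the image host rejects requests that look like hotlinking, which is why the gist calls update_header() right before fetching each image (it sets Referer to the image URL itself). Below is a minimal sketch of just that download step in isolation, assuming the site still behaves this way; download_image and the URLs in the usage comment are placeholders, not part of the original gist.

#!/usr/bin/env python3
# Minimal sketch: download a single image while sending a Referer header.
import requests

def download_image(pic_src, referer, out_name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        # Send a Referer so the image host does not treat the request as hotlinking
        'Referer': referer,
    }
    resp = requests.get(pic_src, headers=headers, timeout=10)
    resp.raise_for_status()
    with open(out_name, 'wb') as f:
        f.write(resp.content)

# Hypothetical usage; substitute a real image URL and referer:
# download_image('http://example.com/some-image.jpg',
#                'http://example.com/some-image.jpg', 'some-image.jpg')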
gkj17 commented Jun 7, 2020

Right! I just learned some Python syntax and have already fixed it. Thanks, 赵裕!

@l1343363028

How come mzitu.com won't open anymore? Is anyone else seeing this?
(Location: Xinyang, Henan)

gkj17 commented Aug 5, 2020 via email
