@PandyYang
Created February 21, 2018 02:56
# -*- coding: utf-8 -*-
# Scrape mmjpg.com: collect every gallery linked from the front page and
# download its images into a folder named after the gallery title.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs
url = "http://www.mmjpg.com/"

# Headers for the HTML pages served from www.mmjpg.com.
headers = {
    "Connection": "keep-alive",
    "Host": "www.mmjpg.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/58.0"
}

# Headers for the images served from img.mmjpg.com; the Referer is needed
# to get past the site's hotlink protection.
headers1 = {
    "Connection": "keep-alive",
    "Host": "img.mmjpg.com",
    "Referer": "http://www.mmjpg.com/mm/1013/5",
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
}
# Yield the title and URL of every gallery linked from the front page.
def get_page_url(url):
    req = requests.get(url, headers=headers)
    req.encoding = "utf-8"
    soup = bs(req.text, "html.parser")
    page_url = soup.select('.pic ul li[class="like"] a')
    for x in page_url:
        yield {
            "title": x["title"],
            "page_url": x["href"]
        }
# Download a single image to <title>/<n>.jpg and return the next index.
def downloader(url, title, n):
    req = requests.get(url, headers=headers1)
    print(req.status_code)
    # Create the download directory; skip creation if it already exists.
    if not os.path.exists(title):
        os.mkdir(title)
    # The target site's server is flaky and many images fail to load on the
    # first request, so retry each image up to 6 times before giving up.
    t = 0
    while t < 6:
        if req.status_code == 200:
            with open("%s/%s.jpg" % (title, n), "wb") as f:
                print("%s.jpg download started" % n)
                f.write(req.content)
                print("%s.jpg download finished \n" % n)
            break
        else:
            req = requests.get(url, headers=headers1)
            t = t + 1
            continue
    return n + 1
# Fetch one image page: download its image, then follow the "next" link
# (the link whose text is "下一张", i.e. "next image") recursively.
def get_image_url(url, title, m):
    n = m
    rep = requests.get(url, headers=headers)
    print(url)
    rep.encoding = "utf-8"
    soup = bs(rep.text, "html.parser")
    print(title)
    image_url = soup.select(".article .content a img")[0]["src"]
    n = downloader(image_url, title, n)
    next_link = soup.select('.article .page [class="ch next"]')
    if next_link and next_link[0].get_text() == "下一张":
        # Build an absolute URL for the next image page from its href,
        # which may be relative.
        next_page = urljoin(url, next_link[0]["href"])
        print(next_page)
        get_image_url(next_page, title, n)  # recurse until the last image
    else:
        return
def main():
    for x in get_page_url(url):
        get_image_url(x["page_url"], x["title"], 1)

if __name__ == '__main__':
    main()