Created
February 21, 2018 02:56
-
-
Save PandyYang/23692dfd996b6def47b3f43d4ab67f8c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Scraper dependencies: requests for HTTP, BeautifulSoup for HTML parsing.
# NOTE(review): the original declared the encoding as "utf--8" (typo) and
# called importlib.reload(sys) — a Python 2 sys.setdefaultencoding idiom
# that is a no-op on Python 3 — so both were fixed/removed.
import sys
import importlib
import requests
from bs4 import BeautifulSoup as bs
import os
# Entry point of the gallery site being scraped.
url = "http://www.mmjpg.com/"

# Headers used when fetching HTML pages (index and picture pages).
headers = {
    "Connection": "keep-alive",
    "Host": "www.mmjpg.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/58.0",
}

# Headers used when fetching image files from the CDN host; the Referer
# appears to be required by the image server — TODO confirm.
headers1 = {
    "Connection": "keep-alive",
    "Host": "img.mmjpg.com",
    "Referer": "http://www.mmjpg.com/mm/1013/5",
    "User-Agent": "Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0",
}
def get_page_url(url):
    """Yield one {"title", "page_url"} dict per gallery on the index page."""
    response = requests.get(url, headers=headers)
    response.encoding = 'utf8'
    document = bs(response.text, "html.parser")
    # Each <li class="like"> in the .pic list wraps a single gallery anchor;
    # its title attribute names the gallery and its href opens the first page.
    for anchor in document.select('.pic ul li[class = "like"] a '):
        yield {"title": anchor["title"], "page_url": anchor["href"]}
def downloader(url, title, n):
    """Save image *url* as "<title>/<n>.jpg" and return the next index n+1.

    The image host is flaky — a picture often fails on the first hit and
    needs several refreshes — so the request is retried up to 6 times.

    BUG FIXES vs. the original:
    * ``requests.get`` was called with an invalid ``endcoding`` keyword,
      which raised TypeError on every call; the keyword is removed (the
      body is binary, no text encoding applies).
    * each retry fetched into a variable ``rep`` that was never read, so
      the stale failed response was re-checked forever; the retry now
      rebinds ``req``.
    """
    req = requests.get(url, headers=headers1)
    print(req.status_code)
    # Create the download directory unless it already exists.
    if not os.path.exists("%s" % title):
        os.mkdir("%s" % title)
    t = 0
    while t < 6:
        if req.status_code == 200:
            with open("%s/%s.jpg" % (title, n), "wb") as f:
                print("%s.jpg开始下载" % n)
                f.write(req.content)
                print("%s.jpg下载完成 \n" % n)
            break
        # Refresh and try again — the server often needs several attempts.
        req = requests.get(url, headers=headers1)
        t = t + 1
    return n + 1
# Walk a gallery's picture pages and download every image.
def get_image_url(url, title, m):
    """Starting from picture page *url*, download each image of gallery *title*.

    *m* is the first file index to use; images are numbered sequentially.

    BUG FIXES vs. the original:
    * the next-page href was prefixed with an undefined name ``url_a``
      (NameError as soon as a next page existed); the absolute URL is now
      built with ``urljoin`` against the current page.
    * the image on the final page was never downloaded (the function
      returned before calling ``downloader``); every page is now saved.
    * unbounded recursion replaced with a loop, so long galleries cannot
      hit the recursion limit.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative hrefs

    n = m
    page = url
    while True:
        rep = requests.get(page, headers=headers)
        print(page)
        rep.encoding = "utf_8"
        soup = bs(rep.text, "html.parser")
        print(title)
        # First <img> inside the article content is the full-size picture.
        image_url = soup.select(".article .content a img")[0]["src"]
        n = downloader(image_url, title, n)
        # The pager's "next" link reads "下一张" while more pages remain.
        nxt = soup.select('.article .page [class="ch next"]')[0]
        if nxt.get_text() != "下一张":
            return
        page = urljoin(page, nxt["href"])
        print(page)
def main():
    """Download every gallery listed on the site's index page."""
    for gallery in get_page_url(url):
        get_image_url(gallery["page_url"], gallery["title"], 1)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment