Skip to content

Instantly share code, notes, and snippets.

@mpco
Created September 10, 2017 16:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mpco/0074c65bd634a44832c6785f11127752 to your computer and use it in GitHub Desktop.
Save mpco/0074c65bd634a44832c6785f11127752 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib import request
from bs4 import BeautifulSoup
import re
import time
import sys
import os
# Append this script's directory to PATH (per the original comment, so that a
# webdriver binary placed next to the script can be located).
dirPath = os.path.dirname(os.path.realpath(__file__))
# os.pathsep is ":" on POSIX and ";" on Windows; a hard-coded ":" would
# produce a broken PATH entry on Windows.
os.environ['PATH'] += os.pathsep + dirPath
# Open the page whose URL is given as the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback.
    sys.exit("用法: python3 " + os.path.basename(__file__) + " <知乎页面URL>")
url = sys.argv[1]
# The last path segment of the URL is used later to name the output folder
# and the link file; strip a trailing "/" first so it is never empty.
answerID = url.rstrip("/").split("/")[-1]
driver = webdriver.Chrome()
driver.get(url)
# Handle the case where a login is required: the page title then lacks the
# "- 知乎" marker and shows the sign-in landing title instead.
# Explicit checks replace the original assert statements, because asserts are
# stripped under `python -O`, which would silently skip the whole login flow.
if "- 知乎" not in driver.title:
    if "与世界分享你的知识、经验和见解" not in driver.title:
        raise RuntimeError("无法识别的页面标题: " + driver.title)
    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
    # which no longer exists in Selenium 4.
    signinButton = driver.find_element(By.XPATH, "//a[@href='#signin']")
    signinButton.click()
    # Block until the user has logged in manually in the browser window.
    input("请登录\n回车键以继续...")
    # After login the title must carry the "- 知乎" marker.
    if "- 知乎" not in driver.title:
        raise RuntimeError("登录后页面标题仍不正确: " + driver.title)
# Load every answer: keep scrolling to the bottom and clicking the
# "load more" button until it can no longer be found or clicked.
print("网页加载中...")
load_more_wait = WebDriverWait(driver, 30)
load_more_locator = (By.XPATH, "//button[@class='Button QuestionMainAction']")
keep_loading = True
while keep_loading:
    # Give the page time to load newly requested content.
    time.sleep(5)
    # Scroll to the bottom of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        load_more_wait.until(
            EC.presence_of_element_located(load_more_locator)).click()
    except Exception:
        # Any failure (wait timeout, click error, ...) ends the loading loop.
        keep_loading = False
input("回车 以开始处理网页....")
# Parse the HTML of the page as currently rendered by the browser.
html = driver.page_source
bsObj = BeautifulSoup(html, 'html.parser')
# Image links that are already loaded: <img> tags with a data-rawwidth
# attribute and an https data-original URL.
# NOTE(review): r'\d{0,4}' can match the empty string, so it appears to
# require only that data-rawwidth is present - confirm this is intended.
dataList_1 = bsObj.find_all(name='img', attrs={
    'data-rawwidth': re.compile(r'\d{0,4}'),
    'data-original': re.compile(r'https://')})
# Image links that are not loaded yet: placeholder <div> elements carrying
# the URL in data-src.
dataList_2 = bsObj.find_all(name='div', attrs={
    'class': "VagueImage origin_image zh-lightbox-thumb",
    'data-src': re.compile(r'https://')})
urlList_1 = [data.attrs['data-original'] for data in dataList_1]
urlList_1.extend(data.attrs['data-src'] for data in dataList_2)
# Links ending in _r are the original images (per the original comment).
# Only the "_b" size suffix directly before the file extension (optionally
# followed by a query string) is rewritten; a plain str.replace would also
# alter any other "_b" occurring elsewhere in the URL.
sizeSuffix = re.compile(r'_b(?=\.\w+(?:\?.*)?$)')
urlList = [sizeSuffix.sub("_r", urlItem) for urlItem in urlList_1]
# Output folder, created next to this script: <YYYYMMDD>-id<answerID>
dirName = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                       time.strftime("%Y%m%d") + "-id" + answerID)
# exist_ok=True so that running the script again for the same page on the
# same day does not abort with FileExistsError.
os.makedirs(dirName, exist_ok=True)
# Save the image links to a text file, one URL per line, echoing each one.
linkFilePath = os.path.join(dirName, "imgLink-id" + answerID + ".txt")
# The with-block closes the file even if a write fails; the explicit
# encoding keeps the output independent of the platform default.
with open(linkFilePath, 'w', encoding='utf-8') as f:
    for index, imgUrl in enumerate(urlList):
        f.write(imgUrl + "\n")
        print("-----" + str(index + 1) + ":" + imgUrl + "-------")
# Wait for confirmation before starting the downloads.
input("回车 以继续下载图片...")
# The browser is no longer needed. quit() is used instead of close():
# close() only closes the current window, while quit() ends the whole
# driver session.
driver.quit()
# Download the images into dirName as 1.<ext>, 2.<ext>, ...
for index, imgUrl in enumerate(urlList):
    # Take the extension from the URL without any query string, so the
    # query does not end up in the file name.
    _, file_extension = os.path.splitext(imgUrl.split("?", 1)[0])
    imgPath = os.path.join(dirName, str(index + 1) + file_extension)
    try:
        # Fetch before opening the target file so a failed download does not
        # leave an empty file behind; the with-block closes the response.
        with request.urlopen(imgUrl) as response:
            imgData = response.read()
    except OSError as e:
        # urllib's URLError/HTTPError are OSError subclasses. Report the
        # failure and continue with the remaining images.
        print("下载 第 " + str(index + 1) + " 张图片 失败: " + str(e))
        continue
    with open(imgPath, 'wb') as w:
        w.write(imgData)
    print("下载 第 " + str(index + 1) + " 张图片")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment