Created
March 25, 2018 14:34
-
-
Save joyc/699a08ab3e131cfebe45d0646637dc97 to your computer and use it in GitHub Desktop.
taobaogirl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import os
import re
import threading
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
from selenium import webdriver
# Location of the PhantomJS executable handed to Selenium. The default is the
# original author's machine-specific path; set the PHANTOMJS_PATH environment
# variable to point at a different install without editing this file.
browserPath = os.environ.get('PHANTOMJS_PATH') or (
    '/Users/charlielee/anaconda3/phantomjs-2.1.1-macosx/bin/phantomjs'
)
# Listing page that main() loads and scrapes.
homePage = 'https://mm.taobao.com/search_tstar_model.htm'
# Root directory for downloads; per-model folder names are appended directly,
# so the trailing slash is required.
outputDir = 'photo/'
# Parser name passed to BeautifulSoup.
parser = 'html5lib'
def main():
    """Scrape the listing page and download every model's cover and photos.

    Loads ``homePage`` in PhantomJS, extracts for each model a name/location
    line, a height/weight line, a profile-page URL and a cover-image URL, then
    for every model creates the folder ``outputDir + <name/location>``, saves
    the cover as ``cover.jpg`` and calls ``getImgs`` for the profile photos.

    A cover that cannot be downloaded is reported and skipped; the model's
    profile photos are still attempted.
    """
    driver = webdriver.PhantomJS(executable_path=browserPath)  # browser binary
    try:
        driver.get(homePage)  # open the listing page
        bsObj = BeautifulSoup(driver.page_source, parser)  # parse its HTML
        print("[*]OK GET Page")
        # Visible text of the listing container, one entry per text line.
        girlsList = driver.find_element_by_id('J_GirlsList').text.split('\n')
        # Protocol-relative cover image URLs found in the raw page source.
        # NOTE(review): '.*' is greedy, so two URLs on one source line would
        # be merged into a single match - confirm against the live markup.
        imagesUrl = re.findall(
            r'\/\/gtd\.alicdn\.com\/sns_logo.*\.jpg', driver.page_source)
    finally:
        # quit() (not close()) so the PhantomJS process is terminated even
        # when one of the scraping steps above raises.
        driver.quit()

    # Anchors whose href looks like a profile page ("//....htm?userId=<n>").
    girlsUrl = bsObj.find_all(
        "a", {"href": re.compile(r"\/\/.*\.htm\?(userId=)\d*")})
    # The container text is consumed in groups of three lines: the first of
    # each group is used as name/location, the second as height/weight.
    girlsNL = girlsList[::3]
    girlsHW = girlsList[1::3]
    # Profile page URLs (hrefs are protocol-relative).
    girlsHURL = [('http:' + i['href']) for i in girlsUrl]
    # Cover image URLs.
    girlsPhotoURL = [('https:' + i) for i in imagesUrl]

    # NOTE: zip() stops at the shortest list, so a mismatch in the number of
    # names, links or covers silently drops (or misaligns) trailing entries.
    girlsInfo = zip(girlsNL, girlsHW, girlsHURL, girlsPhotoURL)
    for girlNL, girlHW, girlHURL, girlCover in girlsInfo:
        print("[*]Girl :", girlNL, girlHW)
        girlDir = outputDir + girlNL
        # One folder per model.
        mkdir(girlDir)
        print(" [*]saving...")
        # Download the cover; a failure here must not abort the whole run.
        try:
            with urlopen(girlCover) as resp:
                data = resp.read()
        except Exception as err:
            print(" [!]Address Error!", girlCover, err)
        else:
            with open(girlDir + '/cover.jpg', 'wb') as f:
                f.write(data)
            print(" [+]Loading Cover... ")
        # Download the pictures shown on the model's profile page.
        getImgs(girlHURL, girlDir)
def mkdir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Prints one status line either way: a "created" message for a new
    directory, or an "already created" message when ``path`` already exists.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created.
    """
    if os.path.exists(path):
        # Already present: nothing to create, just report it.
        print(' [+]文件夹', path, '已创建')
        return
    # exist_ok=True tolerates the directory appearing between the check above
    # and this call; the message is printed only after creation succeeded.
    os.makedirs(path, exist_ok=True)
    print(" [*]新建了文件夹", path)
def getImgs(url, path):
    """Download the ``.jpg`` images found on a profile page into ``path``.

    Opens ``url`` in its own PhantomJS instance, collects every ``<img>``
    whose ``src`` contains ``.jpg``, skips the first one (the avatar, which
    duplicates the cover image) and saves the rest as ``path/1.jpg``,
    ``path/2.jpg``, ...  Each download is best-effort: a failure is reported
    and the remaining images are still attempted.

    Args:
        url: Profile page URL.
        path: Existing directory the images are written to.
    """
    driver = webdriver.PhantomJS(executable_path=browserPath)
    try:
        driver.get(url)
        print(" [*]Opening...")
        bsObj = BeautifulSoup(driver.page_source, parser)
    finally:
        # quit() ends the PhantomJS process; close() only closes the window,
        # which would leave one stray browser process per profile visited.
        driver.quit()

    # Image tags on the profile page that point at .jpg files.
    imgs = bsObj.find_all("img", {"src": re.compile(r".*\.jpg")})
    for i, img in enumerate(imgs[1:], start=1):  # skip the avatar
        src = img['src']
        # Protocol-relative sources get https as before; absolute or
        # page-relative sources are resolved against the profile URL instead
        # of being blindly prefixed with "https:".
        imgUrl = 'https:' + src if src.startswith('//') else urljoin(url, src)
        fileName = "{}/{}.jpg".format(path, i)
        try:
            with urlopen(imgUrl) as resp:
                data = resp.read()
            print(" [+]Loading...", fileName)
            with open(fileName, 'wb') as f:
                f.write(data)
        except Exception as err:
            # Deliberately broad: one bad image must not stop the others.
            print(" [!]Address Error!", imgUrl, err)
if __name__ == '__main__':
    # Make sure the download root exists before scraping; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(outputDir, exist_ok=True)
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment