Skip to content

Instantly share code, notes, and snippets.

@wangxiaodong
Created November 16, 2017 18:08
Show Gist options
  • Save wangxiaodong/d38fb9fb09b2d2ceb3966af5dd73a511 to your computer and use it in GitHub Desktop.
抓百度图片的
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
import time
import os
import hashlib
# Usage: change page_num to pick which page downloading starts from.
# main() below crawls the pages one after another starting from that page and downloads the images.
# Change page_count yourself to control how many pages are downloaded.
# A folder named "d" is created in the current directory to hold the images; it is harmless.
# Files that have already been downloaded are detected automatically and skipped — no need to manage that.
# Current page index, 0-based.
page_num = 0
# How many pages to download.
page_count = 1
# Number of images per page; Baidu's default is 30.
rn = 30
# Search keyword ("airplane").
keyword = "飞机"
# Request headers copied from a real browser session. NOTE(review): the Cookie
# is a captured Baidu session — presumably it needs replacing if requests start
# failing; verify against a fresh browser session.
headers = {'Accept': 'text/plain, */*; q=0.01',
"Accept-Encoding":"gzip, deflate, br",
"Accept-Language":"en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4",
"Connection":"keep-alive",
"Cookie": "BDIMGISLOGIN=0; winWH=%5E6_1280x703; BDqhfp=%E9%A3%9E%E6%9C%BA%26%26NaN-1undefined%26%260%26%261; __cfduid=dd82f58321cf57b3e90b6ecd8088431b41508419891; BDUSS=RPbGdYN2ZMUnZadjM3NlVXSkRqbVhZM28ybWgwOWxzU2F4YWFtdk5VOXlQdnRaSVFBQUFBJCQAAAAAAAAAAAEAAACkjbQ6utrU88GvbzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHKx01lysdNZcH; BAIDUID=4B4C79AED13EE3DC1C819ED095B7C1B8:FG=1; PSTM=1508937405; BIDUPSID=DE6DCA159E2BDECDF1472B04F122B738; pgv_pvi=686790656; pgv_si=s7669567488; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=3; H_PS_PSSID=1424_21092_22158; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; indexPageSugList=%5B%22%E9%A3%9E%E6%9C%BA%22%2C%22work%2B%20%E5%8A%9E%E5%85%AC%E5%AE%A4%20%E6%88%90%E9%83%BD%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%20%E5%AE%A4%E5%86%85%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%2010%20%E5%8A%9E%E5%85%AC%E5%AE%A4%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%2010%22%2C%22%E5%A4%A9%E5%BA%9C%E6%96%B0%E8%B0%B7%22%2C%22%E7%A6%8F%22%5D; cleanHistoryStatus=0; userFrom=www.baidu.com",
"DNT":"1",
"Host":"image.baidu.com",
"Referer":"https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1510847661183_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%A3%9E%E6%9C%BA",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
"X-Requested-With":"XMLHttpRequest"
}
# Baidu image-search AJAX endpoint (returns JSON).
url = "https://image.baidu.com/search/acjson"
# Query parameters for the acjson endpoint; 'pn' (result offset) is overwritten
# per page by main() before each request.
payload = {"tn":"resultjson_com",
"ipn":"rj",
"ct":201326592,
"is": "",
"fp":"result",
"queryWord":keyword,
"cl":2,
"lm":-1,
"ie":"utf-8",
"oe":"utf-8",
"adpicid": "",
"st":-1,
"z": "",
"ic":0,
"word":keyword,
"s": "",
"se": "",
"tab": "",
"width": "",
"height": "",
"face":0,
"istype":2,
"qc":"",
"nc":1,
"fr":"",
"pn":0,
"rn":rn,
"gsm":"1e",
# Cache-busting key: the current timestamp in ms becomes a parameter name with
# an empty value, mimicking what the browser sends.
str(int(time.time()* 1000)): ""}
def download_img(url, title, type_name):
    """Download one image into ./d/, skipping files that already exist.

    The file name is md5(url) + "_" + sanitized title + "." + type_name,
    so re-running the crawler never downloads the same URL twice.

    url       -- direct image URL (the thumbURL from Baidu's JSON)
    title     -- page title, used as a human-readable part of the file name
    type_name -- file extension without the dot, e.g. "jpg"
    """
    dir_name = "d"
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    # md5 requires bytes; encode explicitly so this works on Python 3 too
    # (the original passed the str directly, which is Python-2-only).
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # Strip path separators from the title so an odd page title cannot
    # escape the target directory or produce an invalid file name.
    safe_title = title.replace('/', '_').replace(os.sep, '_')
    fn = digest + "_" + safe_title + "." + type_name
    fpath = os.path.join(dir_name, fn)
    if os.path.isfile(fpath):
        print('already exists: ' + fpath)
        return
    # stream=True keeps large images out of memory; write in 100 KiB chunks.
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        print('download failed (%d): %s' % (r.status_code, url))
        return
    with open(fpath, 'wb') as fd:
        for chunk in r.iter_content(100 * 1024):
            fd.write(chunk)
def crawl_baidu():
r = requests.get(url, params=payload, headers=headers)
if 300 > r.status_code >= 200:
r.encoding = 'utf8'
obj = json.loads(r.text)
for i in obj["data"]:
print "原始图片名字:", i.get("fromPageTitle", "无")
print "图片名字:", i.get("fromPageTitleEnc", "无")
print "缩略图地址:", i.get("thumbURL", "无")
print "图片地址:", i.get("middleURL", "无")
print "-" * 45
if i.get("thumbURL", ""):
print "start download img"
t = time.time()
download_img(i.get("thumbURL"), i.get("fromPageTitleEnc", "无"), i.get("type", "jpg"))
print 'down load: ', i.get("thumbURL"), " spent:", time.time() - t, "s"
print "-" * 45
def main():
for i in range(page_count):
payload['pn'] = (page_num + i) * rn,
print '下载第 %d 页' % (i + 1)
crawl_baidu()
print '下载完成'
print '*' * 80
time.sleep(1)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment