@jinyu121
Last active September 17, 2017 13:59
Download full-size Weibo images without logging in
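This gist contains two versions of the downloader: a standalone Python 2 script (urllib2) directly below, followed by a Python 3 rewrite that uses requests and is adapted from weiboPicDownloader. Both work without logging in by querying the public m.weibo.cn container API (containerid "107603" plus the 10-digit user ID) and collecting the "large" URL of every picture attached to the user's statuses.

As a minimal sketch of that round trip (Python 3 with requests, assuming a known 10-digit uid; the full scripts below add paging, retries, and nickname lookup):

import requests

UID = "1234567890"  # hypothetical 10-digit Weibo user ID
API = "https://m.weibo.cn/api/container/getIndex?count=25&page=1&containerid=107603" + UID

# One page of the container API; each card may carry an mblog with a list of pics
cards = requests.get(API, headers={"User-Agent": "Mozilla/5.0"}).json().get("cards", [])
for card in cards:
    for pic in card.get("mblog", {}).get("pics", []):
        if "large" in pic:
            print(pic["large"]["url"])
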
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 15 01:38:07 2017
@author: nondanee
"""
import os
import sys
import locale
import urllib2
import json
import re
import time
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"

def print_fit(string):
    # Encode unicode/str text for the current console before printing (Python 2)
    if type(string) is unicode:
        print(string.encode(sys.stdin.encoding or locale.getpreferredencoding(True)))
    elif type(string) is str:
        print(string.decode("utf-8").encode(sys.stdin.encoding or locale.getpreferredencoding(True)))

def raw_input_fit(string = ""):
if type(string) is unicode:
prompt = string.encode(sys.stdin.encoding or locale.getpreferredencoding(True))
elif type(string) is str:
prompt = string.decode("utf-8").encode(sys.stdin.encoding or locale.getpreferredencoding(True))
return raw_input(prompt).decode(sys.stdin.encoding or locale.getpreferredencoding(True))
def get_img_urls(containerid, page):
    # Query one page of the m.weibo.cn container API and collect the "large" image URLs
    url = "https://m.weibo.cn/api/container/getIndex?count=25&page=%s&containerid=%s" % (page, containerid)
    request = urllib2.Request(url=url, headers={'User-Agent': USER_AGENT})
    response = urllib2.urlopen(request)
    jsondata = json.loads(response.read())
    if len(jsondata["cards"]) == 0:
        return
    else:
        urls = []
        for card in jsondata["cards"]:
            if "mblog" in card:
                if "pics" in card["mblog"]:
                    for pic in card["mblog"]["pics"]:
                        if "large" in pic:
                            urls.append(pic["large"]["url"])
        return urls

def uid_to_containerid(uid):
    # A user's photo container id is "107603" followed by the 10-digit uid
    if re.search(r'^\d{10}$', uid) == None:
        return
    else:
        return "107603" + uid

def username_to_containerid(name):
    # Scrape the uid out of the mobile profile page, then build the container id
    url = "https://weibo.cn/" + name.encode("utf-8")
    request = urllib2.Request(url=url, headers={'User-Agent': USER_AGENT})
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError:
        return
    htmlback = response.read()
    response.close()
    find = re.search(r'<a href="/(\d{10})/info">', htmlback)
    if find == None:
        return
    else:
        uid = find.group(1)
        return "107603" + uid

def nickname_to_containerid(nickname):
    # The nickname URL redirects to the user's profile page, whose URL ends with the uid
    url = "https://m.weibo.com/n/" + nickname.encode("utf-8")
    request = urllib2.Request(url=url, headers={'User-Agent': USER_AGENT})
    try:
        response = urllib2.urlopen(request)
    except urllib2.HTTPError:
        return
    urlback = response.geturl()
    response.close()
    if urlback == url:
        return
    else:
        uid = urlback[27:]
        return "107603" + uid

def download(url):
    # Fetch one image, retrying up to 10 times on errors
    reconnect = 0
    while True:
        try:
            request = urllib2.Request(url=url, headers={'User-Agent': USER_AGENT})
            response = urllib2.urlopen(request, timeout=5)
            return response.read()
        except Exception as e:
            print_fit("Error: %s" % e)
            reconnect = reconnect + 1
            if reconnect > 10:
                return
            print_fit("Retry %d" % reconnect)

if __name__ == "__main__":
    # Ask for (and if needed create) the directory the images will be saved to
    while True:
        print_fit("Enter the directory to save images to:")
        home_path = raw_input_fit()
        if os.path.exists(home_path) == True:
            break
        confirm = raw_input_fit("The directory does not exist, create it? (Y/N): ")
        confirm = re.sub(r"\s", "", confirm)
        if confirm == "y" or confirm == "Y":
            try:
                os.makedirs(home_path)
            except:
                print_fit("Failed to create the directory, please create it manually or choose another one\n")
            else:
                break
        elif confirm == "n" or confirm == "N":
            print_fit("Please create it manually or choose another directory\n")
    # Resolve the account to a container id from a user ID, username, or nickname
    while True:
        print_fit("Choose how to identify the account to download:\n[1] user ID\n[2] username\n[3] nickname (recommended)")
        choice = raw_input_fit("(1/2/3): ")
        choice = re.sub(r"\s", "", choice)
        if choice == "1":
            uid = raw_input_fit("Enter the user ID: ")
            containerid = uid_to_containerid(uid)
            if containerid == None:
                print_fit("Invalid user ID\n")
            else:
                break
        elif choice == "2":
            name = raw_input_fit("Enter the username: ")
            containerid = username_to_containerid(name)
            if containerid == None:
                print_fit("User not found\n")
            else:
                break
        elif choice == "3":
            nickname = raw_input_fit("Enter the nickname: ")
            containerid = nickname_to_containerid(nickname)
            if containerid == None:
                print_fit("User not found\n")
            else:
                break
    # Walk the container page by page until no more image URLs come back
    i = 1
    downloadlist = []
    while True:
        print_fit("Scanning page: %d" % i)
        listback = get_img_urls(containerid, i)
        if listback != None:
            downloadlist.extend(listback)
            i = i + 1
            time.sleep(1)
        else:
            break
    print_fit("Scan finished")
    amount = len(downloadlist)
    print_fit("Number of images: %d" % amount)
    user_path = home_path + "/" + containerid[6:] + "/"
    if os.path.exists(user_path) == False:
        os.mkdir(user_path)
    # Download each image and save it under a zero-padded sequence number
    for x in xrange(1, amount + 1):
        print_fit("Downloading image: %d" % x)
        downloadUrl = downloadlist[x - 1]
        data = download(downloadUrl)
        filename = str(x).zfill(len(str(amount)))
        filetype = downloadUrl[-3:]
        if data == None:
            print_fit("Failed to download image %s" % downloadUrl.encode("utf-8"))
            continue
        f = open(user_path + filename + "." + filetype, 'wb')
        f.write(data)
        f.close()
    print_fit("All images downloaded, saved in %s" % user_path.encode("utf-8"))
    sys.stdin.read()  # keep the console window open
    exit()

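When run with Python 2, the script above prompts for a save directory and for how to identify the account (user ID, username, or nickname), then walks the container page by page and saves the images as zero-padded numbered files under <save directory>/<uid>/. The Python 3 rewrite below drops the interactive prompts and instead processes a batch of nicknames read from a text file.
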
#! /usr/bin/python3
# coding:utf-8
# Adapted from:
# Download Weibo Images without Logging-in (crawler)
# https://github.com/yAnXImIN/weiboPicDownloader
import os
import requests
import json
NICKNAMES_FILE = 'weibo_nicknames.txt'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

def get(url, stream=False, allow_redirects=True):
    # Thin wrapper around requests.get that logs the URL and applies the shared headers
    print(url)
    return requests.get(url=url, headers=HEADERS, stream=stream, allow_redirects=allow_redirects)

def save_image(nickname, url):
    # Save one image into WeiboAlbum/WeiboAlbum_<nickname>/, skipping files that already exist
    save_path = os.path.join('WeiboAlbum', "WeiboAlbum_" + nickname)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    image_path = os.path.join(save_path, nickname + '_' + url.split('/')[-1])
    if os.path.isfile(image_path):
        print("File already exists: " + image_path)
        return
    response = get(url=url, stream=True)
    image = response.content
    try:
        with open(image_path, "wb") as image_object:
            image_object.write(image)
        return
    except IOError:
        print("IO Error\n")
        return

def get_urls(containerid, page):
    # Fetch one page of the container API and return the "large" image URLs, or None when the page is empty
    url = "https://m.weibo.cn/api/container/getIndex?count=25&page={}&containerid={}".format(page, containerid)
    resp_text = get(url=url).text
    json_data = json.loads(resp_text)
    cards = json_data['cards']
    if not cards:
        return None
    photos = []
    for card in cards:
        mblog = card.get('mblog')
        if mblog:
            pics = mblog.get('pics')
            if pics:
                photos.extend([pic.get('large').get('url') for pic in pics])
    return photos

def nickname_to_containerid(nickname):
    # The nickname URL redirects to the profile page; the uid is the tail of the Location header
    url = "http://m.weibo.com/n/{}".format(nickname)
    resp = get(url, allow_redirects=False)
    cid = resp.headers['Location'][27:]
    return '107603{}'.format(cid)

def read_nicknames():
    # One or more nicknames per line, separated by whitespace
    nicknames = []
    with open(NICKNAMES_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            nicknames.extend(line.split())
    return nicknames

def handle_user(nickname):
    cid = nickname_to_containerid(nickname)
    if not cid:
        return
    # Collect image URLs page by page until an empty page comes back
    all_urls = []
    page = 0
    has_more = True
    while has_more:
        page += 1
        urls = get_urls(containerid=cid, page=page)
        has_more = bool(urls)
        if has_more:
            all_urls.extend(urls)
    count = len(all_urls)
    index = 0
    for url in all_urls:
        index += 1
        print('{} {}/{}'.format(nickname, index, count))
        save_image(nickname, url)

def main():
    for nickname in read_nicknames():
        handle_user(nickname)

if __name__ == '__main__':
    main()
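
To use the batch version, put the nicknames in weibo_nicknames.txt next to the script (one or more per line, separated by whitespace) and run it with Python 3; requests must be installed. Images are saved under WeiboAlbum/WeiboAlbum_<nickname>/, and files that already exist are skipped, so the script can be re-run to pick up only new pictures.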