Skip to content

Instantly share code, notes, and snippets.

@dodola
Created May 8, 2014 00:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dodola/4d4cd82b298802026246 to your computer and use it in GitHub Desktop.
Save dodola/4d4cd82b298802026246 to your computer and use it in GitHub Desktop.
__author__ = 'dodola'
#encoding: utf-8
from time import sleep, ctime
import time
import urllib.request
import threading
import contextlib
import queue
import string
import shutil
import os
import mimetypes
import tempfile
import json
import glob
from urllib.error import URLError, HTTPError, ContentTooShortError
import re
from urllib.parse import (
urlparse, urlsplit, urljoin, unwrap, quote, unquote,
splittype, splithost, splitport, splituser, splitpasswd,
splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
# URL templates for huaban.com; each "%s" is filled in with an id or key
# using the "%" operator.

# Zoom page of a single pin (defined here but not used by the functions below).
PINSURL = "http://huaban.com/pins/%s/zoom/"
# HTML page of a board (only referenced from commented-out code).
BROADURL="http://huaban.com/boards/%s/"
# Board request used by donwloadBroad(); the response is parsed as JSON.
# NOTE(review): "huwu7jsv" looks like an opaque token - meaning unknown.
NURL="http://huaban.com/boards/%s/?huwu7jsv&limit=20000&wfl=1"
# User request used by downloadUser(); the response is parsed as JSON.
# NOTE(review): "huwzcasa" looks like an opaque token - meaning unknown.
UURL="http://huaban.com/%s/?huwzcasa&limit=1000&wfl=1"
# Image download URL; filled in with a pin's file key by donwloadBroad().
DROOTURL="http://img.hb.aicdn.com/%s"
# Initialise the MIME type tables; donwloadBroad() uses
# mimetypes.guess_extension() to pick a file extension for each image.
mimetypes.init()
def myurlretrieve(url, filename=None, reporthook=None, data=None):
    """Download ``url`` into a local file and return ``(filename, headers)``.

    The payload is never left behind in a truncated state: when ``filename``
    is given, the data is first written to ``filename + ".part"`` and only
    moved onto ``filename`` once the whole body has been received.  A caller
    that skips already-existing files therefore cannot mistake an
    interrupted download for a finished one.

    Args:
        url: Address to fetch, as a string.
        filename: Destination path.  When omitted, a named temporary file
            is created and its path is returned.
        reporthook: Optional callable ``hook(blocknum, nbytes, size)``.  It
            is called once with ``(0, 0, size)`` before reading and then
            after every block with the number of bytes in that block;
            ``size`` is ``-1`` when no Content-Length header was sent.
        data: Optional request body handed to ``urllib.request.Request``.

    Returns:
        A ``(filename, headers)`` tuple.  For a ``file:`` URL without
        ``filename`` nothing is copied and the normalised local path is
        returned instead.

    Raises:
        ContentTooShortError: Fewer bytes arrived than Content-Length
            announced.  The partial file is removed before raising.
    """
    # urlsplit() replaces the deprecated urllib.parse.splittype(); both
    # yield the lower-cased scheme.
    url_type = urlsplit(url).scheme
    req = urllib.request.Request(url, data)
    with contextlib.closing(urllib.request.urlopen(req)) as fp:
        headers = fp.info()
        if url_type == "file" and not filename:
            # Everything after the "file:" prefix is the local path.
            return os.path.normpath(url.partition(":")[2]), headers

        if filename:
            # Stage next to the target so the final os.replace() stays on
            # the same filesystem.
            target = os.fspath(filename)
            suffix = b".part" if isinstance(target, bytes) else ".part"
            staging = target + suffix
            staged = True
            tfp = open(staging, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = staging = tfp.name
            staged = False

        result = filename, headers
        try:
            with tfp:
                bs = 1024 * 8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, 0, size)
                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, len(block), size)
            if size >= 0 and read < size:
                raise ContentTooShortError(
                    "retrieval incomplete: got only %i out of %i bytes"
                    % (read, size), result)
            if staged:
                # The download is complete: publish it under its real name.
                os.replace(staging, filename)
        except BaseException:
            # Any failure (network error, short read, hook error, Ctrl-C)
            # must not leave a partial file on disk.
            with contextlib.suppress(OSError):
                os.remove(staging)
            raise
    return result
def validateTitle(title):
    """Return ``title`` with every file-name-unsafe character removed.

    The characters dropped are: / \\ : * ? " < > |
    All other characters are kept unchanged and in order.
    """
    # Character class matching any single one of the characters listed above.
    unsafe_chars = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return unsafe_chars.sub("", title)
def donwloadBroad(bid, savePath):
    """Download every pin image of one Huaban board into its own folder.

    The board is requested through the ``NURL`` template and the response
    is parsed as JSON.  Images are stored as
    ``<savePath>/<sanitised board title>/<pin_id><extension>``; a pin whose
    target file already exists is skipped, so the function can be re-run to
    resume an earlier download.

    Args:
        bid: Board id, substituted into ``NURL``.
        savePath: Directory under which the board folder is created.

    Raises:
        urllib.error.URLError: The board request or an image download
            failed (propagated unchanged).
        KeyError: The JSON response lacks one of the expected fields.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
    # NOTE(review): the X-Requested-With / X-Request headers presumably make
    # the site answer with JSON instead of an HTML page - confirm.
    headers = {
        'User-Agent': user_agent,
        "Content-Type": "application/json;charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "X-Request": "JSON",
    }
    req = urllib.request.Request(NURL % (bid,), headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        jsonstr = str(res.read(), "utf8")
    print(jsonstr)
    board = json.loads(jsonstr)["board"]
    savePath = savePath + '/' + validateTitle(board["title"])
    os.makedirs(savePath, exist_ok=True)
    for pin in board["pins"]:
        pid = str(pin["pin_id"])
        print("下载" + pid)
        # guess_extension() returns None for unknown MIME types; fall back
        # to no extension rather than a literal "None" suffix.
        ext = mimetypes.guess_extension(pin["file"]["type"], True) or ""
        saveFilePath = "%s/%s%s" % (savePath, pid, ext)
        if os.path.exists(saveFilePath):
            continue
        myurlretrieve(DROOTURL % pin["file"]["key"], saveFilePath)
# donwloadBroad("14405824","d:/huaban/")
# donwloadBroad("7472188","d:/huaban/")
# donwloadBroad("15801564","d:/huaban/")
# donwloadBroad("15815753","d:/huaban/")
# donwloadBroad("15816943","d:/huaban/")
# donwloadBroad("15814198","d:/huaban/")
# donwloadBroad("3879637","d:/huaban/")
def downloadUser(id, savePath="L:/huaban2/"):
    """Download all boards of one Huaban user.

    The user is requested through the ``UURL`` template, the response is
    parsed as JSON, and ``donwloadBroad`` is called for every board listed
    in it.

    Args:
        id: User identifier, substituted into ``UURL``.  (The name shadows
            the ``id`` builtin but is kept so existing callers still work.)
        savePath: Directory that receives one sub-folder per board.
            Defaults to the location that used to be hard-coded.

    Raises:
        urllib.error.URLError: The user request or any download failed
            (propagated unchanged).
        KeyError: The JSON response lacks one of the expected fields.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
    # NOTE(review): the X-Requested-With / X-Request headers presumably make
    # the site answer with JSON instead of an HTML page - confirm.
    headers = {
        'User-Agent': user_agent,
        "Content-Type": "application/json;charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "X-Request": "JSON",
    }
    req = urllib.request.Request(UURL % (id,), headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        jsonstr = str(res.read(), "utf8")
    print(jsonstr)
    for board in json.loads(jsonstr)["user"]["boards"]:
        donwloadBroad(board["board_id"], savePath)
# Script entry point: download all boards of the Huaban user "gxpgxt".
# NOTE(review): this call sits at module level, so it also runs whenever the
# file is imported - consider an `if __name__ == "__main__":` guard.
downloadUser("gxpgxt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment