@magicdawn
Created September 11, 2014 14:30
Tieba image downloader, Python 3 version
# Beautiful Soup docs: http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

DEBUG = False  # debug mode toggle
EXAMPLE_URL = 'http://tieba.baidu.com/p/3287469841'  # example thread URL
PAGE_ENCODING = "gbk"  # encoding of the returned page; the default is fine
IMAGE_DIR = 'image'  # directory the images are saved under
def getTitle(soup):
    '''Get the thread title, used as the folder name.'''
    title = soup.select("h1.core_title_txt")[0]['title']
    return title.strip()

def getImgSrcs(soup):
    '''Collect the img src attributes on the current page.'''
    return [img['src'] for img in soup.select("img.BDE_Image")]

def has_next_page(soup):
    '''Is there still a next page? (stub: always treat the thread as one page)'''
    return False

def get_next_page(soup):
    '''Get the URL of the next page (stub).'''
    return ''
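
# One possible way to fill in the two stubs above, kept under separate
# names so the single-page behavior of the script is unchanged. The pager
# link text '下一页' ("next page") is an assumption about Tieba's markup,
# not verified against the live page.
def has_next_page_sketch(soup):
    return soup.find('a', text='下一页') is not None

def get_next_page_sketch(soup):
    link = soup.find('a', text='下一页')
    return link['href'] if link is not None else ''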
import os
import sys
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# import pyquery as pq
# Fetch a page and decode it to str.
def request(url):
    global PAGE_ENCODING
    res = urllib.request.urlopen(url)
    html = res.read()
    if PAGE_ENCODING is None:
        PAGE_ENCODING = "utf8"
    return html.decode(PAGE_ENCODING)
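
# Instead of hard-coding PAGE_ENCODING, the charset can usually be read
# from the Content-Type response header. A minimal sketch (the gbk
# fallback is an assumption for Tieba pages):
def detect_encoding(res):
    return res.headers.get_content_charset() or 'gbk'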
# Write one line. In Python 3, encode via the file object: open the file
# in text mode with encoding='utf8' and write str directly (adding bytes
# to str, as the original encode() call did, raises a TypeError).
def writeLine(f, content):
    '''
    f: a file opened in text mode, e.g. open(path, 'w', encoding='utf8')
    content: str
    '''
    f.write(content + '\n')
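
# Example use of writeLine (the log file name is hypothetical); note the
# text-mode open with an explicit encoding:
#   with open('srcs.log', 'w', encoding='utf8') as f:
#       writeLine(f, EXAMPLE_URL)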
if DEBUG:  # in debug mode, fall back to the example URL
    url = EXAMPLE_URL
elif len(sys.argv) == 1:  # normal run without arguments: show usage
    print("Please pass the URL of the page the images are on (e.g. {0})".format(EXAMPLE_URL))
    exit()
else:  # normal run: download from the given URL
    url = sys.argv[1]  # python down.py http://xxxx
soup = BeautifulSoup(request(url), "html.parser")  # build the soup; an explicit parser avoids the bs4 warning
title = getTitle(soup)  # the title is taken from the current page only
print("Image series: {0}".format(title))
# image directory: image/<title>/
if IMAGE_DIR is None or IMAGE_DIR == '':
    # save under a <title>/ folder in the current directory
    base_path = title + "/"
    if not os.path.exists(title):
        os.mkdir(title)
else:
    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)
    base_path = IMAGE_DIR + "/" + title + "/"
    if os.path.exists(IMAGE_DIR + "/" + title):
        answer = input("Already downloaded! Download again (y/n)? ")
        if answer != 'y' and answer != 'yes':
            exit()
    else:
        os.mkdir(IMAGE_DIR + "/" + title)
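
# A more portable alternative for the path handling above: os.path.join
# and os.makedirs(..., exist_ok=True) avoid the manual "/" concatenation.
# Sketch only, not wired in:
#   base_path = os.path.join(IMAGE_DIR or '.', title)
#   os.makedirs(base_path, exist_ok=True)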
cur_url = url
srcs = [urljoin(url, src) for src in getImgSrcs(soup)]

# collect srcs from every following page
while has_next_page(soup):
    next_url = get_next_page(soup)  # address of the next page
    next_url = urljoin(cur_url, next_url)  # resolve it against the current page
    cur_url = next_url
    soup = BeautifulSoup(request(next_url), "html.parser")  # soup for the next page
    srcs += [urljoin(cur_url, src) for src in getImgSrcs(soup)]  # Python lists have no .concat
index = 1
for src in srcs:
    # image/<title>/01.jpg
    dot = src.rindex('.')
    ext = src[dot:]  # e.g. ".jpg"
    path = base_path + "{0:02}".format(index) + ext
    print("Downloading image {0:02}: {1}".format(index, src))
    urllib.request.urlretrieve(src, path)
    index += 1
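
# A more forgiving variant of the download step above: skip an image that
# fails instead of aborting the whole run. This is a sketch, not part of
# the original script.
import urllib.error

def try_download(src, path):
    try:
        urllib.request.urlretrieve(src, path)
        return True
    except (urllib.error.URLError, OSError) as e:
        print("Failed to download {0}: {1}".format(src, e))
        return False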