Created
July 14, 2014 13:57
-
-
Save anonymous/edd25344684a48453858 to your computer and use it in GitHub Desktop.
豆瓣小组—图片采集—练习
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -* | |
import urllib | |
import urllib2 | |
import re | |
import time | |
import sys | |
#解决中文报错 | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
#HuoQ = 'http://www.douban.com/group/haixiuzu/?ref=sidebar' | |
### 代理模块(全局代理) | |
print '#'*50 | |
print '本程序主要采集豆瓣<请不要害羞>小组的图片' | |
print '#'*50 | |
print '采集前需要输入代理服务器地址,这样可以防止被豆瓣屏蔽.' | |
print '推荐一个代理地址: http://cn-proxy.com/' | |
print '只需要输入服务器地址以及端口号,不需要输入http' | |
print '例子:127.0.0.1:8080' | |
print '#'*50 | |
proxy_input = raw_input('请输入采集代理服务器:') | |
proxy_handler = urllib2.ProxyHandler({'http':'%s'%proxy_input}) | |
opener = urllib2.build_opener(proxy_handler) | |
urllib2.install_opener(opener) | |
#采集本地路径全局变量 | |
#img_LuJ = raw_input("路径:") | |
#/home/o21/文档/PythonEX/Doubanimg | |
#设置UserAgent | |
#req = urllib2.Request(HuoQ) | |
#req.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36') | |
#reader = urllib2.urlopen(req) | |
#输出 | |
#print reader.read() | |
#模块化输出 | |
#获取帖子单页html | |
def gethtml2(url2): | |
html2 = urllib2.urlopen(url2).read().decode('utf-8') | |
return html2 | |
#抽取图片并下载列表 | |
def gettoimg(html2): | |
#添加正则 匹配url路径数字 d+代表获取0-9无限循环\转义符号 | |
reg2 = r'http://www.douban.com/group/topic/\d+' | |
toplist = re.findall(reg2,html2) | |
x = 0 | |
#限制下载图片数量 | |
#输出topicurl 每次输出一个 的循环 | |
for topicurl in toplist: | |
x+=1 | |
return topicurl | |
#下载图片到本地 | |
def download(topic_page): | |
#获取贴内图片 正则 ".+\" .匹配任意字符 + 匹配前一个字符或无限次 \ 转移符号 也就是 匹配所有字符 | |
reg3 = r'http://img3.douban.com/view/group_topic/large/public/.+\.jpg' | |
imglist = re.findall(reg3,topic_page) | |
i = 1 | |
download_img = None | |
for imgurl in imglist: | |
#取图片id为文件名 | |
img_numlist = re.findall(r'p\d{7}',imgurl) | |
for img_num in img_numlist: | |
download_img = urllib.urlretrieve(imgurl,'/home/o21/文档/PythonEX/Doubanimg/%s.jpg'%img_num) | |
time.sleep(1) | |
i+=1 | |
print (imgurl) | |
return download_img | |
page_end = int(input('请输入采集页码数:')) | |
num_end = page_end*25 | |
num = 0 | |
page_num = 1 | |
while num<=num_end: | |
#获取页面数 从 0 开始 | |
html2 = gethtml2('http://www.douban.com/group/haixiuzu/discussion?start=%d'%num) | |
#抽取下载图片 | |
topicurl = gettoimg(html2) | |
topic_page = gethtml2(topicurl) | |
download_img=download(topic_page) | |
# download_img = download(topic_page) | |
num = page_num*25 | |
page_num+=1 | |
else: | |
print('程序采集完成') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment