#!/usr/bin/env python
# coding: utf8
import sys, os
import urllib2
import time
from threading import Thread
from BeautifulSoup import BeautifulSoup
import random

urlRoot = 'http://uuu.11com137aa.comwww.vvvv12.com'  # already blocked --!
sPage = 1       # first page number
ePage = 100     # last page number (fewer pages finish downloading faster, more pages take longer)
downCount = 10  # download/check passes per page (downloads sometimes fail; extra passes retry them without re-downloading what already exists)
DEFAULT_ENCODING = sys.getfilesystemencoding()

headers = [{"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0"},
           {"User-Agent":"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"},
           {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"},
           {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
           {"User-Agent":"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
           {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"}]
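
# Pick one of the User-Agent header dicts above at random, so successive
# requests don't all present the same client signature.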
def RandomHeaders():
    return random.choice(headers)

# Fetch a page's HTML source and return it as a str
def getHtmlSrc(url, header):
    req = urllib2.Request(url, headers=header)  # the second positional arg of Request is POST data, so pass the headers by keyword
    res = urllib2.urlopen(req, timeout=60)      # open the prepared request, not the bare URL, or the headers are ignored
    htmlSrc = res.read()
    res.close()                                 # close() must be called, not merely referenced
    return htmlSrc

# Save the fetched HTML source to a text file
def saveHtmlSrc(url, fileName):
    start = time.time()
    header = RandomHeaders()
    try:
        html = getHtmlSrc(url, header)
        with open(fileName, 'w') as f:
            f.write(html)
    except Exception as e:
        print "[-] Bad Header: ", header
    print "Time: %s s" % (time.time() - start)

# Collect the detail-page links (those starting with '/htm') from a saved list page
def getChildUrls(fileName, url):
    childUrlList = []
    with open(fileName, 'r') as f:
        bs = BeautifulSoup(f)
        hrefs = bs.findAll('a')
        for href in hrefs:
            hUrl = href.get('href')
            if hUrl and hUrl[:4] == '/htm':  # skip <a> tags that carry no href at all
                childUrlList.append(url + hUrl)
    return childUrlList

# Collect the absolute image URLs from a saved detail page
def getImgUrls(fileName):
    imgUrlList = []
    with open(fileName, 'r') as f:
        bs = BeautifulSoup(f)
        imgs = bs.findAll('img')
        for img in imgs:
            imgUrl = img.get('src')
            if imgUrl and imgUrl[:4] == 'http':  # skip <img> tags without an absolute src
                imgUrlList.append(imgUrl)
    print 'Urls: ', len(imgUrlList)
    return (len(imgUrlList), imgUrlList)

# Download one image into dirPath, skipping files that already exist
def saveImg(imgUrl, dirPath):
    start = time.time()
    fname = dirPath + '/' + str(imgUrl.split("/")[-1])
    if not os.path.exists(fname):
        header = RandomHeaders()
        try:
            req = urllib2.Request(imgUrl, headers=header)  # headers by keyword, not as POST data
            res = urllib2.urlopen(req, timeout=60)         # open the prepared request
            pic = res.read()
            res.close()
            with open(fname, "wb") as f:
                f.write(pic)
            # print fname
        except Exception as e:
            print imgUrl
            print "[-] Bad Header: ", header
    print "Time: %s s" % (time.time() - start)

# Return the page title up to the first '-', URL-unquoted (currently unused)
def getHtmlTitle(fileName):
    with open(fileName, "r") as f:
        bs = BeautifulSoup(f)
    return urllib2.unquote(str(bs.title.string).split('-')[0])

# One worker thread per image download
class catch(Thread):
    def __init__(self, url, dirPath):
        Thread.__init__(self)
        self.url = url
        self.dirPath = dirPath

    def run(self):
        saveImg(self.url, self.dirPath)

if __name__ == '__main__':
    for page in range(sPage, ePage + 1):
        for i in range(downCount):
            urlPage = urlRoot + '/p01/list_%d.html' % page
            print urlPage
            dirPathHtml = unicode(sys.path[0] + "/html/" + str(page) + '/', DEFAULT_ENCODING)
            if not os.path.exists(dirPathHtml):
                os.makedirs(dirPathHtml)
            htmlFile = dirPathHtml + '%s.html' % str(page)
            if not os.path.exists(htmlFile):
                # First pass: fetch the list page itself
                saveHtmlSrc(urlPage, htmlFile)
            else:
                # Later passes: walk the saved list page and fetch its children
                childUrlList = getChildUrls(htmlFile, urlRoot)
                for childUrl in childUrlList:
                    start = time.time()
                    childHtmlFile = dirPathHtml + childUrl.split("/")[-1]
                    print childUrl
                    if not os.path.exists(childHtmlFile):
                        saveHtmlSrc(childUrl, childHtmlFile)
                    else:
                        threadCount, imgUrls = getImgUrls(childHtmlFile)
                        if not imgUrls:  # nothing to download from this page
                            continue
                        dirPathPic = unicode(sys.path[0] + "/pic/" + str(page) + '/', DEFAULT_ENCODING)
                        if not os.path.exists(dirPathPic):
                            os.makedirs(dirPathPic)
                        dirName = unicode(urllib2.unquote(str(imgUrls[0].split("/")[-2])), 'utf8')
                        if not os.path.exists(dirPathPic + dirName):
                            os.makedirs(dirPathPic + dirName)
                        print 'Files: ', sum([len(files) for root, dirs, files in os.walk(dirPathPic + dirName)])
                        threads = [catch(imgUrls[tc], dirPathPic + dirName) for tc in range(threadCount)]
                        for c in threads:
                            c.start()
                        for c in threads:  # join every worker, not only the last one started
                            c.join()
                    print "Total time: %s" % (time.time() - start)
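
The script as a whole targets Python 2: it relies on urllib2, print statements, and the old BeautifulSoup 3 package, none of which exist on Python 3. As a rough porting aid, here is a minimal sketch of the fetch helper on Python 3 using only the standard library; the abbreviated header list and the name get_html_src are illustrative, not part of the original gist.

#!/usr/bin/env python3
# Minimal Python 3 sketch of getHtmlSrc (illustrative, not from the gist):
# urllib2 was split into urllib.request / urllib.error in Python 3.
import random
import urllib.request

HEADERS = [  # abbreviated stand-in for the gist's full headers list
    {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
]

def get_html_src(url, timeout=60):
    # Build the request with a random User-Agent, then open the request
    # object itself so the header is actually sent.
    req = urllib.request.Request(url, headers=random.choice(HEADERS))
    with urllib.request.urlopen(req, timeout=timeout) as res:
        return res.read()  # bytes; call .decode() on the result for text

On Python 3 the response object is a context manager, so the with block replaces the explicit close() call used above.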