@abcfy2
Created December 3, 2014 10:01
#!/usr/bin/env python
#coding:utf8
import sys, os
import urllib2
import time
from threading import Thread
from BeautifulSoup import BeautifulSoup
import random
urlRoot = 'http://uuu.11com137aa.comwww.vvvv12.com' # domain already blocked --!
sPage = 1 # first page to fetch
ePage = 100 # last page to fetch (fewer pages finish sooner, more pages take longer)
downCount = 10 # download/check passes per page (downloads sometimes fail, so run several passes; nothing already on disk is fetched twice)
DEFAULT_ENCODING = sys.getfilesystemencoding()
# Pool of User-Agent headers to rotate through, one per request.
headers = [{"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0"},
           {"User-Agent":"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"},
           {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"},
           {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
           {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
           {"User-Agent":"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"},
           {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
           {"User-Agent":"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
           {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101"},
           {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"}]
def RandomHeaders():
    return random.choice(headers)
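# Illustrative usage (an assumption, mirroring the request calls below):
# rotating the User-Agent on every request is this script's whole
# anti-blocking strategy, e.g.
#   html = urllib2.urlopen(urllib2.Request(url, headers=RandomHeaders()), timeout=60).read()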
# Fetch a page's HTML source and return it as a str.
def getHtmlSrc(url, header):
    req = urllib2.Request(url, headers=header)
    res = urllib2.urlopen(req, timeout=60)
    htmlSrc = res.read()
    res.close()
    return htmlSrc
# Save a fetched page's source to a local file.
def saveHtmlSrc(url, fileName):
    start = time.time()
    header = RandomHeaders()
    try:
        html = getHtmlSrc(url, header)
        with open(fileName, 'w') as f:
            f.write(html)
    except Exception as e:
        print "[-] Error: %s (header: %s)" % (e, header)
    print "Time: %s s" % (time.time() - start)
# Collect the child page links (hrefs starting with '/htm') from a saved list page.
def getChildUrls(fileName, url):
    childUrlList = []
    with open(fileName, 'r') as f:
        bs = BeautifulSoup(f)
        hrefs = bs.findAll('a')
        for href in hrefs:
            hUrl = href.get('href')
            if hUrl and hUrl[:4] == '/htm':
                childUrlList.append(url + hUrl)
    return childUrlList
# Collect the absolute image URLs from a saved child page.
def getImgUrls(fileName):
    imgUrlList = []
    with open(fileName, 'r') as f:
        bs = BeautifulSoup(f)
        imgs = bs.findAll('img')
        for img in imgs:
            imgUrl = img.get('src')
            if imgUrl and imgUrl[:4] == 'http':
                imgUrlList.append(imgUrl)
    print 'Urls: ', len(imgUrlList)
    return (len(imgUrlList), imgUrlList)
# Download one image into dirPath; files already on disk are skipped.
def saveImg(imgUrl, dirPath):
    start = time.time()
    fname = dirPath + '/' + str(imgUrl.split("/")[-1])
    if not os.path.exists(fname):
        header = RandomHeaders()
        try:
            req = urllib2.Request(imgUrl, headers=header)
            res = urllib2.urlopen(req, timeout=60)
            pic = res.read()
            res.close()
            with open(fname, "wb") as f:
                f.write(pic)
            # print fname
        except Exception as e:
            print imgUrl
            print "[-] Error: %s (header: %s)" % (e, header)
        print "Time: %s s" % (time.time() - start)
# Read a saved page's title (the part before the first '-').
def getHtmlTitle(fileName):
    with open(fileName, "r") as f:
        bs = BeautifulSoup(f)
        return urllib2.unquote(str(bs.title.string).split('-')[0])
# Worker thread: download a single image URL into dirPath.
class catch(Thread):
    def __init__(self, url, dirPath):
        Thread.__init__(self)
        self.url = url
        self.dirPath = dirPath

    def run(self):
        saveImg(self.url, self.dirPath)
if __name__ == '__main__':
    for page in range(sPage, ePage + 1):
        # Each pass over a page fills in whatever is still missing: the first
        # pass saves the list page, later passes save its child pages and then
        # the images, skipping anything already on disk.
        for i in range(downCount):
            urlPage = urlRoot + '/p01/list_%d.html' % page
            print urlPage
            dirPathHtml = unicode(sys.path[0] + "/html/" + str(page) + '/', DEFAULT_ENCODING)
            if not os.path.exists(dirPathHtml):
                os.makedirs(dirPathHtml)
            htmlFile = dirPathHtml + '%s.html' % str(page)
            if not os.path.exists(htmlFile):
                saveHtmlSrc(urlPage, htmlFile)
            else:
                childUrlList = getChildUrls(htmlFile, urlRoot)
                for childUrl in childUrlList:
                    start = time.time()
                    childHtmlFile = dirPathHtml + childUrl.split("/")[-1]
                    print childUrl
                    if not os.path.exists(childHtmlFile):
                        saveHtmlSrc(childUrl, childHtmlFile)
                    else:
                        threadCount, imgUrls = getImgUrls(childHtmlFile)
                        if not imgUrls:
                            continue
                        dirPathPic = unicode(sys.path[0] + "/pic/" + str(page) + '/', DEFAULT_ENCODING)
                        if not os.path.exists(dirPathPic):
                            os.makedirs(dirPathPic)
                        dirName = unicode(urllib2.unquote(str(imgUrls[0].split("/")[-2])), 'utf8')
                        if not os.path.exists(dirPathPic + dirName):
                            os.makedirs(dirPathPic + dirName)
                        print 'Files: ', sum([len(files) for root, dirs, files in os.walk(dirPathPic + dirName)])
                        # Start one download thread per image, then wait for
                        # all of them to finish.
                        threads = []
                        for tc in range(threadCount):
                            c = catch(imgUrls[tc], dirPathPic + dirName)
                            c.start()
                            threads.append(c)
                        for c in threads:
                            c.join()
                        print "Total time: %s" % (time.time() - start)