Batch scanning for website backups via Baidu search
# -*- coding: cp936 -*-
'''
Batch scan for website backup archives using Baidu keyword search results.
Author: Chu
Blog: www.chuhades.com
Usage: python xx.py <keyword> <pages> <dictionary file>
Example: python xx.py keyword 30 rars.txt'''
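# A minimal sketch of the dictionary file format, assumed from how the script
# reads it: one candidate backup path per line, relative to the site root.
# The names below are hypothetical examples, not part of the original gist:
#
#     wwwroot.rar
#     wwwroot.zip
#     web.rar
#     backup.zip
#     database.rar
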
from re import compile
from sys import argv
from threading import Thread
import urllib2
re_baidu_result = compile('<span class="g">(.+?)/.+?</span>')

class MyThread(Thread):
    '''Worker thread that runs func(*args)'''
    def __init__(self, func, args):
        Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)

def getsites():
    '''Collect unique site hosts from Baidu search result pages'''
    sites = []
    keyword = argv[1].replace(' ', '%20')
    pages = argv[2]
    for i in xrange(int(pages)):
        # Baidu paginates with pn=0, 10, 20, ..., hence the trailing literal 0.
        url = 'http://www.baidu.com/s?wd=%s&pn=%d0' % (keyword, i)
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0')
        html = urllib2.urlopen(req).read()
        for j in re_baidu_result.finditer(html):
            if j.group(1).strip() not in sites:
                sites.append(j.group(1).strip())
    return sites

def check_one(url, result_list):
    '''Probe a single URL for a downloadable archive'''
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0')
    try:
        x = urllib2.urlopen(req)
        # A hit: HTTP 200, no redirect, and the response served as a RAR or ZIP archive.
        if x.getcode() == 200 and x.geturl() == url and (x.headers.type == "application/x-rar-compressed" or x.headers.type == "application/zip"):
            result_list.append("<a href='%s' target='_blank'>%s</a><br>" % (url, url))
            print ' Found %s!' % url
        x.close()
    except BaseException:
        pass

def check(site, rars_txt, result_list):
    '''Scan one site against the whole dictionary'''
    print 'Scanning %s ...' % site
    rars = [i.strip() for i in open(rars_txt)]
    # Also try archives named after the host itself, e.g. for www.example.com:
    # www.example.com.zip/.rar and example.com.zip/.rar.
    for i in ('zip', 'rar'):
        rars.append(site + '.' + i)
        rars.append(site.split('.')[-2] + '.' + site.split('.')[-1] + '.' + i)
    threads = [MyThread(check_one, ('http://%s/%s' % (site, i), result_list)) for i in rars]
    for i in threads: i.start()
    for i in threads: i.join()

def main():
    '''Entry point'''
    sites = getsites()
    result_list = []
    rars_txt = argv[3]
    print 'Got %d sites in total, please wait...\n' % len(sites)
    for i in sites: check(i, rars_txt, result_list)
    f = open('result.html', 'w')
    f.write('''<head><title>Scan results</title></head>
<center><h1>Scan results</h1></center>
<hr/>
Scanned %d sites; results:<br /><br/>''' % len(sites))
    for i in result_list: f.write(i)
    f.close()
    print 'Scan finished. See result.html in the current directory.'

if __name__ == '__main__':
    if len(argv) == 4:
        main()
    else:
        print __doc__
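
# Example of the generated output, assuming a hit at the hypothetical URL
# http://www.example.com/wwwroot.zip: result.html would then contain a line like
#   <a href='http://www.example.com/wwwroot.zip' target='_blank'>http://www.example.com/wwwroot.zip</a><br>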