Last active
December 17, 2015 21:39
-
-
Save Q2h1Cg/5675751 to your computer and use it in GitHub Desktop.
百度接口批量扫描备份
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: cp936 -*- | |
''' | |
百度关键字批量扫描网站备份。 | |
Author: Chu | |
Blog: www.chuhades.com | |
用法: python xx.py 关键字 页面数 字典文件 | |
Example: python xx.py 关键字 30 rars.txt''' | |
from re import compile | |
from sys import argv | |
from threading import Thread | |
import urllib2 | |
re_baidu_result = compile('<span class="g">(.+?)/.+?</span>') | |
class MyThread(Thread):
    '''Worker thread that runs an arbitrary callable with positional args.'''

    def __init__(self, func, args):
        # Stash the target callable and its argument tuple; run() invokes
        # them later on the worker thread.
        super(MyThread, self).__init__()
        self.func = func
        self.args = args

    def run(self):
        # Unpack the stored tuple as positional arguments.
        self.func(*self.args)
def getsites():
    '''Collect unique site hosts from Baidu search result pages.

    Reads the keyword from argv[1] and the page count from argv[2]
    (set by main's argument check).  Returns the hosts in first-seen
    order, deduplicated.
    '''
    sites = []
    seen = set()  # O(1) membership test instead of scanning the list
    keyword = argv[1].replace(' ', '%20')
    pages = int(argv[2])
    for page in xrange(pages):
        # Baidu paginates with pn = page * 10; the original glued a
        # literal "0" onto "%d" to get the same value.
        url = 'http://www.baidu.com/s?wd=%s&pn=%d' % (keyword, page * 10)
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0')
        html = urllib2.urlopen(req).read()
        for match in re_baidu_result.finditer(html):
            site = match.group(1).strip()
            if site not in seen:
                seen.add(site)
                sites.append(site)
    return sites
def check_one(url, result_list): | |
'''单个扫描''' | |
req = urllib2.Request(url) | |
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0') | |
try: | |
x = urllib2.urlopen(req) | |
if x.getcode() == 200 and x.geturl() == url and (x.headers.type == "application/x-rar-compressed" or x.headers.type == "application/zip"): | |
result_list.append("<a href='%s' target='_blank'>%s</a><br>" % (url, url)) | |
print ' 发现 %s!' % url | |
x.close() | |
except BaseException, e: | |
pass | |
def check(site, rars_txt, result_list): | |
'''批量扫描''' | |
print '扫描 %s 中...' % site | |
rars = [i.strip() for i in open(rars_txt)] | |
for i in ('zip', 'rar'): | |
rars.append(site+'.'+i) | |
rars.append(site.split('.')[-2]+'.'+site.split('.')[-1]+'.'+i) | |
threads = [MyThread(check_one, ('http://%s/%s' % (site, i), result_list)) for i in rars] | |
for i in threads: i.start() | |
for i in threads: i.join() | |
def main(): | |
'''主函数''' | |
sites = getsites() | |
result_list = [] | |
rars_txt = argv[3] | |
print '共 %d 个网站,请骚等...\n' % len(sites) | |
for i in sites: check(i, rars_txt, result_list) | |
f = open('result.html', 'w') | |
f.write('''<head><title>扫描结果</title></head> | |
<center><h1>扫描结果</h1></center> | |
<hr/> | |
共扫描 %d 个网站,结果如下:<br /><br/>''' % len(sites)) | |
for i in result_list: f.write(i) | |
f.close() | |
print '扫描完毕,请查看目录下的result.html。' | |
if __name__ == '__main__': | |
if len(argv) == 4: | |
main() | |
else: | |
print __doc__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment