Last active
December 17, 2015 21:39
-
-
Save Q2h1Cg/5675751 to your computer and use it in GitHub Desktop.
百度接口批量扫描备份
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: cp936 -*- | |
''' | |
百度关键字批量扫描网站备份。 | |
Author: Chu | |
Blog: www.chuhades.com | |
用法: python xx.py 关键字 页面数 字典文件 | |
Example: python xx.py 关键字 30 rars.txt''' | |
from re import compile | |
from sys import argv | |
from threading import Thread | |
import urllib2 | |
re_baidu_result = compile('<span class="g">(.+?)/.+?</span>') | |
class MyThread(Thread):
    '''Worker thread that runs an arbitrary callable with positional args.'''

    def __init__(self, func, args):
        # Stash the target callable and its argument tuple; run() invokes
        # them later on the worker thread.
        super(MyThread, self).__init__()
        self.func = func
        self.args = args

    def run(self):
        # Unpack the stored tuple as positional arguments.
        self.func(*self.args)
def getsites():
    '''Collect unique site hosts from Baidu search result pages.

    Reads the keyword from argv[1] and the page count from argv[2]
    (set by main's argument check).  Returns the hosts in first-seen
    order, deduplicated.
    '''
    sites = []
    seen = set()  # O(1) membership test instead of scanning the list
    keyword = argv[1].replace(' ', '%20')
    pages = int(argv[2])
    for page in xrange(pages):
        # Baidu paginates with pn = page * 10; the original glued a
        # literal "0" onto "%d" to get the same value.
        url = 'http://www.baidu.com/s?wd=%s&pn=%d' % (keyword, page * 10)
        req = urllib2.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0')
        html = urllib2.urlopen(req).read()
        for match in re_baidu_result.finditer(html):
            site = match.group(1).strip()
            if site not in seen:
                seen.add(site)
                sites.append(site)
    return sites
def check_one(url, result_list): | |
'''单个扫描''' | |
req = urllib2.Request(url) | |
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 5.1; rv:17.0) Gecko/20100101 Firefox/17.0') | |
try: | |
x = urllib2.urlopen(req) | |
if x.getcode() == 200 and x.geturl() == url and (x.headers.type == "application/x-rar-compressed" or x.headers.type == "application/zip"): | |
result_list.append("<a href='%s' target='_blank'>%s</a><br>" % (url, url)) | |
print ' 发现 %s!' % url | |
x.close() | |
except BaseException, e: | |
pass | |
def check(site, rars_txt, result_list): | |
'''批量扫描''' | |
print '扫描 %s 中...' % site | |
rars = [i.strip() for i in open(rars_txt)] | |
for i in ('zip', 'rar'): | |
rars.append(site+'.'+i) | |
rars.append(site.split('.')[-2]+'.'+site.split('.')[-1]+'.'+i) | |
threads = [MyThread(check_one, ('http://%s/%s' % (site, i), result_list)) for i in rars] | |
for i in threads: i.start() | |
for i in threads: i.join() | |
def main(): | |
'''主函数''' | |
sites = getsites() | |
result_list = [] | |
rars_txt = argv[3] | |
print '共 %d 个网站,请骚等...\n' % len(sites) | |
for i in sites: check(i, rars_txt, result_list) | |
f = open('result.html', 'w') | |
f.write('''<head><title>扫描结果</title></head> | |
<center><h1>扫描结果</h1></center> | |
<hr/> | |
共扫描 %d 个网站,结果如下:<br /><br/>''' % len(sites)) | |
for i in result_list: f.write(i) | |
f.close() | |
print '扫描完毕,请查看目录下的result.html。' | |
if __name__ == '__main__': | |
if len(argv) == 4: | |
main() | |
else: | |
print __doc__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment