Skip to content

Instantly share code, notes, and snippets.

@dingyaguang117
Last active August 29, 2015 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dingyaguang117/088d07216ca52e13288a to your computer and use it in GitHub Desktop.
#coding=utf-8
__author__ = 'ding'
from gevent import monkey
# Patch the stdlib (sockets, DNS, etc.) so blocking calls made by `requests`
# cooperate with gevent greenlets.  Must run before any socket-using import.
monkey.patch_all()
from gevent.pool import Pool
import requests
# Domains to probe; the scheme ('http://') is prepended by crawl_title().
# NOTE(review): list looks like arbitrary sample data — confirm provenance.
sites = [u'0i4t.com', u'0757dk.com', u'0007ka.com', u'003782.com', u'0735.com', u'04336789.com', u'078078.com', u'0851qy.com', u'008321.com', u'0427d7.se', u'010sf.com', u'0476com.com', u'0470a.com', u'0163.com', u'00992.com', u'0759job.com', u'0437t.com', u'057191.com', u'0932.info', u'0311xinhua.cn', u'041pao.com', u'009997.com', u'018tk.com', u'099cf.com', u'0576qq.com', u'0101010.info', u'001lv.org', u'0771pc.com', u'0734kj.com', u'076665.cn', u'001123.com', u'08kkkk.com', u'0416hlw.com', u'0452e.com', u'001ni.com', u'001dd.com', u'012666.com', u'010tk.com', u'0579h.com', u'0851yifu.com', u'06049.com', u'044944.com', u'0752oa.net', u'003003.net', u'050gg.com', u'02325.org', u'0744y.com', u'0523114.com', u'0449aa.com', u'0597house.com', u'0411house.com', u'099333.com', u'001lv.com', u'050q.com', u'067.cc', u'02kkk.com', u'050ww.com', u'0437.gov.cn', u'073311.com', u'024www.net', u'073img.com', u'0592rc.cn', u'09gao.com', u'008zyz.com', u'022car.com.cn', u'00087.com', u'050aa.com', u'001d88.com', u'02728.com', u'07073.cn', u'001jm.com', u'003zyz.com', u'0773f.com', u'0311dd.com', u'023001.com', u'040206.com', u'03088.com', u'099456.com', u'0907.org', u'0572home.com', u'0731tg.cc', u'007tk.net', u'030401.com', u'01116.com', u'035pao.com', u'0551night.com', u'0123456.co', u'009cnc.com', u'06379.com.cn', u'002cn.cn', u'08118.com', u'01666.com', u'021lvzhou.com', u'0519.com', u'07141.com.cn', u'028tsh.com', u'0768000.com', u'0510wxly.com', u'0370home.com', u'0755lswjs.com']
def crawl_title(site):
try:
url = 'http://' + site
content = requests.get(url).content
except:
print site, 'get error'
return
print site, len(content)
def crawl():
    """Probe every domain in `sites` concurrently.

    A gevent pool caps the number of in-flight requests at 10; map()
    blocks until every crawl_title() greenlet has finished.
    """
    Pool(10).map(crawl_title, sites)
# Script entry point: run the crawl only when executed directly.
if __name__ == '__main__':
    crawl()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment