Skip to content

Instantly share code, notes, and snippets.

@olsososo
Created June 18, 2014 05:33
Show Gist options
  • Save olsososo/b8eb40e54bc573b5e5bc to your computer and use it in GitHub Desktop.
Save olsososo/b8eb40e54bc573b5e5bc to your computer and use it in GitHub Desktop.
过滤百度搜索结果
import requests
from lxml import etree
def main(keyword, pages):
f = open('./domain.txt', 'a')
domains = []
pn = 0
for page in xrange(pages):
for k in keyword:
url = "http://www.baidu.com/s?wd="+k+"&pn="+str(pn)+"&oq="+k+"&tn=monline_dg&ie=utf-8&usm=1"
print url
r = requests.get(url, timeout=15)
htmlElement = etree.HTML(r.content)
try:
for i in htmlElement.xpath('.//span[@class="g"]'):
href = etree.tostring(i).replace('<b>','').replace('</b>','').replace('<span class="g">','').replace('</span>','')
if '/' in href:
doamin = href.split('/')[0]
if doamin not in domains:
domains.append(doamin)
except Exception:
pass
pn = (page+1) * 10
for doamin in domains:
f.write('127.0.0.2 '+doamin+'\r\n')
f.close()
if __name__ == '__main__':
    # Search terms to query. The list is empty as shipped, so running the
    # script unmodified issues no searches; add terms here first.
    keyword = list()
    # Request 100 result pages for every search term.
    main(keyword=keyword, pages=100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment