Created
February 4, 2018 09:51
-
-
Save JichunMa/8ce64bbee7141ee641d6bc58b86312f7 to your computer and use it in GitHub Desktop.
望大佬帮忙 review 一下
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import requests | |
import re | |
from bs4 import BeautifulSoup | |
import os | |
import threading | |
import Queue | |
import time | |
import urllib2 | |
import string | |
f=open('d:\\nlx.txt','w') #output file (Windows path); shared by all worker threads
NUM=10 #number of worker threads
JOBS=2 #last listing page to crawl; site has 900+ pages, kept at 2 for testing
q=Queue.Queue() #work queue of article-detail URLs, consumed by working()
f.write(time.strftime('%H:%M:%S')) #record crawl start time at top of the output file
def do_somthing(arg):
    """Placeholder job hook; intentionally does nothing.

    Name kept as-is (typo of ``do_something``) because a commented-out
    call in ``working()`` still references it.
    """
    return None
#lock=threading.Lock() | |
_write_lock = threading.Lock()  # serializes writes to the shared output file f

def working():
    """Worker loop: pull an article URL from q, fetch it, parse it, append to f.

    Runs forever as a daemon thread.  Fixes over the original:
      * q.task_done() is now guaranteed (finally), so q.join() cannot deadlock
        when a fetch or parse fails;
      * output writes are protected by a lock so 10 threads don't interleave;
      * the request sends the browser-like ``header`` dict and has a timeout;
      * a page without a <title> no longer raises AttributeError;
      * article text is explicitly utf-8 encoded before writing (Py2 would
        otherwise attempt an implicit ASCII encode and crash on Chinese text).
    """
    while True:
        url = q.get(block=True, timeout=None)
        try:
            res = requests.get(url, headers=header, timeout=30)
            soup = BeautifulSoup(res.text, 'html.parser')  # parse page source
            title_tag = soup.find('title')
            # Original skipped the first 10 chars of the title (site prefix).
            title_text = (title_tag.string or u'')[10:] if title_tag is not None else u''
            with _write_lock:
                f.write(time.strftime('%H:%M:%S'))  # per-item fetch time
                f.write('_' * 5 + title_text.encode('utf-8') + '_' * 5 + '\r\n')
                for box in soup.findAll('div', class_='viewbox'):
                    f.write(box.text.replace('\n', '').encode('utf-8'))  # article body
            res.close()
        except Exception:
            # Best-effort scrape: skip pages that fail to download/parse and
            # keep the worker alive instead of silently killing the thread.
            pass
        finally:
            q.task_done()
# Base URL of the listing site; relative article hrefs are joined onto this.
root_url = 'http://www.nlx.gov.cn/inter/'

# Browser-like request headers (Chrome 62 on Windows 7 user agent).
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.nlx.gov.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
}
# Spawn the daemon worker pool; daemon threads die with the main script
# once q.join() below confirms every queued URL has been processed.
for i in range(NUM):
    t = threading.Thread(target=working)
    t.setDaemon(True)
    t.start()

# Walk every listing page 1..JOBS inclusive and enqueue each article link.
# Fixes: range(1, JOBS) skipped the final page even though JOBS is documented
# as the last page number; the request now also sends the ``header`` dict.
for page in range(1, JOBS + 1):
    host_url = 'http://www.nlx.gov.cn/inter/?tid=&pages=%s' % page
    req = urllib2.Request(host_url, headers=header)
    res = urllib2.urlopen(req)
    soup = BeautifulSoup(res.read(), 'html.parser')
    listing = soup.find('dl', class_='lists')
    for link in listing.find_all('a'):
        q.put(root_url + link['href'], True, None)  # enqueue detail-page URL
    res.close()

q.join()   # block until workers have drained the queue
f.close()  # flush the output file only after all writes are done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment