Skip to content

Instantly share code, notes, and snippets.

@JichunMa
Created February 4, 2018 09:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JichunMa/8ce64bbee7141ee641d6bc58b86312f7 to your computer and use it in GitHub Desktop.
Save JichunMa/8ce64bbee7141ee641d6bc58b86312f7 to your computer and use it in GitHub Desktop.
望大佬帮忙 review 一下
#coding=utf-8
import requests
import re
from bs4 import BeautifulSoup
import os
import threading
import Queue
import time
import urllib2
import string
# Crawl settings.
NUM = 10  # number of worker threads
# Last listing page (the site has 900+ pages; set to 2 while testing).
# NOTE: the listing loop below iterates range(1, JOBS), so page JOBS itself
# is not fetched.
JOBS = 2

# Work queue of detail-page URLs: filled by the listing loop, drained by the
# worker threads.
q = Queue.Queue()

# Output file shared by all worker threads; closed at the end of the script.
f = open('d:\\nlx.txt', 'w')
f.write(time.strftime('%H:%M:%S'))  # stamp the start time of the run
def do_somthing(arg):
    """Placeholder hook for a queued item; intentionally does nothing.

    The only call site (inside ``working``) is commented out, so this is
    currently unused.

    :param arg: the queued item (ignored).
    :returns: None.
    """
    return None
# Serialises writes to the shared output file ``f`` so that records produced
# by different worker threads do not interleave.
lock = threading.Lock()


def working():
    """Worker loop: fetch each queued detail page and append it to ``f``.

    Runs forever, so it is meant to be the target of a daemon thread. For
    every URL taken from ``q`` it downloads the page, extracts the title and
    the text of every ``<div class="viewbox">``, and writes one record
    (timestamp, title line, content) to ``f`` under ``lock``.

    A failure on one URL is reported on stderr and the worker moves on;
    ``q.task_done()`` is always called so that ``q.join()`` cannot hang on a
    failed item.
    """
    import sys  # local import: only needed for error reporting

    while True:
        arg = q.get(block=True, timeout=None)
        try:
            # A timeout keeps one stalled connection from blocking the
            # worker (and therefore q.join()) forever.
            res = requests.get(arg, timeout=30)
            try:
                soup = BeautifulSoup(res.text, 'html.parser')
            finally:
                res.close()

            # The first 10 characters of the title are dropped, presumably a
            # fixed site-name prefix -- TODO confirm against the live pages.
            title_tag = soup.find('title')
            title = u''
            if title_tag is not None and title_tag.string:
                title = title_tag.string[10:]

            record = [
                time.strftime('%H:%M:%S'),
                '_' * 5 + title.encode('utf-8') + '_' * 5 + '\r\n',
            ]
            for box in soup.findAll('div', class_='viewbox'):
                # Encode explicitly: writing unicode to a Python 2 byte file
                # falls back to ASCII and fails on non-ASCII page content.
                record.append(box.text.replace('\n', '').encode('utf-8'))

            # Write the whole record in one locked call so that output from
            # concurrent workers stays contiguous.
            with lock:
                f.write(''.join(record))
        except Exception as exc:
            # Thread boundary: report and keep the worker alive.
            sys.stderr.write('failed to process %s: %r\n' % (arg, exc))
        finally:
            q.task_done()
# Base URL that the relative detail-page links from the listing are appended to.
root_url = 'http://www.nlx.gov.cn/inter/'

# Browser-like request headers. Not referenced by any request in the visible
# part of this script.
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.nlx.gov.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36',
}
#re_=re.compile('onclick="SwitchTheme.*?<a href="view\.php\?tid=(.*?)" target="_blank">',re.S)
# Start NUM worker threads running ``working``. They are daemon threads, so
# they do not keep the process alive once the main thread has finished.
for _ in range(NUM):
    t = threading.Thread(target=working)
    t.daemon = True
    t.start()
# Walk the listing pages, queue every detail-page link for the workers, then
# wait for the queue to drain. The output file is closed even if the crawl
# fails part-way.
try:
    # NOTE(review): range(1, JOBS) stops at page JOBS - 1; if JOBS is meant
    # to be the last page to fetch, this should be range(1, JOBS + 1).
    for i in range(1, JOBS):
        host_url = 'http://www.nlx.gov.cn/inter/?tid=&pages=%s' % i
        req = urllib2.Request(host_url)
        # A timeout keeps a stalled listing request from hanging the script.
        res = urllib2.urlopen(req, timeout=30)
        try:
            text = res.read()
        finally:
            res.close()

        soup = BeautifulSoup(text, 'html.parser')
        a = soup.find('dl', class_='lists')
        if a is None:
            # Listing container not found on this page: nothing to queue.
            continue
        for aa in a.find_all('a'):
            href = aa.get('href')
            if href:
                q.put(root_url + href, True, None)  # queue the detail-page URL

    # Block until every queued URL has been marked done by a worker.
    q.join()
finally:
    f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment