Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
毕业生电子离校单,爬取,获得,显示结果。
方法1:
# 1. 复制根目录,不需要安装任何东西(看到这个就是已经属于复制好了)
# 2. 打开powershell,转到(cd)到src目录,
# 3. 在src里面的目录运行
..\envir\Scripts\python.exe -m http.server 8000
# 4. 访问浏览器 http://127.0.0.1:8000
# 静态文件的url是 名字.html
# 查找自己名字,选择即可
## 注意,直接用浏览器默认打开也是可以的,只是看不到通过或者未通过的图片,但是"是否准予离校"一栏仍会显示准予与否
方法2:
直接点击astart.sh运行。然后访问http://127.0.0.1:8000。然后ctrl+f搜名字就好了。看完把页面叉掉即可,注意保持cmd程序运行
# 原理:
##====1====== 微型静态服务(主要是绑定图片和css)
# python -m http.server --cgi 8000
# 命令行运行 查看网页 http://127.0.0.1:8000/
# 或者比如 npm i serve -g 然后在当前目录运行 serve,回到浏览器访问即可
# 任何开启服务的方式,都可以,如果直接用浏览器打开,看不到css样式,图片。所以需要建立一个微型服务器,静态文件来看
# 但是直接打开也是可以知道 "是否准予离校" 进行判断是否准予通过,或者一个一个点出来看
##====2======爬取的脚本是 read.py(重新抓取的话,需要按自己需求删除对应的 html 文件,因为存在该文件时会跳过该用户)
# python read.py 是爬取内容 在环境中src目录下运行是 ..\envir\Scripts\python.exe read.py
# 是pycharm创建的环境,不需要安装,直接复制文件夹粘贴即可
##=====3=======抓取原因
# 登录网址 http://202.115.158.19/xhu-egraduation-main/j_spring_security_check?logurl=/index!mgr.action&Verify=false&logintype=wz
# 选择学生 账户是学号 姓名是姓名 验证码随便填个数字,不为空就可以
##=====error1.txt 记录的是网络问题(服务器没有响应、出现错误的学生),这些是需要重新抓取的。
# -*- coding: UTF-8 -*-
import requests
import json
import csv
import time
import threading
import os
from bs4 import BeautifulSoup
import datetime
def reader_date(name):
    """Read the graduates roster CSV and return [student_id, student_name] pairs.

    Parameters
    ----------
    name : str
        Path to a UTF-8 encoded CSV file whose first row is a header and
        whose remaining rows start with (student_id, student_name, ...).

    Returns
    -------
    list[list[str]]
        One ``[id, name]`` pair per data row, in file order.
    """
    # newline='' is the documented correct mode for the csv module.
    with open(name, encoding='utf-8', mode='r', newline='') as csv_file:
        rows = csv.reader(csv_file)
        next(rows)  # skip the header row
        # Keep only the first two columns (id, name) of every row.
        return [[row[0], row[1]] for row in rows]
def get_user(val):
    """Log in as one student and save their electronic leave-school page.

    Parameters
    ----------
    val : sequence
        ``val[0]`` is the student id (used as username) and ``val[1]`` is
        the student name (used as password and as the output file name).

    Side effects
    ------------
    - Writes the fetched page to ``<name>.html``.
    - If any element with class ``fail`` is present on the page, appends a
      line to ``fail.txt`` and a link to ``fail.html``.
    - Sleeps 1 second on success to throttle requests.

    Raises
    ------
    requests.RequestException
        On network/timeout errors; callers catch and log these.
    """
    url = "http://202.115.158.19/xhu-egraduation-main/j_spring_security_check?logurl=/index!mgr.action&Verify=false&logintype=wz"
    headers = {
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    }
    # The site accepts any non-empty captcha, so a constant value works.
    pdata = {'j_username': val[0], 'j_password': val[1], "userkind": 1, 'imageCodeName': 2333}
    s = requests.session()  # session keeps the login cookie for the second request
    try:
        s.post(url, data=pdata, headers=headers, allow_redirects=True, timeout=3)
        r2 = s.get("http://202.115.158.19/xhu-egraduation-main/lxgl/lxglgrnew.action", timeout=3)
        # Write as UTF-8 bytes so the saved page does not depend on the locale.
        with open(str(val[1]) + ".html", "wb") as f:
            f.write(r2.text.encode('utf-8'))
        soup = BeautifulSoup(r2.text, 'lxml')
        fails = soup.select('.fail')
        if len(fails) > 0:
            with open("fail.txt", "a+", encoding="utf-8") as f:
                f.writelines("\n未通过:" + val[1] + " 未通过的个数" + str(len(fails)))
            # BUG FIX: original used encoding="uft-8", which raised
            # LookupError whenever a failing student was found.
            with open("fail.html", "a+", encoding="utf-8") as f2:
                f2.writelines(f'\n<a href="/{val[1]}.html"> {val[1]} {str(len(fails))}</a>')
        print(pdata)
    finally:
        # Always release the connection, even when a request raised.
        s.close()
    time.sleep(1)  # throttle between students
if __name__ == '__main__':
    # Read the roster, then fetch every student's page in turn.
    students = reader_date("./2019届毕业生.csv")
    for student in students:
        print(student)
        # Re-run support: skip students whose page was already saved.
        # Delete "<name>.html" to force a re-fetch (see notes at top of file).
        if os.path.exists(str(student[1]) + ".html"):
            print(student[0] + " " + student[1] + "已经存在================")
            continue
        try:
            get_user(student)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt;
            # narrow it so Ctrl+C still stops the run.  Failed students are
            # logged so the script can simply be re-run for them later.
            with open("error1.txt", "a+") as f:
                f.writelines("\n网络Error:" + student[0] + " " + student[1])
            time.sleep(1)
    # Mark the end of this run in the error log.
    with open("error1.txt", "a+") as f:
        f.writelines("\n===================结尾======================")
# -*- coding: UTF-8 -*-
import requests
import json
import csv
import time
import threading
import os
from bs4 import BeautifulSoup
import datetime
def reader_date(name):
    """Load the graduates CSV and return a list of ``[id, name]`` records.

    The first row of the file is treated as a header and skipped; from
    every following row only the first two columns are kept.
    """
    records = []
    with open(name, encoding='utf-8', mode='r') as fh:
        rows = csv.reader(fh)
        next(rows)  # header row
        records.extend([cells[0], cells[1]] for cells in rows)
    return records
if __name__ == '__main__':
    students = reader_date("./2019届毕业生.csv")
    # Build index.html: one link per saved student page, plus links to the
    # failure list and a timestamp.  Assembled with join instead of
    # repeated string concatenation, and written via a context manager.
    t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    parts = ['<HTML><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head><body>']
    for student in students:
        # student[1] is the name; each scraped page is saved as "<name>.html".
        parts.append('<a href="/' + student[1] + '.html">' + student[1] + '</a> ')
    parts.append('<br/><a href="/fail.txt">未通过的人:</a> ' + '更新时间:' + t)
    parts.append('<br/><a href="/fail.html">可怕</a> ')
    parts.append('</body></HTML>')
    with open("index.html", "w", encoding="utf-8") as f:
        f.write("".join(parts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment