Skip to content

Instantly share code, notes, and snippets.

@foriequal0
Last active December 18, 2017 07:16
Show Gist options
  • Save foriequal0/ae597fa1d8c262072c7174489e63e2c4 to your computer and use it in GitHub Desktop.
Save foriequal0/ae597fa1d8c262072c7174489e63e2c4 to your computer and use it in GitHub Desktop.
졸업프로젝트 크롤러
import requests
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()
# Base URL of the site; every request URL in this script is built as
# HOST + path. It is blank here and has to be filled in before running.
HOST = ""

# Form fields POSTed by login(). The credentials are left blank here.
LOGIN_PAYLOAD = dict(mode="login", userid="", passwd="")
def login(s):
    """Log the session in by POSTing LOGIN_PAYLOAD to the login endpoint.

    Args:
        s: A ``requests.Session`` (or compatible object exposing ``post``).

    Returns:
        The response to the login request, so a caller can inspect it.
        Callers that ignore the return value keep working as before.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    login_url = HOST + "/member/login"
    response = s.post(login_url, data=LOGIN_PAYLOAD)
    # Surface HTTP-level failures here instead of letting them show up later
    # as confusing parse errors. NOTE: a successful status alone does not
    # prove the credentials were accepted.
    response.raise_for_status()
    return response
def get_html(s, url):
    """Fetch *url* through session *s* and return the body text as UTF-8.

    Args:
        s: Session-like object exposing ``get(url)``.
        url: Address to request.

    Returns:
        The response text when the status code is exactly 200,
        otherwise ``None``.
    """
    response = s.get(url)
    if response.status_code != 200:
        return None
    # Force the charset before reading .text so the body decodes as UTF-8.
    response.encoding = 'utf8'
    return response.text
def get_list(s, year, term):
    """Yield one summary dict per project row on the listing page.

    Args:
        s: Session passed through to ``get_html``.
        year: Value substituted for ``year`` in the listing URL.
        term: Value substituted for ``term`` in the listing URL.

    Yields:
        Dicts with the keys ``year``, ``half``, ``id``, ``prof``, ``max``
        and ``title``, read from cells 1-6 of each table row. ``year``,
        ``half`` and ``max`` are converted to ``int``; the rest stay as the
        raw cell text.
    """
    listing_url = (
        HOST + "/intranet/lecture/project?year={year}&term={term}"
    ).format(year=year, term=term)
    page = BeautifulSoup(get_html(s, listing_url), 'html.parser')
    listing = page.select("table.tbs-01.acenter")[0]
    for tr in listing.find_all("tr"):
        cells = tr.find_all("td")
        if not cells:
            # Rows without any <td> (presumably header rows) hold no data.
            continue
        # The first cell (index 0) is not used.
        yield dict(
            year=int(cells[1].text),
            half=int(cells[2].text),
            id=cells[3].text,
            prof=cells[4].text,
            max=int(cells[5].text),
            title=cells[6].text,
        )
def _count_entries(rows):
    """Return how many real entries a list of table rows represents.

    A table whose only row contains the notice "등록된 자료가 없습니다."
    ("There is no registered data.") is treated as empty and counts as 0.
    """
    if len(rows) == 1 and "등록된 자료가 없습니다." in rows[0].text:
        return 0
    return len(rows)


def get_detail(s, row):
    """Fetch a project's detail page and return *row* extended with its data.

    Args:
        s: Session passed through to ``get_html``.
        row: Summary dict as produced by ``get_list``; ``row["id"]`` selects
            the detail page.

    Returns:
        A new dict holding every key of *row* plus ``scope``, ``require``,
        ``body`` (cell text) and ``applied``, ``done`` (row counts of the
        first and second ``table.acenter`` tables; presumably the applicant
        and completed lists -- confirm against the page).

    Raises:
        IndexError: If an expected table or row is missing from the page.
    """
    detail_url = HOST + "/intranet/lecture/project?mode=view&pjtid={id}"
    html = get_html(s, detail_url.format(id=row["id"]))
    soup = BeautifulSoup(html, 'html.parser')

    # Run each selector once and reuse the result.
    centered_tables = soup.select("table.acenter")
    applied_rows = centered_tables[0].find("tbody").find_all("tr")
    done_rows = centered_tables[1].find("tbody").find_all("tr")

    # Rows of the first <table> that has a direct <tbody> child.
    info_rows = soup.select("table > tbody")[0].find_all("tr")

    return {
        **row,
        "scope": info_rows[2].find("td").text,
        "require": info_rows[4].find("td").text,
        "body": soup.select("table.form_table3 > tbody > tr > td")[0].text,
        "applied": _count_entries(applied_rows),
        "done": _count_entries(done_rows),
    }
# Collects one detail dict per project across every crawled term.
projects = []

with requests.Session() as s:
    # TLS certificate verification is switched off for this session.
    s.verify = False
    login(s)
    # (year, half) pairs to crawl, newest first.
    for (year, half) in [
        (2018, 1),
        (2017, 2), (2017, 1),
        (2016, 2), (2016, 1),
        (2015, 2), (2015, 1),
        (2014, 2),
    ]:
        print(year, half)
        for row in get_list(s, year, half):
            # One extra request per listed project to get its detail page.
            projects.append(get_detail(s, row))

print(len(projects))
# Export the collected projects to a SQLite file, replacing any earlier run.
import sqlite3

conn = sqlite3.connect('grad_projects.sqlite')
try:
    c = conn.cursor()
    # Start from a clean table so repeated runs do not accumulate duplicates.
    c.execute('''DROP TABLE IF EXISTS projects''')
    c.execute('''
CREATE TABLE projects
(id TEXT,
year INTEGER, half INTEGER,
prof TEXT, scope TEXT, title TEXT, required TEXT,
max INTEGER, applied INTEGER, done INTEGER,
body TEXT)
''')
    # Tuple order must match the column order of CREATE TABLE above; the
    # dict key 'require' feeds the column named 'required'.
    c.executemany(
        'INSERT INTO projects VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        [(p['id'], p['year'], p['half'],
          p['prof'], p['scope'], p['title'], p['require'],
          p['max'], p['applied'], p['done'],
          p['body']) for p in projects])
    conn.commit()
finally:
    # Always release the database file, even if a statement above failed.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment