Last active
May 7, 2020 12:51
-
-
Save eugene87222/8386433cc4596f18fe4e82a59267ea53 to your computer and use it in GitHub Desktop.
20200507 ccca 爬蟲社課 Python demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"title:\n", | |
"\n", | |
"公告| [緊急公告]校園骨幹網路設備異常公告 2020/04/22 \n", | |
"content\n", | |
"[緊急公告]校園骨幹網路設備異常公告\n", | |
"影響時間:109年 4月22日(三) 下午17:00-17:30\n", | |
"影響範圍:全校網路服務\n", | |
"內 容:校園骨幹網路設備故障異常,進行緊急修復處理,施作期間將會影響全校網路服務,網路服務將出現瞬斷或短暫中斷等狀況,造成不便,敬請見諒\n", | |
"\n", | |
"\n", | |
"承辦人陳俐君\n", | |
"\n", | |
"電話 03-5712121#31268\n", | |
"Email lichun80@nctu.edu.tw\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"publish date\n", | |
"2020/04/22\n" | |
] | |
} | |
], | |
"source": [ | |
"# 校計中公告\n", | |
"import re\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"\n", | |
"url = 'https://www.it.nctu.edu.tw/?page_id=18'\n", | |
"headers = {\n", | |
" 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'\n", | |
"}\n", | |
"\n", | |
"res = requests.get(url, headers=headers)\n", | |
"soup = BeautifulSoup(res.text, 'lxml')\n", | |
"\n", | |
"table = soup.select_one('#posts-table-1')\n", | |
"# table = soup.find('table', {'id': 'posts-table-1'})\n", | |
"rows = table.select('tbody > tr')\n", | |
"# rows = table.find('tbody').find_all('tr')\n", | |
"a = rows[0].select('a')[1]\n", | |
"url = a['href']\n", | |
"\n", | |
"res = requests.get(url, headers=headers)\n", | |
"soup = BeautifulSoup(res.text)\n", | |
"title = soup.select_one('h2.entry-title')\n", | |
"print('title:')\n", | |
"print(title.text)\n", | |
"content = soup.select_one('.entry-content')\n", | |
"trash = content.select_one('.mh-social-bottom')\n", | |
"trash.decompose()\n", | |
"print('content')\n", | |
"print(content.text)\n", | |
"pub_datetime = re.findall(r'\\d+/\\d+/\\d+', title.text)[0]\n", | |
"print('publish date')\n", | |
"print(pub_datetime)\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.7.5 64-bit", | |
"language": "python", | |
"name": "python37564bit9c3aa344ed5e49b1b7f5168b1a98152a" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment