Created
May 28, 2020 18:29
-
-
Save linzino7/89022f6225985561323cb0d34caa63dc to your computer and use it in GitHub Desktop.
vvdfvdfv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Fri May 29 00:38:13 2020 | |
@author: ASUS | |
""" | |
# 導入 模組(module) | |
import requests | |
# 導入 BeautifulSoup 模組(module):解析HTML 語法工具 | |
import bs4 | |
# 文章連結 | |
URL = "https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html" | |
# 設定Header與Cookie | |
my_headers = {'cookie': 'over18=1;'} | |
# 發送get 請求 到 ptt 八卦版 | |
response = requests.get(URL, headers = my_headers) | |
# 把網頁程式碼(HTML) 丟入 bs4模組分析 | |
soup = bs4.BeautifulSoup(response.text,"html.parser") | |
## PTT 上方4個欄位 | |
header = soup.find_all('span','article-meta-value') | |
# 作者 | |
author = header[0].text | |
# 看版 | |
board = header[1].text | |
# 標題 | |
title = header[2].text | |
# 日期 | |
date = header[3].text | |
## 查找所有html 元素 抓出內容 | |
main_container = soup.find(id='main-container') | |
# 把所有文字都抓出來 | |
all_text = main_container.text | |
# 把整個內容切割透過 "-- " 切割成2個陣列 | |
pre_text = all_text.split('--')[0] | |
# 把每段文字 根據 '\n' 切開 | |
texts = pre_text.split('\n') | |
# 如果你爬多篇你會發現 | |
contents = texts[2:] | |
# 內容 | |
content = '\n'.join(contents) | |
# 顯示 | |
print('作者:'+author) | |
print('看板:'+board) | |
print('標題:'+title) | |
print('日期:'+date) | |
print('內容:'+content) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment