Created
May 24, 2020 10:11
-
-
Save linzino7/df0d508b2c67df1488096cff2ea6131f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 04:38:47 2020
@author: Zino
"""
# Standard library
import time

# Third-party: bs4 (BeautifulSoup) parses HTML, requests sends HTTP requests
import bs4
import requests
def get_one_page(URL, timeout=10):
    """Download one board index page and print the title of every post on it.

    A detailed walkthrough of this code is available at:
    https://medium.com/p/a8216873a9d3

    Args:
        URL: Address of the index page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may wait indefinitely on a stalled
            connection and hang the whole crawl.

    Raises:
        requests.exceptions.Timeout: The server did not respond in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so there is no valid index page to parse.
    """
    # Send the over18 cookie with the request.
    # NOTE(review): presumably this answers the board's age-confirmation
    # prompt so the real index page is returned -- confirm against PTT.
    my_headers = {'cookie': 'over18=1;'}

    # Issue the GET request; fail loudly instead of parsing an error page.
    response = requests.get(URL, headers=my_headers, timeout=timeout)
    response.raise_for_status()

    # Parse the returned HTML with the standard-library parser backend.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # Every <div class="title"> holds one post title; print its text with
    # the surrounding whitespace removed.
    for title_tag in soup.find_all('div', 'title'):
        print(title_tag.text.strip())
# Index number of the first page to crawl (be sure to adjust this yourself).
start = 38880
# How many pages to crawl, counting backwards from the start page.
number = 5
# range() excludes its stop value, so this is one below the last page fetched.
end = start - number

for page_index in range(start, end, -1):
    # Build the address of this index page.
    page_url = f"https://www.ptt.cc/bbs/Gossiping/index{page_index}.html"
    # Crawl this single page.
    get_one_page(page_url)
    # Pause between requests to avoid being blocked by PTT for going too fast.
    time.sleep(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment