Skip to content

Instantly share code, notes, and snippets.

@River2056
Created October 20, 2020 10:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save River2056/5b82b2d5cb8948c52781421fb96a03e6 to your computer and use it in GitHub Desktop.
Save River2056/5b82b2d5cb8948c52781421fb96a03e6 to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
from io import StringIO
import re
import time
# Basic configuration
base_url = 'http://ptt.cc'
sub_base_url = '/bbs/Gossiping/index.html'  # PTT Gossiping board index page
full_url = base_url + sub_base_url
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
cookies = {'over18': '1'}  # the Gossiping board is for adult readers only
# Holds every collected post entry (the 'r-ent' <div> tags)
titles_over_10_comments = []

# Stop collecting once this many posts have been gathered.
_TARGET_POST_COUNT = 20
# Minimum numeric push count for a post to be collected.
_MIN_PUSH_COUNT = 10
# Seconds to wait for the server; without a timeout requests can block forever.
_REQUEST_TIMEOUT = 10
# Class attribute values of the push-count <span> that the script looks for.
_PUSH_SPAN_CLASSES = ['hl f3', 'hl f1']
# Compiled once instead of on every iteration of the output loop.
_POST_LINK_PATTERN = re.compile("/bbs/Gossiping/.*")


def _find_push_indicator(entry):
    """Return the push-count <span> inside *entry*, or None if it has none.

    Looks for a 'nrec' <div> and, inside it, a <span> whose class attribute
    is one of ``_PUSH_SPAN_CLASSES``. A missing 'nrec' <div> yields None
    instead of raising AttributeError.
    """
    counter = entry.find('div', {'class': 'nrec'})
    if counter is None:
        return None
    return counter.find('span', {'class': _PUSH_SPAN_CLASSES})


def _is_popular(indicator):
    """Return True if *indicator* shows '爆' or a number >= _MIN_PUSH_COUNT.

    *indicator* may be None (no highlighted push count), which is treated as
    not popular. Any other non-numeric text is also treated as not popular
    rather than raising ValueError.
    """
    if indicator is None:
        return False
    text = indicator.get_text().strip()
    # '爆' is accepted as-is; presumably PTT's marker for a push count too
    # large to display as a number -- TODO confirm.
    if text == '爆':
        return True
    try:
        return int(text) >= _MIN_PUSH_COUNT
    except ValueError:
        return False


def _next_page_url(page):
    """Return the absolute URL of the next page to scan, or None.

    Uses the second 'btn wide' anchor on the page, as the original script
    did. Returns None when that anchor is absent or has no href, so the
    caller can stop cleanly instead of failing with IndexError/KeyError.
    """
    page_btns = page.find_all('a', {'class': 'btn wide'})
    if len(page_btns) < 2:
        return None
    href = page_btns[1].get('href')
    if not href:
        return None
    return base_url + href


# Keep fetching pages until enough posts have been collected
while len(titles_over_10_comments) < _TARGET_POST_COUNT:
    # Send the request and fetch the page
    res = requests.get(
        url=full_url,
        headers=headers,
        cookies=cookies,
        timeout=_REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # Look through every post entry on this page
    for tag in soup.find_all('div', {'class': 'r-ent'}):
        if not _is_popular(_find_push_indicator(tag)):
            continue
        titles_over_10_comments.append(tag)
        # Enough posts collected: stop scanning this page
        if len(titles_over_10_comments) >= _TARGET_POST_COUNT:
            break

    # Done with this page; work out which page to visit next
    next_url = _next_page_url(soup)
    print(f'Done fetching... {len(titles_over_10_comments)}')
    if next_url is None:
        # No further page to follow; stop instead of crashing.
        break
    full_url = next_url
    time.sleep(1)  # pause between requests

# Print every collected post
for tag in titles_over_10_comments:
    indicator = _find_push_indicator(tag)
    push_amount = indicator.get_text()
    link = tag.find(href=_POST_LINK_PATTERN)
    if link is None:
        # Entry has no article link; skip it rather than raise IndexError.
        continue
    print(f'{push_amount} {link.get_text()} \n link: {base_url}{link["href"]} \n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment