Created
October 20, 2020 10:19
-
-
Save River2056/5b82b2d5cb8948c52781421fb96a03e6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from io import StringIO | |
import re | |
import time | |
# Basic settings
base_url = 'http://ptt.cc'
sub_base_url = '/bbs/Gossiping/index.html'  # PTT Gossiping board index page
full_url = base_url + sub_base_url
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
cookies = {'over18': '1'}  # the Gossiping board is restricted to adult readers

# Holds every collected post entry (the 'r-ent' <div> tags)
titles_over_10_comments = []


def _push_indicator(tag):
    """Return the highlighted push-count <span> inside a post entry, or None.

    Only spans whose class attribute is exactly 'hl f3' or 'hl f1' are
    considered; entries without an 'nrec' div or without such a span
    yield None instead of raising AttributeError.
    """
    nrec = tag.find('div', {'class': 'nrec'})
    if nrec is None:
        return None
    return nrec.find('span', {'class': ['hl f3', 'hl f1']})


def _has_at_least_10_pushes(indicator):
    """Return True when the indicator text is '爆' or a number >= 10."""
    if indicator is None:
        return False
    text = indicator.get_text().strip()
    if text == '爆':
        return True
    try:
        return int(text) >= 10
    except ValueError:
        # Non-numeric marker that is not '爆': not a countable push total.
        return False


# Keep fetching pages until 20 qualifying posts have been collected
while len(titles_over_10_comments) < 20:
    # Send the request and fetch the page; the timeout keeps a stalled
    # connection from hanging the script forever.
    res = requests.get(url=full_url, headers=headers, cookies=cookies, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(StringIO(res.text), 'html.parser')

    # Look for the post entries on this page
    for tag in soup.find_all('div', {'class': 'r-ent'}):
        if _has_at_least_10_pushes(_push_indicator(tag)):
            titles_over_10_comments.append(tag)

        # Stop scanning once 20 posts have been collected
        if len(titles_over_10_comments) >= 20:
            break

    print(f'Done fetching... {len(titles_over_10_comments)}')
    if len(titles_over_10_comments) >= 20:
        # Nothing more to fetch: skip the page lookup and the polite delay.
        break

    # After scanning the whole page, move on to the next page to scan.
    # NOTE(review): the second 'btn wide' anchor is presumably the
    # "previous (older) page" link -- confirm against the page markup.
    page_btns = soup.find_all('a', {'class': 'btn wide'})
    if len(page_btns) < 2:
        # No further page link available; stop instead of raising IndexError.
        break
    gossiping_index = page_btns[1]['href']
    full_url = base_url + gossiping_index
    time.sleep(1)

# Print every collected post
link_pattern = re.compile(r"/bbs/Gossiping/.*")  # compiled once, outside the loop
for tag in titles_over_10_comments:
    indicator = _push_indicator(tag)
    push_amount = indicator.get_text()
    a_link = tag.find_all(href=link_pattern)
    if not a_link:
        # Entry has no article link, so there is nothing to print for it.
        continue
    print(f'{push_amount} {a_link[0].get_text()} \n link: {base_url}{a_link[0]["href"]} \n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment