Create a gist now

Instantly share code, notes, and snippets.

Embed
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'
def get_web_page(url):
time.sleep(0.5) # 每次爬取前暫停 0.5 秒以免被 PTT 網站判定為大量惡意爬取
resp = requests.get(
url=url,
cookies={'over18': '1'}
)
if resp.status_code != 200:
print('Invalid url:', resp.url)
return None
else:
return resp.text
def get_articles(dom, date):
soup = BeautifulSoup(dom, 'html.parser')
# 取得上一頁的連結
paging_div = soup.find('div', 'btn-group btn-group-paging')
prev_url = paging_div.find_all('a')[1]['href']
articles = [] # 儲存取得的文章資料
divs = soup.find_all('div', 'r-ent')
for d in divs:
if d.find('div', 'date').string.strip() == date: # 發文日期正確
# 取得推文數
push_count = 0
if d.find('div', 'nrec').string:
try:
push_count = int(d.find('div', 'nrec').string) # 轉換字串為數字
except ValueError: # 若轉換失敗,不做任何事,push_count 保持為 0
pass
# 取得文章連結及標題
if d.find('a'): # 有超連結,表示文章存在,未被刪除
href = d.find('a')['href']
title = d.find('a').string
articles.append({
'title': title,
'href': href,
'push_count': push_count
})
return articles, prev_url
def parse(dom):
soup = BeautifulSoup(dom, 'html.parser')
links = soup.find(id='main-content').find_all('a')
img_urls = []
for link in links:
if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
img_urls.append(link['href'])
return img_urls
def save(img_urls, title):
if img_urls:
try:
dname = title.strip() # 用 strip() 去除字串前後的空白
os.makedirs(dname)
for img_url in img_urls:
if img_url.split('//')[1].startswith('m.'):
img_url = img_url.replace('//m.', '//i.')
if not img_url.split('//')[1].startswith('i.'):
img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
if not img_url.endswith('.jpg'):
img_url += '.jpg'
fname = img_url.split('/')[-1]
urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
except Exception as e:
print(e)
if __name__ == '__main__':
current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
if current_page:
articles = [] # 全部的今日文章
date = time.strftime("%m/%d").lstrip('0') # 今天日期, 去掉開頭的 '0' 以符合 PTT 網站格式
current_articles, prev_url = get_articles(current_page, date) # 目前頁面的今日文章
while current_articles: # 若目前頁面有今日文章則加入 articles,並回到上一頁繼續尋找是否有今日文章
articles += current_articles
current_page = get_web_page(PTT_URL + prev_url)
current_articles, prev_url = get_articles(current_page, date)
# 已取得文章列表,開始進入各文章讀圖
for article in articles:
print('Processing', article)
page = get_web_page(PTT_URL + article['href'])
if page:
img_urls = parse(page)
save(img_urls, article['title'])
article['num_image'] = len(img_urls)
# 儲存文章資訊
with open('data.json', 'w', encoding='utf-8') as f:
json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment