Skip to content

Instantly share code, notes, and snippets.

@ting11222001
Last active June 11, 2020 12:17
Show Gist options
  • Save ting11222001/8b8c704a8defdd427a7b22b86c00e0ac to your computer and use it in GitHub Desktop.
Save ting11222001/8b8c704a8defdd427a7b22b86c00e0ac to your computer and use it in GitHub Desktop.
medium_drafts
import requests
from bs4 import BeautifulSoup as bs
# Fetch the index page of the PTT NBA board.
# The timeout keeps the script from waiting forever if the server never answers.
res = requests.get('https://www.ptt.cc/bbs/NBA/index.html', timeout=10)
# Stop on an HTTP error status (4xx/5xx) instead of parsing an error page.
res.raise_for_status()
# Parse the HTML tags directly with lxml's HTML parser.
soup = bs(res.text, 'lxml')
# Tip: uncomment the next line to inspect the raw HTML, then comment it out
# again once you have looked at it, to save memory.
# res.text
# Warm-up exercise for the basic concepts.
# Grab everything on this page that matches the condition, i.e. every
# <div class="title"> element. find_all returns a list (a bs4 ResultSet).
raw_titles = soup.find_all('div', 'title')
# The bare expressions below only echo a value in an interactive session
# (REPL / notebook); when run as a script they are evaluated and discarded.
# Guard the [0] lookups so an empty result does not raise IndexError.
if raw_titles:
    first_title = raw_titles[0]
    # The first matching tag.
    first_title
    # Each item is a tag, as shown under the Elements tab of the browser's
    # developer tools.
    type(first_title)
    # The text inside the tag.
    first_title.text
    # Look for the <a> tag inside this tag. find() returns None when there is
    # no <a> inside (presumably the case for a deleted post), so check before
    # reading the href to avoid an AttributeError/TypeError.
    first_link = first_title.find('a')
    if first_link is not None:
        # Read the link target with get('href') ...
        first_link.get('href')
        # ... or, equivalently, with subscript access.
        first_link['href']
# Print the tag of every article with a for loop.
for title_div in raw_titles:
    print(title_div)
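# First approach: each i is a <div> tag; the div contains an <a>, and the <a> carries the href.
# Tip: print(i) first on every iteration and separate the output with print('===').
# NOTE: i.find('a') returns None when a div has no <a>, so .get('href') would raise AttributeError there.
# results = []
# for i in raw_titles:
#     result = {}
#     result['topic'] = i.text.strip()  # strips leading and trailing whitespace
#     result['url'] = i.find('a').get('href')
#     results.append(result)
# print(results)
# And that's it!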
# You can also practice locating elements with CSS selectors; select() returns
# a list as well. This full path targets the <a> of one specific entry.
soup.select('#main-container > div.r-list-container.action-bar-margin.bbs-screen > div:nth-child(2) > div.title > a')
# Replacing the positional div:nth-child(2) with a plain div matches the <a>
# of every article.
soup.select('#main-container > div.r-list-container.action-bar-margin.bbs-screen > div > div.title > a')
# The whole path can also be shortened to 'div.title > a', which still matches
# the <a> of every article.
# Tip: read a long selector from its tail end, look for parts that carry
# numbers and the like, and try trimming the path down.
# Selecting the <a> tags directly also avoids errors when some articles have
# been deleted.
soup.select('div.title > a')
# Pull the hyperlink out of each <a> tag.
for link in soup.select('div.title > a'):
    print(link.get('href'))
# Second approach: select the <a> tags themselves, so every element is a link.
# 'div.title' can be shortened to '.title'.
raw_titles = soup.select('.title > a')
# One record per article: the link text as the topic, and the href prefixed
# with the PTT site root so the stored URL is absolute.
results = [
    {'topic': link.text, 'url': 'https://www.ptt.cc' + link.get('href')}
    for link in raw_titles
]
print(results)
# Export the records to a JSON file.
import json

# The file is opened in append mode ('a'), so each run adds lines to whatever
# nba.json already holds. Every record is written as one JSON object per line;
# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaping it.
with open('nba.json', 'a', encoding='utf8') as file:
    file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in results
    )
@ting11222001
Copy link
Author

Added on 20200611

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment