Last active
June 11, 2020 12:17
-
-
Save ting11222001/8b8c704a8defdd427a7b22b86c00e0ac to your computer and use it in GitHub Desktop.
medium_drafts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests  # HTTP client used to fetch the board page
from bs4 import BeautifulSoup as bs

# URL of the PTT NBA board index page
res = requests.get('https://www.ptt.cc/bbs/NBA/index.html')

# Parse the raw HTML with lxml's HTML parser
soup = bs(res.text,'lxml')

# Tip: comment this out after inspecting it once to save memory
# res.text

# Warm-up exercise:
# grab every element on this page matching <div class="title">.
# find_all returns a list of Tag objects.
raw_titles = soup.find_all('div','title')

# Inspect the first matched tag
raw_titles[0]

# Each item is a bs4 Tag — the same element you see under the
# Elements panel in the browser developer tools
type(raw_titles[0])

# Extract the text contained in the tag
raw_titles[0].text

# Find the nested <a> tag inside it, then read its 'href' attribute
raw_titles[0].find('a').get('href')

# Equivalent subscript form:
raw_titles[0].find('a')['href']

# Print every article tag with a for loop
for i in raw_titles:
    print(i)
# First approach: each i is a <div class="title">; the <div> contains an
# <a>, and the <a> carries the href. Tip: while developing, print(i) for
# each item and separate them with print('===').
# results = []
# for i in raw_titles:
#     result = {}
#     result['topic'] = i.text.strip()  # strip leading/trailing whitespace
#     result['url'] = i.find('a').get('href')
#     results.append(result)
# print(results)
# Done!

# CSS-selector positioning works too; select() also returns a list.
soup.select('#main-container > div.r-list-container.action-bar-margin.bbs-screen > div:nth-child(2) > div.title > a')

# Changing div:nth-child(2) to a plain div matches every article's <a>
soup.select('#main-container > div.r-list-container.action-bar-margin.bbs-screen > div > div.title > a')

# The whole path can also be shortened to just div.title > a — same result.
# Tip: read a long selector path from the end, look for parts carrying
# indexes (e.g. :nth-child(n)), and try trimming the path down. The short
# form also avoids errors when some articles have been deleted.
soup.select('div.title > a')

# Pull the hyperlink out of each <a> tag
for a in soup.select('div.title > a'):
    print(a.get('href'))
# Second approach: each i is an <a> tag directly.
# div.title can be shortened to .title in the selector.
raw_titles = soup.select('.title > a')
results = []
for i in raw_titles:
    result = {}
    result['topic'] = i.text
    # Prepend the PTT base URL to the scraped relative link
    result['url'] = 'https://www.ptt.cc'+i.get('href')
    results.append(result)
print(results)
# Export the results as JSON Lines (one JSON object per line)
import json

# NOTE(review): mode 'a' appends, so re-running the script duplicates
# entries in nba.json — confirm whether 'w' (overwrite) was intended.
with open('nba.json','a',encoding='utf8') as file:
    for i in results:
        # ensure_ascii=False keeps the Chinese titles human-readable
        file.write(json.dumps(i, ensure_ascii=False))
        file.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
Added on 20200611