Last active
June 12, 2020 09:41
-
-
Save ting11222001/03245671f890ef039c50024189c159b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stage 1: scrape the https://technews.tw/ front page and save the article
# index to technews.json. (Scraped content reflects the page at crawl time.)
import requests
from bs4 import BeautifulSoup as bs
import json

res = requests.get('https://technews.tw/', timeout=30)
soup = bs(res.text, 'lxml')
result = []
# Each <li class="block2014"> is one article block on the front page.
blocks = soup.find_all('li', 'block2014')
for block in blocks:
    try:
        dic = {}
        dic['category'] = block.find('div', 'cat01').text
        dic['sum_title'] = block.find('div', 'sum_title').text.strip()
        # Links on the page are protocol-relative ("//technews.tw/..."),
        # so prepend the scheme.
        sum_title_url = block.find('div', 'img').find('a').get('href')
        dic['sum_title_url'] = 'https:' + sum_title_url
        # Collect the related "spotlist" links nested inside this block.
        spotlist = []
        for li in block.find_all('li', 'spotlist'):
            dic_spotlist = {}
            dic_spotlist['title'] = li.text.strip()
            dic_spotlist['url'] = 'https:' + li.find('a').get('href')
            spotlist.append(dic_spotlist)
        dic['spotlist'] = spotlist
        result.append(dic)
    except Exception as e:
        # A block missing an expected element is logged and skipped, not fatal.
        # (Fixed: the original also did `print(i)` with `i` undefined, which
        # would raise NameError inside this handler.)
        print(e)
        continue

file_name = 'technews.json'
with open(file_name, 'w', encoding='utf8') as file:
    json.dump(result, file, ensure_ascii=False)
# Stage 2: re-read technews.json, download every summary article and every
# spotlist article, and append their body paragraphs to per-article .txt files.
file_name = 'technews.json'
with open(file_name, 'r', encoding='utf8') as file:
    articles = json.load(file)

for article in articles:
    try:
        res = requests.get(article['sum_title_url'], timeout=30)
        soup = bs(res.text, 'lxml')
        # Article body paragraphs live under <div class="indent"> on technews.tw.
        file_name = 'sum_' + article['category'] + '_' + article['sum_title'][0:4] + '.txt'
        # encoding='utf8' so Chinese text survives on platforms whose default
        # encoding is not UTF-8 (e.g. Windows); open once instead of reopening
        # the same file for every paragraph.
        with open(file_name, 'a', encoding='utf8') as file:
            for p_text in soup.select('div.indent > p'):
                file.write(p_text.text.strip())
        for spot in article['spotlist']:
            res2 = requests.get(spot['url'], timeout=30)
            soup2 = bs(res2.text, 'lxml')
            file_name2 = 'spot_' + article['category'] + '_' + spot['title'][0:4] + '.txt'
            with open(file_name2, 'a', encoding='utf8') as file:
                for p_text in soup2.select('div.indent > p'):
                    file.write(p_text.text.strip())
    except Exception as e:
        # Skip articles that fail to download or parse instead of aborting the
        # whole run. (Fixed: the original also did `print(i)` with `i`
        # undefined, which would raise NameError inside this handler.)
        print(e)
        continue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment