Skip to content

Instantly share code, notes, and snippets.

@ting11222001
Last active June 12, 2020 09:41
Show Gist options
  • Save ting11222001/03245671f890ef039c50024189c159b6 to your computer and use it in GitHub Desktop.
Save ting11222001/03245671f890ef039c50024189c159b6 to your computer and use it in GitHub Desktop.
#https://technews.tw/
# Scraped content depends on what the site shows on the day the script runs
# Stage 1
import requests
from bs4 import BeautifulSoup as bs
import json
# Stage 1: fetch the technews.tw front page and save an index of its article
# blocks (category, headline, headline URL, related "spotlist" links) as JSON.
res = requests.get('https://technews.tw/')
soup = bs(res.text, 'lxml')
result = []
# Pick out every article block (<li class="block2014">).
blocks = soup.find_all('li', 'block2014')
for index, block in enumerate(blocks):
    try:
        # find() returns None when an element is missing, so a malformed block
        # raises here and is skipped by the handler below.
        dic = {}
        dic['category'] = block.find('div', 'cat01').text
        dic['sum_title'] = block.find('div', 'sum_title').text.strip()
        sum_title_url = block.find('div', 'img').find('a').get('href')
        # NOTE(review): presumably the hrefs are protocol-relative
        # ("//technews.tw/..."), hence the scheme prefix -- confirm on the page.
        dic['sum_title_url'] = 'https:' + sum_title_url
        spotlist = []
        for li in block.find_all('li', 'spotlist'):
            spotlist.append({
                'title': li.text.strip(),
                'url': 'https:' + li.find('a').get('href'),
            })
        dic['spotlist'] = spotlist
        result.append(dic)
    except Exception as e:
        # Best effort: report the failure and the position of the offending
        # block, then move on to the next one. (The original printed an
        # undefined name `i` here, which crashed the handler itself.)
        print(e)
        print(index)

file_name = 'technews.json'
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open(file_name, 'w', encoding='utf8') as file:
    json.dump(result, file, ensure_ascii=False)
# Stage 2: reload the index from technews.json and, for every headline article
# and every related (spotlist) article, append the text of its
# "div.indent > p" paragraphs to a per-article .txt file.
file_name = 'technews.json'
with open(file_name, 'r', encoding='utf8') as file:
    articles = json.load(file)

# tag.text.strip() extracts a tag's text without surrounding whitespace.
for index, article in enumerate(articles):
    try:
        res = requests.get(article['sum_title_url'])
        soup = bs(res.text, 'lxml')
        # File name: "sum_<category>_<first 4 chars of the headline>.txt".
        file_name = 'sum_' + article['category'] + '_' + article['sum_title'][0:4] + '.txt'
        paragraphs = [p_text.text.strip() for p_text in soup.select('div.indent > p')]
        # Only touch the file when there is something to write, and open it
        # once instead of once per paragraph. Append mode is kept on purpose.
        # utf8 matches the JSON index and avoids platform-default codec errors.
        if paragraphs:
            with open(file_name, 'a', encoding='utf8') as file:
                file.write(''.join(paragraphs))

        for spot in article['spotlist']:
            res2 = requests.get(spot['url'])
            soup2 = bs(res2.text, 'lxml')
            # File name: "spot_<category>_<first 4 chars of the spot title>.txt".
            file_name2 = 'spot_' + article['category'] + '_' + spot['title'][0:4] + '.txt'
            paragraphs2 = [p_text.text.strip() for p_text in soup2.select('div.indent > p')]
            if paragraphs2:
                with open(file_name2, 'a', encoding='utf8') as file:
                    file.write(''.join(paragraphs2))
    except Exception as e:
        # Best effort: report the failure and the position of the article in
        # the index, then continue with the next article. (The original
        # printed an undefined name `i` here, which crashed the handler.)
        print(e)
        print(index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment