Last active
June 12, 2020 09:41
-
-
Save ting11222001/03245671f890ef039c50024189c159b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stage 1: scrape the https://technews.tw/ front page and save the article
# index to technews.json. (Scraped content reflects the page at crawl time.)
import requests
from bs4 import BeautifulSoup as bs
import json

res = requests.get('https://technews.tw/', timeout=30)
soup = bs(res.text, 'lxml')
result = []
# Each <li class="block2014"> is one article block on the front page.
blocks = soup.find_all('li', 'block2014')
for block in blocks:
    try:
        dic = {}
        dic['category'] = block.find('div', 'cat01').text
        dic['sum_title'] = block.find('div', 'sum_title').text.strip()
        # Links on the page are protocol-relative ("//technews.tw/..."),
        # so prepend the scheme.
        sum_title_url = block.find('div', 'img').find('a').get('href')
        dic['sum_title_url'] = 'https:' + sum_title_url
        # Collect the related "spotlist" links nested inside this block.
        spotlist = []
        for li in block.find_all('li', 'spotlist'):
            dic_spotlist = {}
            dic_spotlist['title'] = li.text.strip()
            dic_spotlist['url'] = 'https:' + li.find('a').get('href')
            spotlist.append(dic_spotlist)
        dic['spotlist'] = spotlist
        result.append(dic)
    except Exception as e:
        # A block missing an expected element is logged and skipped, not fatal.
        # (Fixed: the original also did `print(i)` with `i` undefined, which
        # would raise NameError inside this handler.)
        print(e)
        continue

file_name = 'technews.json'
with open(file_name, 'w', encoding='utf8') as file:
    json.dump(result, file, ensure_ascii=False)
# Stage 2: re-read technews.json, download every summary article and every
# spotlist article, and append their body paragraphs to per-article .txt files.
file_name = 'technews.json'
with open(file_name, 'r', encoding='utf8') as file:
    articles = json.load(file)

for article in articles:
    try:
        res = requests.get(article['sum_title_url'], timeout=30)
        soup = bs(res.text, 'lxml')
        # Article body paragraphs live under <div class="indent"> on technews.tw.
        file_name = 'sum_' + article['category'] + '_' + article['sum_title'][0:4] + '.txt'
        # encoding='utf8' so Chinese text survives on platforms whose default
        # encoding is not UTF-8 (e.g. Windows); open once instead of reopening
        # the same file for every paragraph.
        with open(file_name, 'a', encoding='utf8') as file:
            for p_text in soup.select('div.indent > p'):
                file.write(p_text.text.strip())
        for spot in article['spotlist']:
            res2 = requests.get(spot['url'], timeout=30)
            soup2 = bs(res2.text, 'lxml')
            file_name2 = 'spot_' + article['category'] + '_' + spot['title'][0:4] + '.txt'
            with open(file_name2, 'a', encoding='utf8') as file:
                for p_text in soup2.select('div.indent > p'):
                    file.write(p_text.text.strip())
    except Exception as e:
        # Skip articles that fail to download or parse instead of aborting the
        # whole run. (Fixed: the original also did `print(i)` with `i`
        # undefined, which would raise NameError inside this handler.)
        print(e)
        continue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment