# linzino7/pp_z.py

## pp_z.py
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 04:38:47 2020

@author: Zino
"""

# 導入 模組(module)
import requests
# 導入 BeautifulSoup 模組(module)：解析HTML 語法工具
import bs4

import time

def get_one_page(URL, timeout=10):
    """Download one PTT board index page and print every post title on it.

    A detailed walkthrough of this code (in Chinese) is available at:
    https://medium.com/p/a8216873a9d3

    Args:
        URL: Address of the board index page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 10 so a stalled connection cannot hang the crawl.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # Send the "over18" cookie with the request; presumably this answers the
    # site's age-confirmation prompt (confirm against PTT's behavior).
    my_headers = {'cookie': 'over18=1;'}
    # Without an explicit timeout, requests would wait on the server forever.
    response = requests.get(URL, headers=my_headers, timeout=timeout)

    # Parse the returned HTML with the standard-library backed parser.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # Keep only <div> elements whose class is "title".
    titles = soup.find_all('div', 'title')

    # Print the text of each matching element, one per line.
    for t in titles:
        print(t.text.strip())  # strip() drops the surrounding whitespace.


start = 38880  # Index number of the first page to crawl (adjust it yourself).
number = 5     # How many pages to crawl, counting down from the start page.
end = start - number

# Visit the index numbers in descending order: start, start - 1, ..., end + 1.
for i in reversed(range(end + 1, start + 1)):
    # Assemble the full URL of this index page.
    link = "".join(["https://www.ptt.cc/bbs/Gossiping/index", str(i), ".html"])
    # Crawl this single page.
    get_one_page(link)
    # Pause between requests so PTT does not block us for crawling too fast.
    time.sleep(1)
	# -*- coding: utf-8 -*-
	"""
	Created on Tue May 12 04:38:47 2020

	@author: Zino
	"""

	# 導入模組(module)
	import requests
	# 導入 BeautifulSoup 模組(module)：解析HTML 語法工具
	import bs4

	import time

	def get_one_page(URL, timeout=10):
		"""Download one PTT board index page and print every post title on it.

		A detailed walkthrough of this code (in Chinese) is available at:
		https://medium.com/p/a8216873a9d3

		Args:
			URL: Address of the board index page to fetch.
			timeout: Seconds to wait for the server before giving up.
				Defaults to 10 so a stalled connection cannot hang the crawl.

		Raises:
			requests.exceptions.Timeout: If the server does not respond
				within ``timeout`` seconds.
		"""
		# Send the "over18" cookie with the request; presumably this answers
		# the site's age-confirmation prompt (confirm against PTT's behavior).
		my_headers = {'cookie': 'over18=1;'}
		# Without an explicit timeout, requests would wait on the server forever.
		response = requests.get(URL, headers=my_headers, timeout=timeout)

		# Parse the returned HTML with the standard-library backed parser.
		soup = bs4.BeautifulSoup(response.text, "html.parser")

		# Keep only <div> elements whose class is "title".
		titles = soup.find_all('div', 'title')

		# Print the text of each matching element, one per line.
		for t in titles:
			print(t.text.strip())  # strip() drops the surrounding whitespace.


	start = 38880  # Index number of the first page to crawl (adjust it yourself).
	number = 5  # How many pages to crawl, counting down from the start page.
	end = start - number

	# Step downward through the index numbers: start, start - 1, ..., end + 1.
	for i in range(start, end, -1):
		# Assemble the full URL of this index page.
		link = "https://www.ptt.cc/bbs/Gossiping/index"+str(i)+".html"
		# Crawl this single page.
		get_one_page(link)
		# Pause between requests so PTT does not block us for crawling too fast.
		time.sleep(1)