Facebook fan-page post crawler: scrolls a page's mobile posts feed with Selenium, parses each post's content, timestamp, permalink, and reaction count with BeautifulSoup, and writes the result to a tab-separated CSV.
from selenium import webdriver
from bs4 import BeautifulSoup
import time as tm
import pandas as pd
from random import randint

# Configure the webdriver; the prefs entry disables Facebook's notification pop-up
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome('chromedriver-80', options=chrome_options)
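# Note (not part of the original gist): the positional chromedriver path above is
# Selenium 3 style. On Selenium 4 the path goes through a Service object instead:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('chromedriver-80'), options=chrome_options)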
# Open the target fan page (mobile site) in the automated browser
driver.get("https://m.facebook.com/pg/<target-page>/posts/")

# Four lists that accumulate the parsed fields
p_links, p_time, p_content, p_interaction = [], [], [], []
counter = 0
# Keep scrolling until roughly 2000 post rows have been collected
while counter < 2000:
    # Scroll to the bottom twice to trigger lazy loading, pausing randomly
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    tm.sleep(randint(0, 3))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    tm.sleep(randint(0, 3))
    soup = BeautifulSoup(driver.page_source, "lxml")
    # Each <article> element is one post
    post_div = soup.find_all("article")
    for j, i in enumerate(post_div):
        # Post content: all text following the header
        try:
            p_content.append(''.join(k.text for k in i.find('header').find_next_siblings()))
        except AttributeError:
            p_content.append('')
            print('content' + str(j))  # log which field failed for post j
        # Post timestamp
        try:
            p_time.append(i.find("header").find("abbr").text)
        except AttributeError:
            try:
                p_time.append(i.find("header").find_next_sibling().find('abbr').text)
            except AttributeError:
                p_time.append('')
                print('time' + str(j))
        # Post permalink
        try:
            p_links.append(i.find("header").find('h3').find_next_sibling().find('a')['href'])
        except (AttributeError, TypeError):
            try:
                p_links.append(i.find("header").find('a')['href'])
            except (AttributeError, TypeError):
                p_links.append('')
                print('link' + str(j))
        # Interaction (reaction) count from the post footer
        try:
            p_interaction.append(i.find('footer').find("span").text)
        except AttributeError:
            p_interaction.append('')
            print('reaction' + str(j))
        counter += 1
        # Checkpoint: overwrite an interim file every 500 rows
        if counter % 500 == 1:
            df = pd.DataFrame()
            columns = ['p_content', 'p_time', 'p_links', 'p_interaction']
            cols = [p_content, p_time, p_links, p_interaction]
            for name, data in zip(columns, cols):
                df[name] = data
            df.to_csv('tsai-fb-re-m.csv', sep='\t', encoding='utf-8')
# Final save: duplicates accumulate because every scroll pass re-parses the
# whole page, so drop them before writing
df = pd.DataFrame()
columns = ['p_content', 'p_time', 'p_links', 'p_interaction']
cols = [p_content, p_time, p_links, p_interaction]
for name, data in zip(columns, cols):
    df[name] = data
df = df.drop_duplicates()  # drop_duplicates returns a new frame; reassign it
df.to_csv('filename.csv', sep='\t', encoding='utf-8')
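
Both files are written with a tab separator, so they must be read back the same way. A minimal sketch (the filename mirrors the placeholder above):

import pandas as pd

# Read the tab-separated output back; column 0 is the saved index
df = pd.read_csv('filename.csv', sep='\t', index_col=0, encoding='utf-8')
print(df.shape)
print(df[['p_time', 'p_links']].head())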
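
The fixed random sleeps can waste time or fire too early on a slow connection. An alternative is to wait explicitly until the page actually grows after a scroll. A sketch, not part of the original gist, assuming the `driver` created above; note that `until` raises a TimeoutException once Facebook stops loading more posts:

from selenium.webdriver.support.ui import WebDriverWait

# Record the height before scrolling, then wait up to 10 s for it to increase
prev_height = driver.execute_script("return document.body.scrollHeight")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script("return document.body.scrollHeight") > prev_height
)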