Last active
May 5, 2020 04:41
-
-
Save u0401006/5914d98393484c40a74e1db3154df34c to your computer and use it in GitHub Desktop.
臉書粉絲頁貼文爬蟲 (Facebook fan-page post crawler)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Facebook fan-page post crawler.
# Scrolls the mobile Facebook page with Selenium so posts lazy-load, parses
# each <article> element with BeautifulSoup, accumulates content / timestamp /
# permalink / interaction counts in four lockstep lists, and periodically
# checkpoints everything to CSV with pandas.
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time as tm
import pandas as pd
from random import randint

# Configure the webdriver; this "prefs" entry disables Facebook's
# notification permission pop-up, which would otherwise block scrolling.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
# `options=` replaces the `chrome_options=` keyword deprecated since
# Selenium 3.8 (removed in Selenium 4); behavior is identical.
driver = webdriver.Chrome('chromedriver-80', options=chrome_options)

# Open the target fan page (replace the placeholder path segment with the
# page you want). The mobile site renders each post as an <article> tag.
driver.get("https://m.facebook.com/pg/你要的頁面/posts/")

# Four lists that accumulate the parsed fields, one entry per post each.
p_links, p_time, p_content, p_interaction = [], [], [], []
counter = 0
while counter < 2000:
    # Scroll to the bottom twice with random pauses so more posts load
    # and the scraping pattern looks less robotic.
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    tm.sleep(randint(0, 3))
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    tm.sleep(randint(0, 3))
    soup = BeautifulSoup(driver.page_source, "lxml")
    # One <article> per post. NOTE: page_source contains ALL posts loaded
    # so far, so earlier posts are re-appended every pass — the final
    # drop_duplicates() below removes the repeats.
    post_div = soup.find_all("article")
    for j, i in enumerate(post_div):
        # Post content: the text of every sibling following the <header>.
        # A missing header raises AttributeError (None.find_next_siblings).
        try:
            p_content.append(''.join(k.text for k in i.find('header').find_next_siblings()))
        except AttributeError:
            p_content.append('')
            print('content' + str(j))
        # Post timestamp: <abbr> inside the header, or failing that in the
        # element right after the header.
        try:
            p_time.append(i.find("header").find("abbr").text)
        except AttributeError:
            try:
                p_time.append(i.find("header").find_next_sibling().find('abbr').text)
            except AttributeError:
                p_time.append('')
                print('time' + str(j))
        # Permalink: anchor after the header's <h3>, else any anchor in the
        # header. Subscripting a missing tag raises TypeError, a missing
        # parent raises AttributeError — catch both, nothing broader.
        try:
            p_links.append(i.find("header").find('h3').find_next_sibling().find('a')['href'])
        except (AttributeError, TypeError):
            try:
                p_links.append(i.find("header").find('a')['href'])
            except (AttributeError, TypeError):
                p_links.append('')
                print('link' + str(j))
        # Interaction counts: first <span> in the post footer.
        try:
            p_interaction.append(i.find('footer').find("span").text)
        except AttributeError:
            p_interaction.append('')
            print('reaction' + str(j))
        counter += 1
        # Checkpoint roughly every 500 appended posts so a crash mid-run
        # doesn't lose everything. (NOTE(review): reconstructed nesting —
        # the pasted source lost its indentation; confirm against intent.)
        if counter % 500 == 1:
            checkpoint = pd.DataFrame()
            columns = ['p_content', 'p_time', 'p_links', 'p_interaction']
            cols = [p_content, p_time, p_links, p_interaction]
            for name, data in zip(columns, cols):
                checkpoint[name] = data
            checkpoint.to_csv('tsai-fb-re-m.csv', sep='\t', encoding='utf-8')

# Final export: build the DataFrame, deduplicate the re-scraped posts,
# and write the result. The original called df.drop_duplicates() without
# keeping the return value, so the dedup silently did nothing — fixed by
# rebinding df (drop_duplicates is not in-place by default).
df = pd.DataFrame()
columns = ['p_content', 'p_time', 'p_links', 'p_interaction']
cols = [p_content, p_time, p_links, p_interaction]
for name, data in zip(columns, cols):
    df[name] = data
df = df.drop_duplicates()
df.to_csv('檔名.csv', sep='\t', encoding='utf-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment