Skip to content

Instantly share code, notes, and snippets.

@e96031413
Created January 30, 2020 06:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save e96031413/e92647ecd62c3bb975b43308767337af to your computer and use it in GitHub Desktop.
Save e96031413/e92647ecd62c3bb975b43308767337af to your computer and use it in GitHub Desktop.
臉書粉絲專頁爬蟲-以FB台灣大哥大粉絲專頁為例
# 程式碼來自:https://tlyu0419.github.io/2019/05/01/Crawl-Facebook/
# 程式碼最後匯出成Excel的函式需要xlrd套件(pip install xlrd)
import pandas as pd
import re, time, requests
from selenium import webdriver
from bs4 import BeautifulSoup
def FindLinks(url, n):
Links = []
driver.get(url)
for i in range(n):
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# 這裡會跳出要我們登入的大畫面,找到「稍後再說」的按鈕並點擊
driver.find_element_by_xpath('//a[@id="expanding_cta_close_button"]').click()
soup = BeautifulSoup(driver.page_source)
posts = soup.findAll('div', {'class':'clearfix y_c3pyo2ta3'})
for i in posts:
Links.append('https://www.facebook.com' + i.find('a',{'class':'_5pcq'}).attrs['href'].split('?',2)[0])
return Links
def expand(url):
driver.get(url)
try:
driver.find_element_by_xpath('//a[@lang="en_US"]').click()
except:
print("Now is in EN_US")
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# 點擊「comments」,藉以展開留言
try:
driver.find_element_by_xpath('//div[@class="_5pcr userContentWrapper"]//a[@data-testid="UFI2CommentsCount/root"]').click()
time.sleep(1)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
time.sleep(1)
driver.find_element_by_id('expanding_cta_close_button').click()
except:
print('There is no comment!')
k = 1
while k != 0:
k = 0
for i in driver.find_elements_by_xpath('//div[@class="_5pcr userContentWrapper"]//div[@data-testid="UFI2CommentsList/root_depth_0"]//a[@role="button"]'):
# 反覆偵測是否有「看更多留言」、「看更多回覆」與「看完整貼文內容」等按鈕,若有擇點擊
if bool(re.search('comment|More|Repl',i.text)) == True :
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
time.sleep(2)
try:
driver.find_element_by_xpath('//div[@style="display: block;"]//a[@id="expanding_cta_close_button"]').click()
except:
print('No pupup!')
try:
i.click()
except:
print('Nothing')
time.sleep(2)
k += 1
# 文章內容與互動摘要
def PostContent(soup):
# po文區塊
userContent = soup.find('div', {'class':'_5pcr userContentWrapper'})
# po文人資訊區塊
PosterInfo = userContent.find('div', {'class':'l_c3pyo2v0u i_c3pynyi2f clearfix'})
# 互動摘要區(讚、留言與分享)
feedback = soup.find('form', {'class':'commentable_item collapsed_comments'})
# 名稱
Name = PosterInfo.find('img').attrs['aria-label']
# ID
ID = PosterInfo.find('a', {'class':'_5pb8 o_c3pynyi2g _8o _8s lfloat _ohe'}).attrs['href'].split('/?',2)[0].split('/',-1)[-1]
# 網址
Link = driver.current_url
# 發文時間
try:
Time = PosterInfo.find('abbr').attrs['title']
except:
Time = PosterInfo.find('div', {'class':'_1atc fsm fwn fcg'}).text
# 文章內容
try:
Content = userContent.find('div', {'class':'_5pbx userContent _3576'}).text
except:
Content = ""
# Like
try:
Like = feedback.find('span', {'data-testid':'UFI2TopReactions/tooltip_LIKE'}).find('a').attrs['aria-label']
except:
Like = '0'
# Angry
try:
ANGER = feedback.find('span', {'data-testid':'UFI2TopReactions/tooltip_ANGER'}).find('a').attrs['aria-label']
except:
ANGER = '0'
# HAHA
try:
HAHA = feedback.find('span', {'data-testid':'UFI2TopReactions/tooltip_HAHA'}).find('a').attrs['aria-label']
except:
HAHA = '0'
# 留言
try:
commentcount = feedback.find('a', {'data-testid':'UFI2CommentsCount/root'}).text
except:
commentcount = '0'
# 分享
try:
share = feedback.find('span', {'class':'_355t _4vn2'}).text
except:
share = '0'
return pd.DataFrame(
data = [{'Name':Name,
'ID':ID,
'Link':Link,
'Time':Time,
'Content':Content,
'Like':Like,
'ANGER':ANGER,
"HAHA":HAHA,
'commentcount':commentcount,
'share':share}],
columns = ['Name', 'ID', 'Time', 'Content', 'Like', 'ANGER', 'HAHA', 'commentcount', 'share', 'Link'])
# 留言
def CrawlComment(soup):
Comments = pd.DataFrame()
# po文區塊
userContent = soup.find('div', {'class':'_5pcr userContentWrapper'})
# 用戶留言區
userContent = soup.find('div', {'class':'_5pcr userContentWrapper'})
# 回應貼文的留言
for i in userContent.findAll('div', {'data-testid':'UFI2Comment/root_depth_0'}):
try:
CommentContent = i.find('span', {'dir':'ltr'}).text
except:
CommentContent = 'Sticker'
Comment = pd.DataFrame(data = [{'CommentID':i.find('a', {'class':' _3mf5 _3mg0'}).attrs['data-hovercard'].split('id=',2)[1],
'CommentName':i.find('img').attrs['alt'],
'CommentTime':i.find('abbr',{'class':'livetimestamp'}).attrs['data-tooltip-content'],
'CommentContent':CommentContent,
'Link':driver.current_url}],
columns = ['CommentID', 'CommentName', 'CommentTime', 'CommentContent', 'Link'])
Comments = pd.concat([Comments, Comment], ignore_index=True)
# 回應留言的留言
for i in userContent.findAll('div', {'data-testid':'UFI2Comment/root_depth_1'}):
try:
CommentContent = i.find('span', {'dir':'ltr'}).text
except:
CommentContent = 'Sticker'
Comment = pd.DataFrame(data = [{'CommentID':i.find('a', {'class':' _3mf5 _3mg1'}).attrs['data-hovercard'].split('id=',2)[1],
'CommentName':i.find('img').attrs['alt'],
'CommentTime':i.find('abbr',{'class':'livetimestamp'}).attrs['data-tooltip-content'],
'CommentContent':CommentContent,
'Link':driver.current_url}],
columns = ['CommentID', 'CommentName', 'CommentTime', 'CommentContent', 'Link'])
Comments = pd.concat([Comments, Comment], ignore_index=True)
return Comments
driver = webdriver.Chrome()
Links = FindLinks(url = 'https://facebook.com/taiwanmobile/',
n = 20)
Links
# 抓下來所有留言
PostsInformation = pd.DataFrame()
PostsComments = pd.DataFrame()
for i in Links:
print('Dealing with: ' + i)
try:
expand(i)
soup = BeautifulSoup(driver.page_source)
PostsInformation = pd.concat([PostsInformation, PostContent(soup)],ignore_index=True)
PostsComments = pd.concat([PostsComments, CrawlComment(soup)],ignore_index=True)
except:
print('Load Failed: ' + i)
PostsInformation
PostsComments
PostsInformation.to_excel('PostsInformation.xlsx')
PostsComments.to_excel('PostsComments.xlsx')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment