Skip to content

Instantly share code, notes, and snippets.

@dreamline2
Created February 18, 2020 06:48
Show Gist options
  • Save dreamline2/6df1c4ff4e00e36404669b1437b45820 to your computer and use it in GitHub Desktop.
Save dreamline2/6df1c4ff4e00e36404669b1437b45820 to your computer and use it in GitHub Desktop.
Crawler for Facebook group posts.
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import numpy as np
import time
import csv
# Crawl a Facebook group page: keep scrolling to the bottom until the last
# timestamp found on the page falls on TARGET_DATE, then report how many post
# bodies were collected.
#
# NOTE(review): the original paste had lost all indentation.  This rewrite
# assumes the date was printed once per scroll (as progress output) and the
# post count once at the end -- confirm against the intended behaviour.

GROUP_URL = "https://www.facebook.com/groups/pythontw/"
CHROMEDRIVER_PATH = "./chromedriver"
# Date prefix (text before the first space of a timestamp) that ends the crawl.
TARGET_DATE = "2019/9/3"
# Pause after each scroll so the page has a chance to load more content
# before it is parsed again.
SCROLL_PAUSE_SECONDS = 2
# Give up after this many consecutive scrolls that did not make the page
# taller; without this the loop would spin forever if TARGET_DATE never
# shows up as the last timestamp.
MAX_STALLED_SCROLLS = 5


def extract_posts(soup):
    """Return the text of every element with class ``text_exposed_root``."""
    return [node.text for node in soup.find_all(class_="text_exposed_root")]


def extract_timestamps(soup):
    """Return the ``title`` attribute of every ``<abbr>`` tag that has one."""
    titles = (node.get("title") for node in soup.find_all("abbr"))
    return [title for title in titles if title is not None]


def reached_date(timestamps, target_date):
    """Tell whether the last timestamp falls on *target_date*.

    Only the part of the last timestamp before its first space is compared,
    and the comparison is an exact string match.  An empty list yields
    ``False`` instead of raising ``IndexError``.
    """
    if not timestamps:
        return False
    return timestamps[-1].split(" ")[0] == target_date


def scrape(driver):
    """Parse the driver's current page source into ``(posts, timestamps)``."""
    soup = BeautifulSoup(driver.page_source, "lxml")
    return extract_posts(soup), extract_timestamps(soup)


def crawl(driver, target_date=TARGET_DATE):
    """Scroll the loaded page until *target_date* is reached.

    Returns the list of post texts found on the page after the final scroll.
    The loop also stops when the page height has not changed for
    ``MAX_STALLED_SCROLLS`` consecutive scrolls.
    """
    post, real = scrape(driver)
    last_height = driver.execute_script("return document.body.scrollHeight")
    stalled = 0
    while not reached_date(real, target_date):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SECONDS)
        # The whole page is re-parsed on every pass, so both lists are
        # rebuilt from scratch rather than appended to.
        post, real = scrape(driver)
        if real:
            print(real[-1].split(" ")[0])
        height = driver.execute_script("return document.body.scrollHeight")
        if height == last_height:
            stalled += 1
            if stalled >= MAX_STALLED_SCROLLS:
                print("page stopped growing before reaching " + target_date)
                break
        else:
            stalled = 0
            last_height = height
    return post


def main():
    """Open the group page in Chrome, crawl it and print the post count."""
    chrome_options = webdriver.ChromeOptions()
    # Value 2 for this Chrome preference is presumably "block", i.e. suppress
    # the notification permission prompt -- confirm against Chrome's docs.
    prefs = {"profile.default_content_setting_values.notifications": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    # NOTE(review): Selenium >= 4.10 also drops the positional driver path
    # and expects ``service=Service(CHROMEDRIVER_PATH)`` -- adjust to the
    # installed Selenium version.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(GROUP_URL)
        post = crawl(driver)
        print(len(post))
    finally:
        # quit() ends the whole WebDriver session even when the crawl raised;
        # close() would only close the current window.
        driver.quit()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment