Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save QuantumCalzone/7c728be7cd44e732e7487ffa0ff90a54 to your computer and use it in GitHub Desktop.
Save QuantumCalzone/7c728be7cd44e732e7487ffa0ff90a54 to your computer and use it in GitHub Desktop.
Scrapes your saved Facebook links, cleans them up, and exports them as a list to a txt file
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
import os
import time
import sys
import bs4
from urllib.parse import unquote
import unittest, time, re
from bs4 import BeautifulSoup as soup
# Output text file for the scraped links (one URL per line).
filename = "SavedFacebookLinks.text"
# f is the normal convention for a File Writer | "w" stands for Write.
# Opened at module level so Sel.test_sel below can write to it.
# utf-8 so non-ASCII characters in saved URLs don't raise on write
# (the platform default codepage on Windows often cannot encode them).
f = open(filename, "w", encoding="utf-8")

print("\n")
# How many times to scroll to the bottom of the saved-items page
# (each scroll makes Facebook lazy-load another batch of items).
loops = int(input("Enter loops: "))
print("\n")
# Seconds to wait between scrolls so the page has time to load.
sleepTime = int(input("Enter sleepTime: "))

# Printed at the very end as a completion marker.
debug = "Done!"
class Sel(unittest.TestCase):
    """Scrolls through https://www.facebook.com/saved/ to lazy-load every
    saved item, then scrapes, cleans, and writes each saved link to the
    module-level file ``f``.

    Relies on the module-level ``loops``, ``sleepTime``, ``f`` and ``debug``
    defined above.
    """

    def setUp(self):
        """Start Chrome reusing the real user profile so the existing
        Facebook login session is available (no login step needed)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--ignore-ssl-errors")
        # Load real user data so we are already authenticated.
        options.add_argument(r"--user-data-dir=C:\Users\georg_000\AppData\Local\Google\Chrome\User Data\Default")
        # fullscreen
        # options.add_argument("--start-maximized")
        # Raw string: the original plain string relied on "\P", "\D", "\c"
        # not being recognized escapes — a DeprecationWarning in Python 3
        # and a future SyntaxError.
        chromedriver = r"C:\Python27\DownloadedTools\chromedriver_win32\chromedriver.exe"
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = webdriver.Chrome(options=options, executable_path=chromedriver)
        self.verificationErrors = []
        self.accept_next_alert = True

    def tearDown(self):
        """Always quit the browser, even if test_sel raised — the original
        script leaked the Chrome process on every run."""
        self.driver.quit()

    def test_sel(self):
        driver = self.driver
        driver.get("https://www.facebook.com/saved/")
        # driver.find_element_by_link_text("All").click()
        print("\n")
        time.sleep(sleepTime)
        # Scroll to the bottom `loops` times so Facebook lazy-loads all
        # saved items. Off-by-one fixed: range(1, loops) only ran
        # loops - 1 times while printing "loop i / loops".
        for i in range(1, loops + 1):
            print("loop " + str(i) + " / " + str(loops))
            time.sleep(sleepTime)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        html_source = driver.page_source
        # data = html_source.encode('utf-8')
        # html parsing
        page_soup = soup(html_source, "html.parser")
        # NOTE(review): these class names are Facebook-internal and brittle;
        # they will silently match nothing when Facebook reskins the page.
        savedLinks = page_soup.findAll("div", {"class": "_4bl9 _5yjp"})
        print("\n")
        for savedLink in savedLinks:
            title = savedLink.a
            if title is None:
                # Guard: a matching div without an <a> child would have
                # crashed the original with an AttributeError.
                continue
            print(title)
            link = unquote(title["href"])
            # Clean up the links: strip Facebook's redirect wrapper
            # and the site's own domain prefix.
            link = link.replace("https://l.facebook.com/l.php?u=", "")
            link = link.replace("https://facebook.com", "")
            # Native video links are relative paths; restore the domain.
            if "/videos/" in link and "." not in link:
                link = "https://facebook.com" + link
            # Clean up the links: drop tracking/query parameters.
            # Raw strings fix the invalid "\?" escape in the original.
            link = re.sub(r"&h=?.*", "", link)
            link = re.sub(r"\?(.*)", "", link)
            print(link)
            print("\n")
            f.write(link + "\n")
        print("\n")
        print("savedLinks: " + str(len(savedLinks)))
        f.close()
        print("\n")
        print(debug)
# Run the scraper through the unittest runner when executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment