Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save QuantumCalzone/7c728be7cd44e732e7487ffa0ff90a54 to your computer and use it in GitHub Desktop.
Save QuantumCalzone/7c728be7cd44e732e7487ffa0ff90a54 to your computer and use it in GitHub Desktop.
Scrapes your saved Facebook links, cleans them up, and exports them as a list to a txt file
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
import os
import time
import sys
import bs4
from urllib.parse import unquote
import unittest, time, re
from bs4 import BeautifulSoup as soup
# Output text file for the scraped links (one URL per line).
filename = "SavedFacebookLinks.text"
# f is the normal convention for a File Writer | "w" stands for Write.
# Opened at module level so Sel.test_sel below can write to it.
# utf-8 so non-ASCII characters in saved URLs don't raise on write
# (the platform default codepage on Windows often cannot encode them).
f = open(filename, "w", encoding="utf-8")

print("\n")
# How many times to scroll to the bottom of the saved-items page
# (each scroll makes Facebook lazy-load another batch of items).
loops = int(input("Enter loops: "))
print("\n")
# Seconds to wait between scrolls so the page has time to load.
sleepTime = int(input("Enter sleepTime: "))

# Printed at the very end as a completion marker.
debug = "Done!"
class Sel(unittest.TestCase):
    """Scrolls through https://www.facebook.com/saved/ to lazy-load every
    saved item, then scrapes, cleans, and writes each saved link to the
    module-level file ``f``.

    Relies on the module-level ``loops``, ``sleepTime``, ``f`` and ``debug``
    defined above.
    """

    def setUp(self):
        """Start Chrome reusing the real user profile so the existing
        Facebook login session is available (no login step needed)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--ignore-ssl-errors")
        # Load real user data so we are already authenticated.
        options.add_argument(r"--user-data-dir=C:\Users\georg_000\AppData\Local\Google\Chrome\User Data\Default")
        # fullscreen
        # options.add_argument("--start-maximized")
        # Raw string: the original plain string relied on "\P", "\D", "\c"
        # not being recognized escapes — a DeprecationWarning in Python 3
        # and a future SyntaxError.
        chromedriver = r"C:\Python27\DownloadedTools\chromedriver_win32\chromedriver.exe"
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = webdriver.Chrome(options=options, executable_path=chromedriver)
        self.verificationErrors = []
        self.accept_next_alert = True

    def tearDown(self):
        """Always quit the browser, even if test_sel raised — the original
        script leaked the Chrome process on every run."""
        self.driver.quit()

    def test_sel(self):
        driver = self.driver
        driver.get("https://www.facebook.com/saved/")
        # driver.find_element_by_link_text("All").click()
        print("\n")
        time.sleep(sleepTime)
        # Scroll to the bottom `loops` times so Facebook lazy-loads all
        # saved items. Off-by-one fixed: range(1, loops) only ran
        # loops - 1 times while printing "loop i / loops".
        for i in range(1, loops + 1):
            print("loop " + str(i) + " / " + str(loops))
            time.sleep(sleepTime)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        html_source = driver.page_source
        # data = html_source.encode('utf-8')
        # html parsing
        page_soup = soup(html_source, "html.parser")
        # NOTE(review): these class names are Facebook-internal and brittle;
        # they will silently match nothing when Facebook reskins the page.
        savedLinks = page_soup.findAll("div", {"class": "_4bl9 _5yjp"})
        print("\n")
        for savedLink in savedLinks:
            title = savedLink.a
            if title is None:
                # Guard: a matching div without an <a> child would have
                # crashed the original with an AttributeError.
                continue
            print(title)
            link = unquote(title["href"])
            # Clean up the links: strip Facebook's redirect wrapper
            # and the site's own domain prefix.
            link = link.replace("https://l.facebook.com/l.php?u=", "")
            link = link.replace("https://facebook.com", "")
            # Native video links are relative paths; restore the domain.
            if "/videos/" in link and "." not in link:
                link = "https://facebook.com" + link
            # Clean up the links: drop tracking/query parameters.
            # Raw strings fix the invalid "\?" escape in the original.
            link = re.sub(r"&h=?.*", "", link)
            link = re.sub(r"\?(.*)", "", link)
            print(link)
            print("\n")
            f.write(link + "\n")
        print("\n")
        print("savedLinks: " + str(len(savedLinks)))
        f.close()
        print("\n")
        print(debug)
# Run the scraper through the unittest runner when executed as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment