@Kiwibp
Created June 8, 2018 17:04
# facebook marketplace scraper
from time import sleep

from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class App:

    def __init__(self, email="", password="",
                 path='/Users/keenanburkepitts/Documents/Data_Science/NYC_DSA/final_project'):
        self.email = email
        self.password = password
        self.path = path
        self.driver = webdriver.Firefox()
        self.main_url = "https://www.facebook.com"
        self.client = MongoClient('mongodb://localhost:27017/')
        self.driver.get(self.main_url)
        self.log_in()
        self.used_item_links = []
        self.scrape_item_links()
        self.scrape_item_details(self.used_item_links)
        self.driver.quit()

    def log_in(self):
        try:
            email_input = self.driver.find_element_by_id("email")
            email_input.send_keys(self.email)
            sleep(0.5)
            password_input = self.driver.find_element_by_id("pass")
            password_input.send_keys(self.password)
            sleep(0.5)
            login_button = self.driver.find_element_by_xpath("//*[@type='submit']")
            login_button.click()
            sleep(3)
        except Exception:
            print('An exception occurred while trying to find the username or password field.')

    def scrape_item_links(self):
        marketplace_button = self.driver.find_element_by_xpath('//div[contains(text(), "Marketplace")]')
        marketplace_button.click()
        # Create a list of each section and loop through it.
        sections = [self.driver.find_element_by_xpath('//div[contains(text(), "Home & Garden")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Entertainment")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Clothing & Accessories")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Family")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Electronics")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Hobbies")]'),
                    self.driver.find_element_by_xpath('//div[contains(text(), "Vehicles & Bicycles")]')]
        for category in sections:
            # self.driver.implicitly_wait(300)
            category.click()
            # Scroll to the bottom repeatedly so more listings lazy-load.
            for i in range(100):
                try:
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    sleep(1.5)
                except Exception:
                    pass
            full_items_list = self.driver.find_elements_by_xpath("//a[@class='_1oem']")
            # self.used_item_links starts as [] in __init__, so extend() covers both
            # the first and later categories; extend() mutates the list in place and
            # returns None, so its result must never be reassigned.
            self.used_item_links.extend(item.get_attribute('href') for item in full_items_list)
            # wait = WebDriverWait(self.driver, 10)
            # category_element = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, category.get_attribute('class'))))
            # category_element.click()
            print([item.get_attribute('href') for item in full_items_list])
        print(self.used_item_links)
        return self.used_item_links

    def scrape_item_details(self, used_item_links):
        # LOCAL_USED_ITEMS was undefined in the original script; assume it names a
        # database on the local MongoDB server connected in __init__.
        db = self.client['LOCAL_USED_ITEMS']
        for url in used_item_links:
            self.driver.get(url)
            sleep(0.5)
            try:
                image_element = self.driver.find_element_by_xpath('//img[contains(@class, "_5m")]')
                images = [image_element.get_attribute('src')]
            except Exception:
                # Must be an empty list (not ""), since images may be appended to below.
                images = []
            try:
                title = self.driver.find_element_by_xpath('//div[contains(@class, " _50f")]/span').text
            except Exception:
                title = ""
            try:
                date_time = self.driver.find_element_by_xpath('//a[@class="_r3j"]').text
            except Exception:
                date_time = ""
            try:
                location = self.driver.find_element_by_xpath('//span[@class="_7yi"]').text
            except Exception:
                location = ""
            try:
                price = self.driver.find_element_by_xpath('//div[contains(@class, "_5_md")]').text
            except Exception:
                price = ""
            try:
                # Expand the truncated description if a "More" link is shown.
                if self.driver.find_element_by_xpath("//a[@title='More']").is_displayed():
                    self.driver.find_element_by_xpath("//a[@title='More']").click()
                description = self.driver.find_element_by_xpath('//p[@class="_4etw"]/span').text
            except Exception:
                description = ""
            try:
                # Click through the image carousel until it cycles back to an
                # image that has already been captured.
                previous_and_next_buttons = self.driver.find_elements_by_xpath("//i[contains(@class, '_3ffr')]")
                next_image_button = previous_and_next_buttons[1]
                while next_image_button.is_displayed():
                    next_image_button.click()
                    image_element = self.driver.find_element_by_xpath('//img[contains(@class, "_5m")]')
                    sleep(1)
                    if image_element.get_attribute('src') in images:
                        break
                    images.append(image_element.get_attribute('src'))
            except Exception:
                pass
            item = {'Url': url,
                    'Images': images,
                    'Title': title,
                    'Description': description,
                    'Date_Time': date_time,
                    'Location': location,
                    'Price': price}
            # insert() is deprecated in pymongo 3 and removed in 4; use insert_one().
            db.Facebook_items.insert_one(item)
            print(item)


if __name__ == '__main__':
    app = App()
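
Note for anyone running this today: the find_element_by_* helpers used above were removed in Selenium 4. A minimal sketch of the equivalent calls under current Selenium, reusing the script's own (possibly stale) Facebook selectors:

# Selenium 4 style: driver.find_element(By.XPATH, ...) replaces the removed
# find_element_by_xpath(...) helpers; the locators themselves are unchanged.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://www.facebook.com")
email_input = driver.find_element(By.ID, "email")
login_button = driver.find_element(By.XPATH, "//*[@type='submit']")
full_items_list = driver.find_elements(By.XPATH, "//a[@class='_1oem']")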
Hamzosb commented Jul 4, 2020

@carlobianchi89 give me your email and I will send you a demonstration.

kiwikot commented Oct 29, 2020

Hi,
I'm trying to execute:
$ python3.4m Facebook-Marketplace-Selenium.py
and I'm getting:

File "Facebook-Marketplace-Selenium.py", line 158, in <module> app = App() File "Facebook-Marketplace-Selenium.py", line 18, in __init__ self.driver = webdriver.Firefox() File "/usr/lib/python3.4/site-packages/selenium/webdriver/firefox/webdriver.py", line 157, in __init__ self.service.start() File "/usr/lib/python3.4/site-packages/selenium/webdriver/common/service.py", line 76, in start stdin=PIPE) File "/usr/lib64/python3.4/subprocess.py", line 858, in __init__ restore_signals, start_new_session) File "/usr/lib64/python3.4/subprocess.py", line 1456, in _execute_child raise child_exception_type(errno_num, err_msg) NotADirectoryError: [Errno 20] Not a directory

Any idea what I'm doing wrong?
Cheers


Kiwibp (author) commented Oct 31, 2020

@kiwikot based on that error message, my guess is you just need to specify the folder you're going to send the data to:

def __init__(self, email="", password="",
             path='specify/folderpath/here'):

ohliuw commented Dec 19, 2020

How do I go about scraping only new postings (e.g. from the last 24 hours)? Also, what area does this script search? I want to be able to search nationwide.

Thanks

@osrstaurean

Hello, from what I could find, these lines of code aren't defined. How is LOCAL_USED_ITEMS supposed to work?
db = LOCAL_USED_ITEMS

    db.Facebook_items.insert({ 'Url': url,
                               'Images': images,
                               'Title': title,
                               'Description': description,
                               'Date_Time': date_time,
                               'Location': location,
                               'Price': price,
                             })

Thank you for any help :)


Kiwibp (author) commented Mar 15, 2022

@ohliuw this script only scrapes when you run it, so you would need to run it daily to capture new posts. I haven't used it in a while, but you could automate running the script every 24 hours; see the sketch below. Also, the area the script covers depends on the approximate location you're signing in from. It sounds like you're looking for more commercialized scraping, which is beyond the scope of this project.
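
A hedged sketch of that automation (not part of the original gist): a long-running Python loop that re-runs the scrape once a day, assuming the gist is saved as facebook_marketplace_selenium.py so App is importable. A cron entry such as `0 9 * * * python3 facebook_marketplace_selenium.py` would do the same job without keeping a process alive.

from time import sleep

from facebook_marketplace_selenium import App  # hypothetical module name for this gist

while True:
    App(email="you@example.com", password="...")  # one full scrape; App quits the driver itself
    sleep(24 * 60 * 60)  # wait 24 hours before the next run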

@osrstaurean that method simply inserts the fields for each scraped item into the database. The values in the dictionary created for each item are parsed out in the scrape_item_details method; LOCAL_USED_ITEMS is the database they're inserted into (see the sketch below for one way to define it).
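
For anyone hitting a NameError there: a minimal sketch of what LOCAL_USED_ITEMS could be bound to, assuming the same local MongoDB the script already connects to (the database name itself is an assumption, not something the gist defines):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
# Assumed: LOCAL_USED_ITEMS names a database on the local MongoDB server.
LOCAL_USED_ITEMS = client['LOCAL_USED_ITEMS']
LOCAL_USED_ITEMS.Facebook_items.insert_one({'Url': 'https://example.com/item'})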
