Skip to content

Instantly share code, notes, and snippets.

@AnderRV
Created November 2, 2022 21:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AnderRV/ce1e59d4f626dfab25873cc98dea1c48 to your computer and use it in GitHub Desktop.
Save AnderRV/ce1e59d4f626dfab25873cc98dea1c48 to your computer and use it in GitHub Desktop.
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
def parse_img_url(url):
# get the first url
url = url.split(", ")[0]
# split it by `/`
splitted_url = url.split("/")
# loop over the elements to find where `cloudfront` url begins
for idx, part in enumerate(splitted_url):
if "cloudfront" in part:
# add the HTTP scheme and concatenate the rest of the URL
# then return the processed url
return "https://" + "/".join(splitted_url[idx:])
# as we don't know if that's the only measurement to take,
# return None if the cloudfront couldn't be found
return None
def extract_data(element):
img = element.find_element(By.TAG_NAME, "img").get_attribute("srcset")
img = parse_img_url(img)
# A>B means the B elements where A is the parent element.
dietary_attrs = element.find_elements(By.CSS_SELECTOR, "div[class*='DietaryAttributes']>span")
# if there aren't any, then `dietary_attrs` will be None and `if` block won't work
# but if there are any dietary attributes, extract the text from them
if dietary_attrs:
dietary_attrs = [attr.text for attr in dietary_attrs]
else:
# set the variable to None if there aren't any dietary attributes found.
dietary_attrs = None
# get the span elements where the parent is a `div` element that
# has `ItemBCardDefault` substring in the `class` attribute
price = element.find_elements(By.CSS_SELECTOR, "div[class*='ItemBCardDefault']>span")
# extract the price text if we could find the price span
if price:
price = price[0].text
else:
price = None
name = element.find_element(By.TAG_NAME, "h2").text
size = element.find_element(By.CSS_SELECTOR, "div[class*='Size']").text
return {
"price": price,
"name": name,
"size": size,
"attrs": dietary_attrs,
"img": img
}
# start by defining the options
options = webdriver.ChromeOptions()
options.headless = True # it's more scalable to work in headless mode
# normally, selenium waits for all resources to download
# we don't need it as the page also populated with the running javascript code.
options.page_load_strategy = 'none'
# this returns the path web driver downloaded
chrome_path = ChromeDriverManager().install()
chrome_service = Service(chrome_path)
# pass the defined options and service objects to initialize the web driver
driver = Chrome(options=options, service=chrome_service)
driver.implicitly_wait(5)
url = "https://www.instacart.com/store/sprouts/collections/bread?guest=True"
driver.get(url)
time.sleep(10)
content = driver.find_element(By.CSS_SELECTOR, "div[class*='ItemsGridWithPostAtcRecommendations'")
breads = content.find_elements(By.TAG_NAME, "li")
data = []
for bread in breads:
extracted_data = extract_data(bread)
data.append(extracted_data)
df = pd.DataFrame(data)
df.to_csv("result.csv", index=False)
driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment