Skip to content

Instantly share code, notes, and snippets.

@xtream1101
Last active January 31, 2022 03:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xtream1101/5d7ea9e2672a162aa565c46d17feabc0 to your computer and use it in GitHub Desktop.
Save xtream1101/5d7ea9e2672a162aa565c46d17feabc0 to your computer and use it in GitHub Desktop.
Instacart scraper
import json
import os
import time
from urllib.parse import quote

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from webdriverdownloader import GeckoDriverDownloader
# Download/install geckodriver once at import time. login() passes element
# [1] of the returned value to Firefox as the driver executable path.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install()

# Headers sent with every direct `requests` call so the API sees a
# browser-like User-Agent.
headers = {}
headers['User-Agent'] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
)
def login(email, password):
    """Log in to Instacart with headless Firefox and return the session cookies.

    Args:
        email: Account e-mail typed into the login form.
        password: Account password typed into the login form.

    Returns:
        dict mapping each browser cookie name to its value, ready to be
        passed as ``cookies=`` to ``requests``.

    Raises:
        selenium.common.exceptions.WebDriverException (or a subclass such as
        TimeoutException / NoSuchElementException) when the page does not
        contain the expected elements. The browser is closed in every case.
    """
    print("Logging in...")
    login_url = 'https://www.instacart.com/accounts/login'

    options = FirefoxOptions()
    options.add_argument("--headless")
    # `options=` is the supported spelling of the deprecated `firefox_options=`.
    # NOTE(review): `executable_path=` is kept for compatibility with the
    # Selenium release this script was written for; the newest Selenium
    # releases expect a Service object instead - confirm the pinned version.
    driver = webdriver.Firefox(options=options,
                               executable_path=geckodriver[1])
    try:
        driver.get(login_url)
        time.sleep(2)

        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        login_link = driver.find_element(
            By.XPATH, '//a[@class="ic-btn ic-btn-success no-underline"]')
        login_link.click()
        time.sleep(2)

        # until() returns the located element, so no second lookup is needed.
        email_input = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((
                By.XPATH, '//input[@type="email"]')))
        password_input = driver.find_element(
            By.XPATH, '//input[@type="password"]')

        email_input.clear()
        email_input.send_keys(email)
        password_input.clear()
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)
        time.sleep(2)

        # Requests does not need all this info in the cookies, strip it out
        return {cookie['name']: cookie['value']
                for cookie in driver.get_cookies()}
    finally:
        # Always shut the browser down; otherwise every call (and every
        # failure) leaves a headless Firefox process running.
        driver.quit()
def get_pdp(cookies, product_id, timeout=30):
    """Fetch the product detail page (PDP) JSON for one product.

    Args:
        cookies: Cookie name -> value mapping from ``login``.
        product_id: Product id; requested as ``item_<product_id>``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (the failure
        is printed).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    print(f"Getting product {product_id}...")
    url = f'https://www.instacart.com/v3/containers/items/item_{product_id}'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url,
                     cookies=cookies,
                     headers=headers,
                     timeout=timeout)
    if r.status_code == 200:
        return r.json()

    print(f"Failed to get product {product_id} {r}")
    return None
def process_pdp(pdp_data):
    """Save one product's PDP payload to ``products/product-<id>.json``.

    Args:
        pdp_data: Decoded PDP JSON. Must contain ``container.title`` and
            ``container.tracking_params.item_id``.

    Raises:
        KeyError: if the expected keys are missing from ``pdp_data``.
    """
    # Rather than just saving the data, pull out the data needed and do something with it
    product_title = pdp_data['container']['title']
    product_id = pdp_data['container']['tracking_params']['item_id']
    print(f"Processing: {product_id} - {product_title}")

    # Create the output folder on demand so the first write does not fail
    # with FileNotFoundError.
    os.makedirs('products', exist_ok=True)
    with open(f'products/product-{product_id}.json', 'w',
              encoding='utf-8') as outfile:
        json.dump(pdp_data, outfile, sort_keys=True, indent=4)
def search(cookies, term, timeout=30):
    """Run a product search and return the raw JSON response.

    Args:
        cookies: Cookie name -> value mapping from ``login``.
        term: Free-text search term; it is percent-encoded before being
            placed in the URL path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (the failure
        is printed).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    print(f"Searching: {term}...")
    # The term is a single path segment: encode everything, including '/',
    # so spaces, '?', '#' and slashes cannot alter the URL structure.
    encoded_term = quote(str(term), safe='')
    url = f'https://www.instacart.com/v3/containers/kroger/search_v3/{encoded_term}?source=web&per=50'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get(url,
                     cookies=cookies,
                     headers=headers,
                     timeout=timeout)
    if r.status_code == 200:
        return r.json()

    print(f"Failed to search for {term} {r}")
    return None
def extract_product_ids(search_results):
    """Return the product ids found in a search response.

    The items of the first module whose ``data`` contains an ``items`` key
    are used. Each item id is split on ``_`` and the second piece is kept
    (e.g. ``"item_123"`` -> ``"123"``).

    Args:
        search_results: Decoded JSON from ``search``, or ``None`` when the
            search failed.

    Returns:
        list of id strings; empty when there are no results or no module
        carries items.
    """
    # search() returns None on failure; treat that as "no products".
    if not search_results:
        return []

    for module in search_results['container']['modules']:
        data = module.get('data') or {}
        if 'items' in data:
            # Only the first module that carries items is considered.
            return [product['id'].split('_')[1] for product in data['items']]
    return []
# Just do this once per session
login_cookies1 = login("<email>", "<password>")

terms = ['cookies']
for term in terms:
    # First search the term
    search_results = search(login_cookies1, term)
    if search_results is None:
        # search() already printed the failure; move on to the next term
        # instead of crashing on a missing response.
        continue

    # Get all the product ids that are in the search results
    product_ids = extract_product_ids(search_results)
    print(f"Found {len(product_ids)} products for the term {term}")

    for product_id in product_ids:
        # For each product id, get the product details
        pdp_data = get_pdp(login_cookies1, product_id)
        if pdp_data is None:
            # get_pdp() already printed the failure; skip this product.
            continue

        # Extract/save the product details
        process_pdp(pdp_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment