# xtream1101/instacart_scraper.py

## instacart_scraper.py
import json
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from webdriverdownloader import GeckoDriverDownloader

# Download and install geckodriver as soon as the module is loaded.
# login() passes element [1] of the returned value to Firefox as the
# driver executable path.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install()


# Request headers sent with every requests.get() call in this file; the
# User-Agent is a desktop Chrome string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.110 Safari/537.36"
    ),
}

def login(email, password):
    """Log in through headless Firefox and return the session cookies.

    Args:
        email: Value typed into the login form's email field.
        password: Value typed into the login form's password field.

    Returns:
        A dict mapping each browser cookie name to its value, suitable for
        the ``cookies=`` argument of ``requests.get``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the login
            link or the password field is not on the page.
        selenium.common.exceptions.TimeoutException: If the email field
            does not appear within 5 seconds.
    """
    print("Logging in...")
    login_url = 'https://www.instacart.com/accounts/login'
    options = FirefoxOptions()
    options.add_argument("--headless")
    # `options=` is the supported keyword; `firefox_options=` is deprecated.
    driver = webdriver.Firefox(options=options,
                               executable_path=geckodriver[1])

    try:
        driver.get(login_url)
        time.sleep(2)
        login_link = driver.find_element(
            By.XPATH, '//a[@class="ic-btn ic-btn-success no-underline"]')
        login_link.click()
        time.sleep(2)
        # The wait hands back the located element, so no second lookup.
        email_input = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, '//input[@type="email"]')))
        password_input = driver.find_element(
            By.XPATH, '//input[@type="password"]')
        email_input.clear()
        email_input.send_keys(email)
        password_input.clear()
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        # Give the form submission time to set the session cookies.
        time.sleep(2)

        # Requests does not need all this info in the cookies, strip it out
        return {cookie['name']: cookie['value']
                for cookie in driver.get_cookies()}
    finally:
        # Always shut the browser down, even on failure, so no headless
        # Firefox process is left running.
        driver.quit()


def get_pdp(cookies, product_id, timeout=30):
    """Fetch the product detail page (PDP) JSON for one product.

    Args:
        cookies: Cookie name/value dict sent with the request.
        product_id: Id interpolated into the ``item_<id>`` container URL.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None`` (the failure is printed).
    """
    print(f"Getting product {product_id}...")
    url = f'https://www.instacart.com/v3/containers/items/item_{product_id}'
    try:
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url,
                         cookies=cookies,
                         headers=headers,
                         timeout=timeout)
    except requests.RequestException as err:
        # Treat transport errors like a non-200 response: report, give up.
        print(f"Failed to get product {product_id} {err}")
        return None

    if r.status_code == 200:
        return r.json()

    print(f"Failed to get product {product_id} {r}")
    return None

def process_pdp(pdp_data):
    """Save one product's PDP data as ``products/product-<item_id>.json``.

    Args:
        pdp_data: Decoded PDP JSON. Must contain ``container.title`` and
            ``container.tracking_params.item_id``.

    Raises:
        KeyError: If either of the required keys is missing.
    """
    import os  # local import keeps this function self-contained

    # Rather than just saving the data, pull out the data needed and do
    # something with it
    product_title = pdp_data['container']['title']
    product_id = pdp_data['container']['tracking_params']['item_id']

    print(f"Processing: {product_id} - {product_title}")
    # Create the output folder on first use so open() cannot fail on it.
    os.makedirs('products', exist_ok=True)
    with open(f'products/product-{product_id}.json', 'w') as outfile:
        json.dump(pdp_data, outfile, sort_keys=True, indent=4)


def search(cookies, term, timeout=30):
    """Run a search against the kroger ``search_v3`` container endpoint.

    Args:
        cookies: Cookie name/value dict sent with the request.
        term: Raw search text; it is percent-encoded before being placed
            in the URL path.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None`` (the failure is printed).
    """
    from urllib.parse import quote  # local import, stdlib only

    print(f"Searching: {term}...")
    # safe='' also encodes '/', which would otherwise split the path segment.
    encoded_term = quote(term, safe='')
    url = f'https://www.instacart.com/v3/containers/kroger/search_v3/{encoded_term}?source=web&per=50'
    try:
        # Without a timeout a stalled connection would block forever.
        r = requests.get(url,
                         cookies=cookies,
                         headers=headers,
                         timeout=timeout)
    except requests.RequestException as err:
        # Treat transport errors like a non-200 response: report, give up.
        print(f"Failed to search for {term} {err}")
        return None

    if r.status_code == 200:
        return r.json()

    print(f"Failed to search for {term} {r}")
    return None

def extract_product_ids(search_results):
    """Return the product ids found in a search response.

    Only the first module whose ``data`` holds an ``items`` list is used.
    Each item id is split on ``_`` and the second piece is kept.

    Args:
        search_results: Decoded search JSON with ``container.modules``.

    Returns:
        A list of id strings; empty when no module carries items.
    """
    modules = search_results['container']['modules']
    # Lazily scan the modules and stop at the first one that has items.
    items = next(
        (entry['data']['items'] for entry in modules
         if 'items' in entry['data']),
        [],
    )
    return [item['id'].split('_')[1] for item in items]

# Just do this once per session
login_cookies1 = login("<email>", "<password>")


terms = ['cookies']
for term in terms:
    # First search the term
    search_results = search(login_cookies1, term)
    if search_results is None:
        # search() returns None on failure and has already printed why;
        # there is nothing to extract for this term.
        continue
    # Get all the product ids that are in the search results
    product_ids = extract_product_ids(search_results)
    print(f"Found {len(product_ids)} products for the term {term}")
    for product_id in product_ids:
        # For each product id, get the product details
        pdp_data = get_pdp(login_cookies1, product_id)
        if pdp_data is None:
            # get_pdp() returns None on failure; skip instead of crashing
            # inside process_pdp().
            continue
        # Extract/save the product details
        process_pdp(pdp_data)
	import json
	import time
	import requests
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.firefox.options import Options as FirefoxOptions
	from webdriverdownloader import GeckoDriverDownloader

	# Download and install geckodriver; login() passes element [1] of the
	# returned value to Firefox as the driver executable path.
	# NOTE(review): this region repeats the earlier definitions with the
	# indentation collapsed to one tab - confirm whether it should exist.
	gdd = GeckoDriverDownloader()
	geckodriver = gdd.download_and_install()


	# Request headers sent with every requests.get() call; the User-Agent
	# is a desktop Chrome string.
	headers = {
		'User-Agent': (
			"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
			"AppleWebKit/537.36 (KHTML, like Gecko) "
			"Chrome/70.0.3538.110 Safari/537.36"
		),
	}

	def login(email, password):
		"""Log in through headless Firefox and return the session cookies.

		Args:
			email: Value typed into the login form's email field.
			password: Value typed into the login form's password field.

		Returns:
			A dict mapping each browser cookie name to its value, suitable
			for the ``cookies=`` argument of ``requests.get``.

		Raises:
			selenium.common.exceptions.NoSuchElementException: If the login
				link or the password field is not on the page.
			selenium.common.exceptions.TimeoutException: If the email field
				does not appear within 5 seconds.
		"""
		print("Logging in...")
		login_url = 'https://www.instacart.com/accounts/login'
		options = FirefoxOptions()
		options.add_argument("--headless")
		# `options=` is the supported keyword; `firefox_options=` is deprecated.
		driver = webdriver.Firefox(options=options,
			executable_path=geckodriver[1])

		try:
			driver.get(login_url)
			time.sleep(2)
			login_link = driver.find_element(
				By.XPATH, '//a[@class="ic-btn ic-btn-success no-underline"]')
			login_link.click()
			time.sleep(2)
			# The wait hands back the located element, so no second lookup.
			email_input = WebDriverWait(driver, 5).until(
				EC.presence_of_element_located(
					(By.XPATH, '//input[@type="email"]')))
			password_input = driver.find_element(
				By.XPATH, '//input[@type="password"]')
			email_input.clear()
			email_input.send_keys(email)
			password_input.clear()
			password_input.send_keys(password)
			password_input.send_keys(Keys.RETURN)

			# Give the form submission time to set the session cookies.
			time.sleep(2)

			# Requests does not need all this info in the cookies, strip it out
			return {cookie['name']: cookie['value']
				for cookie in driver.get_cookies()}
		finally:
			# Always shut the browser down, even on failure, so no headless
			# Firefox process is left running.
			driver.quit()


	def get_pdp(cookies, product_id, timeout=30):
		"""Fetch the product detail page (PDP) JSON for one product.

		Args:
			cookies: Cookie name/value dict sent with the request.
			product_id: Id interpolated into the ``item_<id>`` container URL.
			timeout: Seconds to wait on the server before giving up.

		Returns:
			The decoded JSON body when the response status is 200,
			otherwise ``None`` (the failure is printed).
		"""
		print(f"Getting product {product_id}...")
		url = f'https://www.instacart.com/v3/containers/items/item_{product_id}'
		try:
			# Without a timeout a stalled connection would block forever.
			r = requests.get(url,
				cookies=cookies,
				headers=headers,
				timeout=timeout)
		except requests.RequestException as err:
			# Treat transport errors like a non-200 response: report, give up.
			print(f"Failed to get product {product_id} {err}")
			return None

		if r.status_code == 200:
			return r.json()

		print(f"Failed to get product {product_id} {r}")
		return None

	def process_pdp(pdp_data):
		"""Save one product's PDP data as ``products/product-<item_id>.json``.

		Args:
			pdp_data: Decoded PDP JSON. Must contain ``container.title`` and
				``container.tracking_params.item_id``.

		Raises:
			KeyError: If either of the required keys is missing.
		"""
		import os  # local import keeps this function self-contained

		# Rather than just saving the data, pull out the data needed and do
		# something with it
		product_title = pdp_data['container']['title']
		product_id = pdp_data['container']['tracking_params']['item_id']

		print(f"Processing: {product_id} - {product_title}")
		# Create the output folder on first use so open() cannot fail on it.
		os.makedirs('products', exist_ok=True)
		with open(f'products/product-{product_id}.json', 'w') as outfile:
			json.dump(pdp_data, outfile, sort_keys=True, indent=4)


	def search(cookies, term, timeout=30):
		"""Run a search against the kroger ``search_v3`` container endpoint.

		Args:
			cookies: Cookie name/value dict sent with the request.
			term: Raw search text; it is percent-encoded before being
				placed in the URL path.
			timeout: Seconds to wait on the server before giving up.

		Returns:
			The decoded JSON body when the response status is 200,
			otherwise ``None`` (the failure is printed).
		"""
		from urllib.parse import quote  # local import, stdlib only

		print(f"Searching: {term}...")
		# safe='' also encodes '/', which would otherwise split the path segment.
		encoded_term = quote(term, safe='')
		url = f'https://www.instacart.com/v3/containers/kroger/search_v3/{encoded_term}?source=web&per=50'
		try:
			# Without a timeout a stalled connection would block forever.
			r = requests.get(url,
				cookies=cookies,
				headers=headers,
				timeout=timeout)
		except requests.RequestException as err:
			# Treat transport errors like a non-200 response: report, give up.
			print(f"Failed to search for {term} {err}")
			return None

		if r.status_code == 200:
			return r.json()

		print(f"Failed to search for {term} {r}")
		return None

	def extract_product_ids(search_results):
		"""Return the product ids found in a search response.

		Only the first module whose ``data`` holds an ``items`` list is
		used. Each item id is split on ``_`` and the second piece is kept.

		Args:
			search_results: Decoded search JSON with ``container.modules``.

		Returns:
			A list of id strings; empty when no module carries items.
		"""
		modules = search_results['container']['modules']
		# Lazily scan the modules and stop at the first one that has items.
		items = next(
			(entry['data']['items'] for entry in modules
				if 'items' in entry['data']),
			[],
		)
		return [item['id'].split('_')[1] for item in items]

	# Just do this once per session
	login_cookies1 = login("<email>", "<password>")


	terms = ['cookies']
	for term in terms:
		# First search the term
		search_results = search(login_cookies1, term)
		if search_results is None:
			# search() returns None on failure and has already printed why;
			# there is nothing to extract for this term.
			continue
		# Get all the product ids that are in the search results
		product_ids = extract_product_ids(search_results)
		print(f"Found {len(product_ids)} products for the term {term}")
		for product_id in product_ids:
			# For each product id, get the product details
			pdp_data = get_pdp(login_cookies1, product_id)
			if pdp_data is None:
				# get_pdp() returns None on failure; skip instead of
				# crashing inside process_pdp().
				continue
			# Extract/save the product details
			process_pdp(pdp_data)