Skip to content

Instantly share code, notes, and snippets.

@cosmoscalibur
Last active June 22, 2019 17:40
Show Gist options
  • Save cosmoscalibur/107d5e7b20eca3154a7369bd9b116604 to your computer and use it in GitHub Desktop.
Save cosmoscalibur/107d5e7b20eca3154a7369bd9b116604 to your computer and use it in GitHub Desktop.
web scraping example with requests and bs4
#! /usr/bin/env python3
# author: Edward Villegas-Pulgarin (@cosmoscalibur)
# Scrape a web site's product catalog and print it in Markdown format.
# Only products that can be added to the cart are extracted.
# June 28, 2017.
# Last test: June 22, 2019.
import requests
import bs4
# Catalog listing page that links to every product page.
SITE = "https://www.vexrobotics.com/vexedr/products/view-all"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 30
# Text printed in place of a field whose element is absent from the page.
MISSING = "N/A"


def fetch_soup(session, url):
    """Download *url* through *session* and return the parsed document.

    Args:
        session: a ``requests.Session`` (reused so connections are pooled).
        url: address of the page to download.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the "lxml" parser.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so that error pages are never parsed as catalog data.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, "lxml")


def node_text(node, separator=""):
    """Return the text inside *node*, or ``MISSING`` when *node* is None.

    ``get_text`` is used instead of ``.string`` because ``.string`` is None
    whenever the tag contains nested markup.
    """
    if node is None:
        return MISSING
    return node.get_text(separator)


def render_product(position, total, name, url, sku, price, description,
                   image_urls):
    """Build the Markdown report for one product.

    Args:
        position: 1-based index of the product in the catalog listing.
        total: number of entries in the catalog listing.
        name, url, sku, price, description: product fields as text.
        image_urls: iterable with the URL of every gallery image.

    Returns:
        A string that, once passed to ``print``, produces the progress line,
        the product fields, one Markdown image per URL and a blank separator.
    """
    chunks = [
        "Producto {} de {}\n".format(position, total),
        "# Nombre: {} \n__URL__: {} \n__SKU__: {} \n__Precio__: {} \n__Descripción__: {} \n".format(
            name, url, sku, price, description),
    ]
    chunks.extend("![]({})\n".format(image) for image in image_urls)
    chunks.append("\n\n")
    # Joining with "\n" reproduces the newline print() would add after each
    # chunk if they were printed one by one.
    return "\n".join(chunks)


def main():
    """Scrape the catalog and print every purchasable product as Markdown."""
    with requests.Session() as session:
        listing = fetch_soup(session, SITE)
        items = listing.find_all('li', {'class': 'item'})
        total = len(items)
        for position, item in enumerate(items, start=1):
            # Only products that can currently be bought are reported.
            if item.find('button', {'title': 'Add to Cart'}) is None:
                continue
            heading = item.find('h2', {'class': 'product-name'})
            link = heading.find('a') if heading is not None else None
            url = link.get('href') if link is not None else None
            if not url:
                # Without a link there is no product page to read.
                continue
            page = fetch_soup(session, url)
            gallery = page.find_all(
                'a', {'rel': 'gallery', 'class': 'thumb-link'})
            print(render_product(
                position,
                total,
                node_text(link),
                url,
                node_text(page.find('h3', {'class': 'sku'})),
                node_text(page.find('span', {'class': 'price'})),
                node_text(page.find('div', {'class': 'std'}), " "),
                [anchor.get('href') for anchor in gallery],
            ))
    # NOTE: this is the number of listing entries, including the ones that
    # were skipped above, as in the original script.
    print("Procesados {} productos".format(total))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment