Skip to content

Instantly share code, notes, and snippets.

@hannesdatta
Last active September 14, 2023 13:42
Show Gist options
  • Save hannesdatta/61b5d117b97cd2fb995d0b6a11d16806 to your computer and use it in GitHub Desktop.
Save hannesdatta/61b5d117b97cd2fb995d0b6a11d16806 to your computer and use it in GitHub Desktop.
Web Scraping Mistakes: Handling Lists in Python: Code for https://youtu.be/RV9WOlqmL3E
# FINAL CODE
# Scrape one Coolblue "tweedekans" (second-chance) product page and print the
# product name, prices, review summary, current state and the status bullets.
import requests
from bs4 import BeautifulSoup

# Define the URL and user-agent header
url = 'https://www.coolblue.nl/tweedekans-product/2191236'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


def _find_required(parent, *args, **kwargs):
    """Return the first element matching the ``find`` arguments, or raise.

    ``find`` returns ``None`` when nothing matches; without this check the
    next attribute access fails with an ``AttributeError`` that does not say
    which selector was missing.

    Raises:
        LookupError: if no element matches the given ``find`` arguments.
    """
    element = parent.find(*args, **kwargs)
    if element is None:
        raise LookupError(
            f"No element matched find(args={args!r}, kwargs={kwargs!r}); "
            "the page layout may have changed or the request was blocked."
        )
    return element


def _text_of(parent, *args, **kwargs):
    """Return the whitespace-stripped text of the first matching element."""
    return _find_required(parent, *args, **kwargs).get_text().strip()


# Send an HTTP GET request to the URL with the specified headers.
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Stop early on 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
# Decode the body with the encoding detected from its content.
response.encoding = response.apparent_encoding

# Extract the HTML source code from the response
source_code = response.text

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(source_code, 'html.parser')

# Extract product information.
# A class_ string containing several class names matches the element's exact
# class attribute value.
product_name = _text_of(soup, class_='js-product-name')
former_price = _text_of(soup, class_='sales-price__former-price')
current_price = _text_of(soup, class_='sales-price__current js-sales-price-current')
reviews = _text_of(soup, class_='review-rating__reviews text--truncate')
current_state = _text_of(soup, 'li', class_='inline-list__item js-inline-list-item')

# Extract additional information from the status list
status = _find_required(soup, 'ul', attrs={'class': 'list list--bullet'})
status_items = status.find_all('li')

# find_all returns a list of elements, so get_text must be called on each
# element individually, not on the list itself.
status_info = [state.get_text().strip() for state in status_items]

# Print the extracted information
print("Status Information:")
for item in status_info:
    print(item)
print("Product Name:", product_name)
print("Former Price:", former_price)
print("Current Price:", current_price)
# NOTE(review): the fixed slice positions presumably separate the rating from
# the review count; they depend on the page's current text layout -- verify.
print("Reviews:", reviews[0:7].strip(), reviews[7:20].strip())
print("Current State:", current_state)
# ORIGINAL CODE RECEIVED BY STUDENT FOR DEBUGGING
# This is the unmodified starting point of the debugging exercise. The code is
# intentionally left as received; the comments below flag the places where it
# differs from the FINAL CODE listed earlier in this file.
import requests
from bs4 import BeautifulSoup
# Product page to scrape and a browser-like User-Agent header sent with the request.
url = 'https://www.coolblue.nl/tweedekans-product/2191236'
header= {'User-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# Download the page and decode it with the encoding detected from the body.
phone_request = requests.get(url, headers = header)
phone_request.encoding = phone_request.apparent_encoding
phone_source_code = phone_request.text
# ISSUE: no parser is named here (the FINAL CODE passes 'html.parser'), so
# BeautifulSoup chooses one itself and emits a warning about the guess.
soup = BeautifulSoup(phone_source_code)
# Take the stripped text of the first element matching each class selector.
product_name = soup.find(class_='js-product-name').get_text().strip()
former_price = soup.find(class_='sales-price__former-price').get_text().strip()
current_price = soup.find(class_='sales-price__current js-sales-price-current').get_text().strip()
reviews = soup.find(class_='review-rating__reviews text--truncate').get_text().strip()
current_state = soup.find('li', class_='inline-list__item js-inline-list-item').get_text().strip()
# Collect every <li> inside the bulleted status list.
status = soup.find('ul',attrs={'class':'list list--bullet'})
status_phone = status.find_all('li')
out = []
# BUG: get_text() is called on status_phone (the whole find_all() result)
# instead of on the loop variable `state`, so an AttributeError is raised as
# soon as the loop body runs. The FINAL CODE calls state.get_text() instead.
for state in status_phone: out.append(status_phone.get_text())
# ISSUE: a bare expression is only echoed in a notebook/REPL; when this runs
# as a script the collected list is never printed.
out
print(product_name)
print(former_price)
print(current_price)
# The review text is split at fixed character positions (0-7 and 7-20).
print(reviews[0:7].strip())
print(reviews[7:20].strip())
print(current_state)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment