Skip to content

Instantly share code, notes, and snippets.

@pyoneerC
Last active July 19, 2024 15:02
Show Gist options
  • Save pyoneerC/0030be5deca5e8f324053a2ba1dbc87b to your computer and use it in GitHub Desktop.
Save pyoneerC/0030be5deca5e8f324053a2ba1dbc87b to your computer and use it in GitHub Desktop.
Web scraper that collects the prices of an item on MercadoLibre Argentina, plots a histogram of those prices with their USD conversion, and shows an image of the first item found. Scans N result pages (default 5, minimum 1, maximum 15).
# Script to get the prices of an item in MercadoLibre Argentina and plot a histogram of the prices
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from PIL import Image
import numpy as np
import requests
import datetime
import re
import io
# Endpoint queried by get_exchange_rate(); the 'venta' field of its JSON response is used.
API_URL = "https://dolarapi.com/v1/dolares/blue"
# Default number of result pages to scan; main() overwrites it with the user's
# choice, clamped to the range 1-15.
NUMBER_OF_PAGES = 5
def get_exchange_rate():
    """Fetch the current exchange rate from the API.

    Returns:
        The value of the ``venta`` field in the JSON document served at
        ``API_URL``, or ``None`` when the request fails, the body is not
        valid JSON, or the field is missing.
    """
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(API_URL, timeout=10)
        response.raise_for_status()  # Raise an exception if the request failed
        return response.json()['venta']
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: JSON has an
        # unexpected shape (no 'venta' key, or not an object at all).
        print(f"Error: unexpected exchange rate response: {e!r}")
        return None
# Exchange rate fetched once when the module is loaded (None if the request failed).
# plot_prices() and print_statistics() divide ARS prices by this value to show USD amounts.
venta_dolar = get_exchange_rate()
def get_prices(item):
    """Fetch the prices of the given item from MercadoLibre.

    Scans ``NUMBER_OF_PAGES`` listing pages, stepping the start offset by 50
    per page, and collects every price fraction and product image URL found.

    Args:
        item: Search term already formatted for the listing URL
            (e.g. ``'mate-imperial'``).

    Returns:
        A tuple ``(prices_list, url, image_urls)`` where ``prices_list`` is a
        list of integer prices, ``url`` is the last page requested and
        ``image_urls`` is a list of image ``src`` values. If any page request
        fails, ``(None, None, None)`` is returned instead.
    """
    prices_list = []
    image_urls = []
    url = None  # stays None if no page is requested at all
    for i in range(NUMBER_OF_PAGES):
        start_item = i * 50 + 1
        url = f'https://listado.mercadolibre.com.ar/{item}_Desde_{start_item}_NoIndex_True'
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Use the response attached to the exception: when the request
            # itself fails (DNS, timeout, ...) there is no response object and
            # a local `response` name would be unbound or left over from the
            # previous page.
            failed = getattr(e, 'response', None)
            if failed is not None and failed.status_code == 404:
                print(f'Error: {failed.status_code} - {failed.reason}')
                print('Please scan less pages!')
            else:
                print(f"Error: {e}")
            return None, None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        for price in soup.find_all('span', class_='andes-money-amount__fraction'):
            digits = re.sub(r'\D', '', price.text)
            if digits:  # skip spans without any digit instead of crashing on int('')
                prices_list.append(int(digits))

        images = soup.find_all('img', class_='poly-component__picture poly-component__picture--square')
        for image in images:
            src = image.get('src')
            if src:  # an <img> without src would otherwise raise KeyError
                image_urls.append(src)
    return prices_list, url, image_urls
def format_x(value, tick_number):
    """Render a tick value as a whole number with thousands separators.

    The signature matches what ``matplotlib.ticker.FuncFormatter`` calls:
    ``tick_number`` is accepted but not used.
    """
    whole = int(value)
    return f"{whole:,}"
def _price_label(name, amount, rate):
    """Build a marker caption such as ``Median: 12,000 ARS (10 USD)``.

    The USD part is left out when ``rate`` is missing (``None`` or zero).
    """
    label = f'{name}: {int(amount):,} ARS'
    if rate:
        label += f' ({amount / rate:,.0f} USD)'
    return label


def plot_prices(prices_list, item, url, image_urls):
    """Plot a histogram of the prices.

    Draws the histogram together with vertical markers for the median,
    average, maximum, minimum, 25th percentile and average +/- one standard
    deviation, then shows the figure. If ``image_urls`` is not empty, the first
    image is downloaded and placed near the top of the plot at the maximum
    price.

    Args:
        prices_list: Sequence of prices in ARS. When empty, a message is
            printed and nothing is plotted.
        item: Search term; shown in the title and used to build the URL
            printed there.
        url: Accepted for interface compatibility; the title URL is built
            from ``item`` instead.
        image_urls: Sequence of image URLs; only the first one is used.

    Reads the module-level ``venta_dolar`` rate for the USD conversions; when
    it is unavailable the USD amounts are omitted from the captions.
    """
    # max()/min() raise on an empty sequence, so bail out before any plotting.
    if len(prices_list) == 0:
        print('No prices to plot.')
        return

    rate = venta_dolar

    plt.figure(figsize=(10, 5))
    plt.hist(prices_list, bins=20, color='lightblue', edgecolor='black')
    ax = plt.gca()

    # Plain numbers in the x-axis
    plt.ticklabel_format(style='plain', axis='x')
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_x))

    y_position = ax.get_ylim()[1] * 0.05

    # Statistics of the selected item, each computed once.
    std_dev = np.std(prices_list)
    avg_price = np.mean(prices_list)
    median_price = np.median(prices_list)
    max_price = max(prices_list)
    min_price = min(prices_list)
    percentile_25 = np.percentile(prices_list, 25)

    # Adjust the x position offset of the captions according to the median price
    if median_price > 70000:
        x_pos_offset = 10000
    elif median_price > 50000:
        x_pos_offset = 3500
    elif median_price > 20000:
        x_pos_offset = 2000
    elif median_price > 10000:
        x_pos_offset = 1000
    else:
        x_pos_offset = 500

    plt.xlabel('Price in ARS')
    plt.ylabel('Frequency')

    current_date = datetime.date.today().strftime('%d/%m/%Y')
    base_url = 'https://listado.mercadolibre.com.ar/' + item
    plt.title(
        f"Histogram of {item.replace('-', ' ').upper()} prices in MercadoLibre Argentina "
        f"({current_date})\n"
        f"Number of items indexed: {len(prices_list)} ({NUMBER_OF_PAGES} pages)\n"
        f"URL: {base_url}\n"
    )

    def draw_marker(x, name, text_color=None, **line_style):
        """Draw a vertical line at x with a rotated caption next to it."""
        line = plt.axvline(x, **line_style)
        text_kwargs = {'rotation': 90}
        if text_color is not None:
            text_kwargs['color'] = text_color
        plt.text(x + x_pos_offset, y_position, _price_label(name, x, rate), **text_kwargs)
        return line

    median_line = draw_marker(median_price, 'Median', text_color='red',
                              color='red', linestyle='solid', linewidth=1)
    avg_line = draw_marker(avg_price, 'Avg', text_color='purple',
                           color='purple', linestyle='solid', linewidth=1)
    max_line = draw_marker(max_price, 'Max',
                           color='blue', linestyle='dashed', linewidth=1)
    min_line = draw_marker(min_price, 'Min',
                           color='blue', linestyle='dashed', linewidth=1)
    upper_std_line = plt.axvline(avg_price + std_dev, color='black', linestyle='dotted', linewidth=3)
    percentile_line = draw_marker(percentile_25, '25th percentile', text_color='green',
                                  color='green', linestyle='dashed', linewidth=2)
    plt.axvline(avg_price - std_dev, color='black', linestyle='dotted', linewidth=3)

    # Pass the handles explicitly so each label is tied to its own line
    # rather than to the order in which artists happen to be drawn.
    plt.legend(
        [median_line, avg_line, max_line, min_line, upper_std_line, percentile_line],
        ['Median', 'Avg', 'Max', 'Min', 'Std Dev', '25th percentile'],
        loc='upper right',
    )

    if len(image_urls) > 0:
        try:
            image_response = requests.get(image_urls[0], timeout=10)
            image_response.raise_for_status()
            img = Image.open(io.BytesIO(image_response.content))
        except (requests.exceptions.RequestException, OSError) as e:
            # The thumbnail is decorative: a failed download or an unreadable
            # image must not prevent the histogram from being shown.
            print(f"Error: could not load the item image: {e}")
        else:
            ylim = ax.get_ylim()
            ytop = ylim[1] - 0.1 * (ylim[1] - ylim[0])
            imagebox = OffsetImage(img, zoom=0.2)
            ab = AnnotationBbox(imagebox, (max_price, ytop), frameon=False)
            ax.add_artist(ab)

    plt.grid(True)
    plt.tight_layout()
    plt.show()
def print_statistics(prices_list, item, url):
    """Print the statistics of the prices.

    Prints the number of items, the exchange rate, the given URL and the
    median, average, maximum and minimum price in ARS, followed by the same
    four figures converted to USD.

    Args:
        prices_list: Sequence of prices in ARS. When empty, a message is
            printed and no statistics are computed.
        item: Search term; dashes are shown as spaces and the text is
            upper-cased in the heading.
        url: URL printed alongside the statistics.

    Reads the module-level ``venta_dolar`` rate; when it is unavailable the
    USD section is replaced by a short notice.
    """
    # max()/min() raise on an empty sequence, so there is nothing to report.
    if len(prices_list) == 0:
        print('No prices to summarise.')
        return

    rate = venta_dolar

    mean_price = np.mean(prices_list)
    median_price = np.median(prices_list)
    max_price = max(prices_list)
    min_price = min(prices_list)

    print(f'Statistics of \'{item.replace("-", " ").upper()}\' prices in MercadoLibre Argentina:')
    print('Number of items: ', len(prices_list))
    if rate:
        print('Dollar price: ', f'{int(rate):,} ARS')
    else:
        print('Dollar price: ', 'unavailable')
    print('url: ', url)
    print('-' * 50)
    print('Median price: ', f'{int(median_price):,} ARS')
    print('Avg price: ', f'{int(mean_price):,} ARS')
    print('Max price: ', f'{int(max_price):,} ARS')
    print('Min price: ', f'{int(min_price):,} ARS')
    print('-' * 50)
    print('Prices in USD:')
    if not rate:
        # Without a rate the divisions below would fail.
        print('Exchange rate unavailable; USD prices cannot be computed.')
        return
    print('Median price: ', f'{median_price / rate:.2f} USD')
    print('Avg price: ', f'{mean_price / rate:.2f} USD')
    print('Max price: ', f'{max_price / rate:.2f} USD')
    print('Min price: ', f'{min_price / rate:.2f} USD')
def main():
    """Main function to execute the script.

    Asks for the item and the number of pages to scan, fetches the exchange
    rate and the prices, then plots the histogram and prints the statistics.
    Returns early, after printing a message, when the exchange rate or the
    prices cannot be obtained.
    """
    global NUMBER_OF_PAGES, venta_dolar

    # Validate the item before announcing the scan, so the progress message
    # never shows a search term that is about to be rejected.
    item = input('Enter the item to scan: ').strip()
    while len(item) < 3:
        print('Please enter an item with at least 3 characters!')
        item = input('Enter the item to search: ').strip()

    raw_pages = input('Enter the number of pages to scan (default 5, min 1, max 15): ').strip()
    try:
        requested_pages = int(raw_pages) if raw_pages else NUMBER_OF_PAGES
    except ValueError:
        # A non-numeric answer falls back to the default instead of crashing.
        print(f'Invalid number of pages; using the default ({NUMBER_OF_PAGES}).')
        requested_pages = NUMBER_OF_PAGES
    NUMBER_OF_PAGES = max(1, min(15, requested_pages))

    print(f'Scanning the first {NUMBER_OF_PAGES} \'{item}\' pages in MercadoLibre Argentina...')
    print('Please wait...')

    item = item.replace(' ', '-').lower()

    exchange_rate = get_exchange_rate()
    if exchange_rate is None:
        print("Failed to fetch exchange rate. Exiting.")
        return
    # plot_prices() and print_statistics() read the module-level rate, so
    # publish the value that was just validated.
    venta_dolar = exchange_rate

    prices_list, url, image_urls = get_prices(item)
    if prices_list is None or url is None:
        print("Failed to fetch prices. Exiting.")
        return
    if not prices_list:
        # An empty result would crash the statistics (max/min of nothing).
        print("No prices found for this item. Exiting.")
        return

    plot_prices(prices_list, item, url, image_urls)
    print_statistics(prices_list, item, url)
# Run the interactive scraper only when the file is executed as a script.
if __name__ == "__main__":
    main()
@pyoneerC
Copy link
Author

chocolate

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment