Subastas scraper
@dbigman, created March 25, 2023
#! python3
import logging
import os

import dateparser
import pandas as pd  # writing .xlsx files also requires openpyxl
import requests
from bs4 import BeautifulSoup

# Basic logging setup: timestamp, level, and message on every line.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# logging.disable(logging.CRITICAL)  # uncomment to silence logging at all levels
logging.debug('Start of program')
def fetch_content(url):
    response = requests.get(url, timeout=30)  # avoid hanging forever on a slow server
    response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    return response.content
def parse_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Titles of the listed auctions.
    titles = soup.find_all('span', class_='title')

    # Name of the agency.
    agencia_element = soup.find('div', class_='agency')
    agencia = agencia_element.text.replace('Agencia: ', '').strip()

    # Location.
    localizacion_element = soup.find('div', class_='localization')
    localizacion = localizacion_element.text.replace('Localización: ', '').strip()

    # Status. class_='status' is assumed here; looking up class_='agency' again
    # would just return the agency element a second time.
    estatus_element = soup.find('div', class_='status')
    estatus = estatus_element.text.replace('Estatus: ', '').strip()
    # Dates.
    fechaApertura_elements = soup.find_all('div', class_='fechaApertura')
    fechaPresubasta_elements = soup.find_all('div', class_='fechaPreSubasta')
    fechaPliegos_elements = soup.find_all('div', class_='fechaPliegos')

    # Extract the date strings, dropping the 'Fecha apertura:' and other label prefixes.
    fechaApertura_strings = [element.text.strip().replace('Fecha apertura:', '').strip() for element in fechaApertura_elements]
    fechaPresubasta_strings = [element.text.strip().replace('Fecha pre-subasta:', '').strip() for element in fechaPresubasta_elements]
    fechaPliegos_strings = [element.text.strip().replace('Fecha pliegos:', '').strip() for element in fechaPliegos_elements]

    # Convert the date strings to datetime.date objects; dateparser handles the
    # Spanish-language dates, and unparseable strings are silently dropped.
    fechaApertura = [parsed.date() for parsed in (dateparser.parse(s) for s in fechaApertura_strings) if parsed]
    fechaPresubasta = [parsed.date() for parsed in (dateparser.parse(s) for s in fechaPresubasta_strings) if parsed]
    fechaPliegos = [parsed.date() for parsed in (dateparser.parse(s) for s in fechaPliegos_strings) if parsed]
    # Collect the scraped items. Agency, location, and status are single
    # page-level strings, so the same value is repeated on every row
    # (indexing into them with [i] would yield individual characters).
    items = []
    for i in range(len(titles)):
        item = {
            'title': titles[i].text.strip(),
            'fechaPliegos': fechaPliegos[i] if i < len(fechaPliegos) else None,
            'fechaApertura': fechaApertura[i] if i < len(fechaApertura) else None,
            'fechaPresubasta': fechaPresubasta[i] if i < len(fechaPresubasta) else None,
            'Localizacion': localizacion,
            'Estatus': estatus,
            'Agencia': agencia
        }
        items.append(item)
    return items
def save_to_excel(data, file_name):
    df = pd.DataFrame(data)
    df.to_excel(file_name, index=False)
def main():
    url = 'https://subastas.pr.gov/Pages/subastas.aspx'  # target URL
    content = fetch_content(url)
    items = parse_content(content)
    excel_file_name = 'Subastas_scrape.xlsx'
    target_directory = 'C:\\'  # a raw string cannot end in a backslash, so escape it instead
    os.chdir(target_directory)
    current_directory = os.getcwd()
    logging.debug('Current working directory: %s', current_directory)
    save_to_excel(items, excel_file_name)
    logging.debug('File saved: %s', excel_file_name)
if __name__ == '__main__':
    main()
    logging.debug('End of program')
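
A quick sanity check for the date-parsing step, as a minimal standalone sketch (the sample date string below is hypothetical, in the same Spanish format the site uses):

import dateparser

# dateparser recognizes Spanish month names without an explicit format string.
parsed = dateparser.parse('25 de marzo de 2023')
print(parsed.date())  # -> 2023-03-25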