Created
March 25, 2023 18:44
-
-
Save dbigman/b75f7cfe3efd49c2d2be68c8a99d8d52 to your computer and use it in GitHub Desktop.
Subastas scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3 | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import os | |
import dateparser | |
import logging # Setup code for error logging | |
# Configure root logging: timestamp - level - message, at DEBUG verbosity.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# logging.disable(logging.CRITICAL)  # uncomment to silence logging at all levels
logging.debug('Start of program')
def fetch_content(url):
    """GET *url* and return the raw response body as bytes.

    Raises:
        requests.HTTPError: for 4xx/5xx responses, instead of silently
            handing an error page to the parser.
        requests.Timeout: if the server does not respond within 30 s
            (the original call had no timeout and could hang forever).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
def parse_content(html_content):
    """Parse the subastas listing page into a list of auction dicts.

    Each dict holds the auction title, the three parsed dates (as
    datetime.date, or None when missing/unparseable), and the page-level
    agency / location / status strings.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    titles = soup.find_all('span', class_='title')

    def _page_field(css_class, prefix):
        # Text of the first <div> with css_class, minus its label prefix.
        # find() returns None if the markup changes; guard before .text.
        element = soup.find('div', class_=css_class)
        return element.text.replace(prefix, '').strip() if element else None

    agencia = _page_field('agency', 'Agencia: ')
    localizacion = _page_field('localization', 'Localización: ')
    # NOTE(review): the original reused class_='agency' here (copy-paste),
    # so stripping 'Estatus: ' could never match. 'status' is the presumed
    # class name -- confirm against the live page markup.
    estatus = _page_field('status', 'Estatus: ')

    def _date_column(css_class, prefix):
        # Collect every <div> of one date column, strip the label prefix,
        # and parse to datetime.date. Unparseable strings are skipped,
        # matching the original behavior.
        raw = [el.text.strip().replace(prefix, '').strip()
               for el in soup.find_all('div', class_=css_class)]
        parsed = (dateparser.parse(s) for s in raw)
        return [d.date() for d in parsed if d]

    fechaApertura = _date_column('fechaApertura', 'Fecha apertura:')
    fechaPresubasta = _date_column('fechaPreSubasta', 'Fecha pre-subasta:')
    fechaPliegos = _date_column('fechaPliegos', 'Fecha pliegos:')

    # One item per title; the date lists may be shorter than the title
    # list, so missing positions become None.
    items = []
    for i, title in enumerate(titles):
        items.append({
            'title': title.text.strip(),
            'fechaPliegos': fechaPliegos[i] if i < len(fechaPliegos) else None,
            'fechaApertura': fechaApertura[i] if i < len(fechaApertura) else None,
            'fechaPresubasta': fechaPresubasta[i] if i < len(fechaPresubasta) else None,
            # BUG FIX: the original indexed these strings ([i]), which
            # yielded one *character* per item. They are page-level values,
            # shared by every item.
            'Localizacion': localizacion,
            'Estatus': estatus,
            'Agencia': agencia,
        })
    return items
def save_to_excel(data, file_name):
    """Write *data* (a list of row dicts) to *file_name* as an Excel sheet,
    without the DataFrame index column."""
    pd.DataFrame(data).to_excel(file_name, index=False)
def main():
    """Scrape the subastas page and save the results to an Excel file in C:\\."""
    url = 'https://subastas.pr.gov/Pages/subastas.aspx'  # Target URL
    content = fetch_content(url)
    items = parse_content(content)

    excel_file_name = 'Subastas_scrape.xlsx'
    # BUG FIX: the original used r'C:\' which is a SyntaxError -- a raw
    # string literal cannot end in an odd number of backslashes.
    target_directory = 'C:\\'
    os.chdir(target_directory)
    # Lazy %-args: formatting only happens if the record is actually emitted.
    logging.debug('Current working directory: (%s)', os.getcwd())

    save_to_excel(items, excel_file_name)
    logging.debug('File saved: (%s)', excel_file_name)


if __name__ == '__main__':
    main()
    logging.debug('End of program')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment