import imaplib
import email
from email.header import decode_header
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display

# iCloud email credentials (replace with your own details)
IMAP_SERVER = "imap.mail.me.com"
IMAP_PORT = 993
USERNAME = "xxx"  # Replace with your iCloud email
PASSWORD = "xxx"  # Replace with your app-specific password

def fetch_property24_emails():
    # Define urls before the try block so the final return can never raise a NameError
    urls = []
    try:
        # Connect to the iCloud IMAP server
        mail = imaplib.IMAP4_SSL(IMAP_SERVER, IMAP_PORT)
        # Log in to your account
        mail.login(USERNAME, PASSWORD)
        # Select the mailbox you want to check (e.g., INBOX)
        mail.select("inbox")
        # Search for emails from Property24
        status, messages = mail.search(None, 'FROM', '"no-reply@property24.com"')
        if status != "OK":
            print("No emails found.")
            return []
        # Get the list of email IDs
        email_ids = messages[0].split()
        # Optionally, limit the number of emails to process
        email_ids = email_ids[-5:]  # For example, only fetch the last 5 emails
        # Process each email
        for email_id in email_ids:
            try:
                # Fetch the email by ID
                status, msg_data = mail.fetch(email_id, "(RFC822)")
                if status != "OK" or not msg_data:
                    print(f"Failed to fetch email with ID: {email_id}")
                    continue
                # Iterate through the fetched data to find the content
                for part in msg_data:
                    # Check if `part` is a tuple and contains the expected byte content
                    if isinstance(part, tuple) and len(part) > 1 and part[1]:
                        # Get the raw email content
                        try:
                            msg = email.message_from_bytes(part[1])
                        except Exception as e:
                            print(f"Error parsing email content for {email_id}: {e}")
                            continue
                        # Decode the email subject (guard against a missing Subject header)
                        raw_subject = msg["Subject"]
                        if raw_subject:
                            subject, encoding = decode_header(raw_subject)[0]
                            if isinstance(subject, bytes):
                                subject = subject.decode(encoding or "utf-8")
                            print("Processing email:", subject)
                        # If the email has multiple parts, iterate through them
                        for subpart in msg.walk():
                            content_type = subpart.get_content_type()
                            # Extract the body content of the email (text/html)
                            if content_type == "text/html":
                                body = subpart.get_payload(decode=True).decode(errors='ignore')
                                # Parse the HTML body using BeautifulSoup
                                soup = BeautifulSoup(body, 'html.parser')
                                # Using a lambda to find anchor tags whose text contains "View Property"
                                matched_tags = soup.find_all(lambda tag: tag.name == 'a' and "view property" in tag.text.lower())
                                for link in matched_tags:
                                    property_url = link.get('href')
                                    if property_url:  # Ensure the href is not None
                                        print(f"Found property URL: {property_url}")
                                        urls.append(property_url)
            except Exception as e:
                print(f"Error processing email {email_id}: {e}")
        # Close the connection
        mail.close()
        mail.logout()
    except Exception as e:
        print(f"Error fetching emails: {e}")
    return urls

# Run the function once on its own (the combined workflow below fetches again)
fetch_property24_emails()
# Function to extract property information from a URL
def extract_property_data(url):
    try:
        # Send a GET request to the webpage (timeout so a dead link cannot hang the run)
        response = requests.get(url, timeout=30)
        # Check if the response was successful
        if response.status_code != 200:
            print(f"Failed to retrieve {url}")
            return {"Error": "Failed to retrieve data"}
        # Parse the content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract the title of the property
        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag else 'Title not found'
        # Extract the price
        price_tag = soup.find('span', class_='p24_price')
        price = price_tag.text.strip().replace("R", "").replace(" ", "").replace(",", "") if price_tag else None
        # Extract the number of bedrooms
        bedrooms_tag = soup.find('li', title='Bedrooms')
        bedrooms = int(bedrooms_tag.text.strip()) if bedrooms_tag else 'Bedrooms not found'
        # Extract the number of bathrooms
        bathrooms_tag = soup.find('li', title='Bathrooms')
        bathrooms = float(bathrooms_tag.text.strip()) if bathrooms_tag else 'Bathrooms not found'
        # Extract property size in square meters
        size_tag = soup.find('li', title='Erf Size') or soup.find('li', class_='p24_size') or soup.find('span', class_='p24_size')
        # Find the specific <span> that contains the size value in square meters
        if size_tag:
            size_span = size_tag.find('span')
            size_text = size_span.text.strip().replace("m²", "").replace(" ", "") if size_span else 'Size not found'
        else:
            size_text = 'Size not found'
        # Parse price and size as numeric values (price may be None, which raises TypeError, not ValueError)
        try:
            price_value = int(re.sub(r'[^\d]', '', price))
        except (ValueError, TypeError):
            price_value = None
        try:
            size_value = int(size_text)
        except ValueError:
            size_value = None
        # Calculate price per square meter
        if price_value and size_value:
            price_per_sqm = price_value / size_value
        else:
            price_per_sqm = None
        return {
            "Link": f'=HYPERLINK("{url}", "View Listing")',
            "Title": title,
            "Price": price_value,
            "Bedrooms": bedrooms,
            "Bathrooms": bathrooms,
            "Size (sqm)": size_value,
            "Price per Square Meter": price_per_sqm
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {"Error": str(e)}
# Function to scrape properties and save to Excel
def scrape_properties_with_email_updates():
    # Get URLs from your iCloud emails
    email_urls = fetch_property24_emails()
    # Prompt the user to add any additional URLs manually
    manual_urls = input("Enter additional property URLs separated by ';': ")
    manual_urls_list = manual_urls.split(';') if manual_urls.strip() else []
    # Combine URLs from emails and manual input
    all_urls = email_urls + manual_urls_list
    # Check if any URLs are present
    if not all_urls:
        print("No URLs to process. Exiting.")
        return
    # Proceed with the scraping and saving workflow
    all_data = []
    for url in all_urls:
        url = url.strip()
        if url:
            print(f"Scraping {url}...")
            data = extract_property_data(url)
            all_data.append(data)
    # Convert the list of dicts to a pandas DataFrame
    df = pd.DataFrame(all_data)
    # Save the DataFrame to an Excel file with hyperlinks
    output_filename = 'property_data_with_email_updates.xlsx'
    with pd.ExcelWriter(output_filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Properties')
    # Display the dataframe in the notebook
    display(df)
    print(f"Scraping completed. Data saved to {output_filename}")

# Run the combined process
scrape_properties_with_email_updates()
Scheeperscreepers commented on Oct 1, 2024:
Hi everyone,
I’m working on a Python script that connects to my iCloud inbox to extract property listing URLs from emails sent by Property24, a real estate listing service. The ultimate goal is to calculate the price per square meter for each property. Here’s a breakdown of what the code does, the issue I'm facing, and what I’ve tried so far.
What the Code is Supposed to Do
Connect to iCloud Inbox: Using the IMAP protocol, my code connects to my iCloud email account and searches for emails from no-reply@property24.com.
Extract URLs from Emails: The code parses the HTML content of these emails to find anchor (<a>) tags with the text "View Property" and extracts their href links (a standalone sketch of this step follows this list).
Calculate Price per Square Meter: Once the URLs are extracted, the script scrapes each property page to gather data such as price and property size, then calculates the price per square meter.
Manual Input Option: In case the email extraction fails, the script also allows for manual input of URLs, which works correctly and returns the expected output.
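For context, here is the extraction step in isolation. This is a minimal sketch: the sample HTML below is a simplified stand-in written for illustration, not the actual Property24 markup, but it shows what the lambda matcher is expected to return when an anchor's text contains "View Property".

from bs4 import BeautifulSoup

# Simplified stand-in for the email body (not the real Property24 markup)
sample_html = """
<html><body>
  <a href="https://www.property24.com/for-sale/some-listing/12345">View Property</a>
  <a href="https://www.property24.com/unsubscribe">Unsubscribe</a>
</body></html>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
# Same matcher as in fetch_property24_emails(): anchors whose text contains "view property"
matched = soup.find_all(lambda tag: tag.name == 'a' and "view property" in tag.text.lower())
for link in matched:
    print(link.get('href'))  # Expected output: only the listing URL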
The Problem
The manual input of URLs works perfectly, but the email-scraping portion is not extracting any URLs from my iCloud inbox. When I run the script without manual input, it falls through to the "No URLs to process" message, even though the emails contain the correct "View Property" links.
I've inspected the HTML of these emails, and each property URL is embedded within an <a> tag that has "View Property" as its text. I've tried debugging by printing out all anchor tags but still can't seem to isolate the URLs correctly.
Debugging Steps Taken
Inspect HTML Structure: I reviewed the HTML elements in the email and confirmed that the property URLs are inside <a> tags with the "View Property" text (a dump-to-file helper for this is sketched after this list).
Checked for Formatting Issues: Ensured that all HTML is parsed properly and that "View Property" matches the case and spacing exactly as it appears in the email.
Added Print Statements: Used print statements to see if any anchor tags are being found, and inspected their text and href attributes.
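Building on step 1, one more option is to dump the decoded HTML body to disk and inspect exactly what BeautifulSoup sees, since the rendered email in a mail client can differ from the raw MIME part (bodies are often quoted-printable or base64 encoded, which get_payload(decode=True) undoes). A sketch of the lines to add inside the subpart loop of fetch_property24_emails():

# Inside the subpart loop, after decoding the body:
if content_type == "text/html":
    body = subpart.get_payload(decode=True).decode(errors='ignore')
    # Write the decoded HTML to disk so it can be opened in a browser or editor
    with open(f"email_{email_id.decode()}.html", "w", encoding="utf-8") as f:
        f.write(body)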
I’ve also attached a screenshot of the email’s HTML and a simplified sample to provide context. The code is shared via a GitHub Gist [link to gist here].
Code Overview
Here are the main functions and their purposes:
fetch_property24_emails():
Connects to iCloud using IMAP, searches for Property24 emails, and extracts URLs from the HTML content.
extract_property_data(url):
Scrapes details like price and size from each property URL and calculates the price per square meter (a single-URL test of this function is sketched after this list).
scrape_properties_with_email_updates():
Combines the functionalities of fetching URLs (either from emails or manually), extracting data, and saving results to an Excel file.
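To rule out the scraping half independently, extract_property_data() can be tested on a single listing URL, bypassing the email step entirely. A minimal harness (the URL is a hypothetical placeholder; substitute one copied out of an actual Property24 email):

# Hypothetical placeholder URL -- paste a real listing URL here
test_url = "https://www.property24.com/for-sale/example-suburb/example-listing/12345"
result = extract_property_data(test_url)
for key, value in result.items():
    print(f"{key}: {value}")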
Request for Help
Could anyone help me figure out why the code is not finding URLs from the emails? I'm looking for any pointers or suggestions for debugging this issue, or if anyone has experienced a similar problem with imaplib or BeautifulSoup.
I would really appreciate any insights! Thanks in advance!