-
-
Save askedrelic/0f605ab5c0923541a4c1df2bc1dcd385 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| McDonald's Receipt Analyzer | |
| Parses HTML email receipts from McDonald's to extract order data and statistics. | |
| """ | |
import email
import html
import json
import quopri
import re
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
@dataclass
class OrderItem:
    """One product line extracted from a receipt's products table."""

    # Product name as printed on the receipt row.
    name: str
    # Number of units taken from the row's "Qty" field.
    quantity: int
    # Dollar amount printed on the row.
    price: float
    # True when the item was listed at $0.00 after a promotion header row.
    is_promotion: bool = False
@dataclass
class Receipt:
    """All data extracted from a single receipt e-mail."""

    # Order date and time exactly as matched in the HTML ("" when not found).
    date: str
    time: str
    # Restaurant address line ("" when not found).
    location: str
    # Value following "Check-In#" ("" when not found).
    check_in_number: str
    # Product lines found in the products table.
    items: List[OrderItem]
    # Monetary amounts in dollars (0.0 when the label is not found).
    subtotal: float
    tax: float
    total: float
    total_savings: float
    # Text following "Card Type:" ("" when not found).
    payment_method: str
    # Path of the source file this receipt was parsed from.
    file_path: str
class McDonaldsReceiptParser:
    """Parse McDonald's HTML e-mail receipts and aggregate statistics.

    Successfully parsed receipts accumulate in ``self.receipts``; the
    ``generate_statistics`` and ``print_summary_report`` methods summarise
    whatever has been collected so far.
    """

    # A literal "=" is always written as "=3D" in quoted-printable text, so
    # this marker tells us the HTML has NOT been transfer-decoded yet.
    _QP_MARKER = "=3D"
    # Format of the dates captured by ``extract_date``.
    _DATE_FORMAT = "%m/%d/%Y"

    def __init__(self):
        self.receipts: "List[Receipt]" = []

    # ------------------------------------------------------------------
    # File / message handling
    # ------------------------------------------------------------------
    def parse_eml_file(self, file_path: Path) -> "Optional[Receipt]":
        """Parse a single .eml file and extract receipt data.

        Args:
            file_path: Location of the .eml file.

        Returns:
            The parsed receipt, or ``None`` when the file cannot be read,
            has no ``text/html`` part, or the HTML cannot be parsed.
        """
        try:
            # Binary mode lets the email package apply each part's declared
            # charset instead of assuming the whole file is UTF-8.
            with open(file_path, 'rb') as f:
                msg = email.message_from_binary_file(f)

            # walk() yields the message itself for non-multipart mail, so a
            # single loop covers both the multipart and the simple case.
            html_content = None
            for part in msg.walk():
                if part.get_content_type() == "text/html":
                    html_content = self._decode_part(part)
                    break

            if not html_content:
                return None
            return self.parse_html_receipt(html_content, str(file_path))
        except Exception as e:
            # Per-file boundary of a batch run: report and keep going.
            print(f"Error parsing {file_path}: {e}")
            return None

    @staticmethod
    def _decode_part(part) -> str:
        """Return the transfer-decoded text of *part* using its own charset."""
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # Unknown charset label in the message; fall back to UTF-8.
            return payload.decode("utf-8", errors="replace")

    @classmethod
    def _decode_quoted_printable(cls, text: str) -> str:
        """Decode *text* only if it still looks quoted-printable encoded.

        Already-decoded HTML is returned untouched so that sequences such as
        ``width=600`` are not mistaken for ``=XX`` escapes.
        """
        if cls._QP_MARKER not in text:
            return text
        # Decode at byte level so multi-byte UTF-8 escapes (e.g. "=C3=A9")
        # are reassembled into a single character.
        raw = quopri.decodestring(text.encode("utf-8"))
        return raw.decode("utf-8", errors="replace")

    def parse_html_receipt(self, html_content: str, file_path: str) -> "Optional[Receipt]":
        """Parse HTML content to extract receipt data.

        Args:
            html_content: Receipt HTML, either decoded or still
                quoted-printable encoded.
            file_path: Source path recorded on the resulting receipt.

        Returns:
            The populated receipt, or ``None`` if extraction raised.
        """
        try:
            html_content = self._decode_quoted_printable(html_content)
            return Receipt(
                date=self.extract_date(html_content),
                time=self.extract_time(html_content),
                location=self.extract_location(html_content),
                check_in_number=self.extract_check_in_number(html_content),
                items=self.extract_items(html_content),
                subtotal=self.extract_subtotal(html_content),
                tax=self.extract_tax(html_content),
                total=self.extract_total(html_content),
                total_savings=self.extract_total_savings(html_content),
                payment_method=self.extract_payment_method(html_content),
                file_path=file_path,
            )
        except Exception as e:
            print(f"Error parsing HTML content from {file_path}: {e}")
            return None

    # ------------------------------------------------------------------
    # Field extractors
    # ------------------------------------------------------------------
    def extract_date(self, html_content: str) -> str:
        """Return the first ``NN/NN/NNNN`` date in the HTML, or ``""``."""
        date_match = re.search(r'(\d{2}/\d{2}/\d{4})', html_content)
        return date_match.group(1) if date_match else ""

    def extract_time(self, html_content: str) -> str:
        """Return the first time like ``09:04:30 AM`` in the HTML, or ``""``."""
        time_match = re.search(r'(\d{2}:\d{2}:\d{2}\s*[AP]M)', html_content)
        return time_match.group(1) if time_match else ""

    def extract_location(self, html_content: str, street_suffix: str = "BLVD") -> str:
        """Extract the restaurant address line from the HTML.

        Args:
            html_content: Receipt HTML.
            street_suffix: Literal street-type token the address must
                contain. Defaults to ``"BLVD"``.

        Returns:
            The stripped text run that starts with a number and contains
            *street_suffix*, or ``""`` when there is none.
        """
        pattern = r'(\d+\s+[^<]+' + re.escape(street_suffix) + r'[^<]*)'
        location_match = re.search(pattern, html_content)
        return location_match.group(1).strip() if location_match else ""

    def extract_check_in_number(self, html_content: str) -> str:
        """Return the value after ``Check-In#`` (e.g. ``RF88-8227355``)."""
        checkin_match = re.search(r'Check-In#\s*([A-Z0-9-]+)', html_content)
        return checkin_match.group(1) if checkin_match else ""

    @staticmethod
    def _clean_item_name(raw_name: str) -> str:
        """Normalise an item name taken from a table cell.

        HTML entities are unescaped, the ®/™ marks are dropped, and runs of
        whitespace are collapsed afterwards so no double spaces remain.
        """
        name = html.unescape(raw_name)
        name = re.sub(r'®|™', '', name)
        return re.sub(r'\s+', ' ', name).strip()

    def extract_items(self, html_content: str) -> "List[OrderItem]":
        """Extract ordered items from the ``id="products"`` table.

        Returns:
            One ``OrderItem`` per priced row, in document order. The list
            is empty when the products table is missing.
        """
        items = []

        products_table_match = re.search(
            r'<table[^>]*id="products"[^>]*>(.*?)</table>', html_content, re.DOTALL
        )
        if not products_table_match:
            return items
        table_content = products_table_match.group(1)

        item_rows = re.findall(
            r'<tr[^>]*style="height:15pt"[^>]*>(.*?)</tr>', table_content, re.DOTALL
        )

        # Text of the most recent promotion header row; it stays in effect
        # for every following row because nothing in the table closes it.
        current_promotion = None
        for row in item_rows:
            promotion_match = re.search(
                r'<td[^>]*>([^<]*(?:BOGO|Buy[^<]*get[^<]*FREE|Buy One[^<]*Add One)[^<]*)</td>',
                row,
            )
            if promotion_match:
                current_promotion = promotion_match.group(1).strip()
                continue

            item_match = re.search(
                r'<td[^>]*font-weight:bold[^>]*>([^<]+)</td>.*?Qty\s*(\d+).*?\$(\d+\.\d{2})',
                row,
                re.DOTALL,
            )
            if not item_match:
                continue

            price = float(item_match.group(3))
            items.append(OrderItem(
                name=self._clean_item_name(item_match.group(1)),
                quantity=int(item_match.group(2)),
                price=price,
                # Only free items under a promotion header count as promos.
                is_promotion=current_promotion is not None and price == 0.0,
            ))
        return items

    def extract_subtotal(self, html_content: str) -> float:
        """Return the dollar amount following ``Subtotal``, or ``0.0``."""
        subtotal_match = re.search(r'Subtotal.*?\$(\d+\.\d{2})', html_content, re.DOTALL)
        return float(subtotal_match.group(1)) if subtotal_match else 0.0

    def extract_tax(self, html_content: str) -> float:
        """Return the dollar amount following ``Tax Amount``, or ``0.0``."""
        tax_match = re.search(r'Tax Amount.*?\$(\d+\.\d{2})', html_content, re.DOTALL)
        return float(tax_match.group(1)) if tax_match else 0.0

    def extract_total(self, html_content: str) -> float:
        """Return the dollar amount following the ``Total`` label, or ``0.0``.

        The label must not be part of a longer word and must not be
        followed by ``Savings``; otherwise a "Total Savings" line that
        precedes the real total would be picked up instead.
        """
        total_match = re.search(
            r'(?<![A-Za-z])Total(?!(?:\s|&nbsp;)*Savings).*?\$(\d+\.\d{2})',
            html_content,
            re.DOTALL,
        )
        return float(total_match.group(1)) if total_match else 0.0

    def extract_total_savings(self, html_content: str) -> float:
        """Return the dollar amount following ``Total Savings``, or ``0.0``."""
        savings_match = re.search(r'Total Savings.*?\$(\d+\.\d{2})', html_content, re.DOTALL)
        return float(savings_match.group(1)) if savings_match else 0.0

    def extract_payment_method(self, html_content: str) -> str:
        """Return the text following ``Card Type:``, or ``""``."""
        payment_match = re.search(r'Card Type:\s*([^<]+)', html_content)
        return payment_match.group(1).strip() if payment_match else ""

    # ------------------------------------------------------------------
    # Batch processing and reporting
    # ------------------------------------------------------------------
    def process_all_receipts(self, receipt_files: "List[Path]"):
        """Parse every file in *receipt_files*, keeping the successes.

        Progress is printed per file; parsed receipts are appended to
        ``self.receipts``.
        """
        print(f"Processing {len(receipt_files)} receipt files...")
        for file_path in receipt_files:
            receipt = self.parse_eml_file(file_path)
            if receipt:
                self.receipts.append(receipt)
                print(f"✓ Processed: {file_path.name} - {receipt.date} - ${receipt.total}")
            else:
                print(f"✗ Failed to process: {file_path.name}")
        print(f"\nSuccessfully processed {len(self.receipts)} out of {len(receipt_files)} receipts")

    @classmethod
    def _parse_date(cls, date_text):
        """Return *date_text* as a ``datetime``, or ``None`` if unparseable."""
        try:
            return datetime.strptime(date_text, cls._DATE_FORMAT)
        except (TypeError, ValueError):
            return None

    def generate_statistics(self) -> Dict:
        """Generate summary statistics from all collected receipts.

        Returns:
            ``{}`` when there are no receipts. Otherwise a dict with the
            keys ``total_receipts``, ``total_spent``, ``total_savings``,
            ``average_order``, ``date_range`` (``earliest``/``latest`` date
            strings, ``""`` when no receipt has a parseable date),
            ``most_popular_items`` and ``highest_revenue_items`` (top-10
            ``(name, value)`` pairs), ``monthly_spending`` (``YYYY-MM`` ->
            dollars, sorted by month) and ``payment_methods`` (method ->
            number of receipts).
        """
        if not self.receipts:
            return {}

        # Compare real dates: MM/DD/YYYY strings do not sort chronologically
        # across years. Each receipt keeps its original string for display.
        dated = []
        for receipt in self.receipts:
            parsed = self._parse_date(receipt.date)
            if parsed is not None:
                dated.append((parsed, receipt.date))
        earliest = min(dated)[1] if dated else ""
        latest = max(dated)[1] if dated else ""

        total_spent = sum(r.total for r in self.receipts)
        stats = {
            'total_receipts': len(self.receipts),
            'total_spent': total_spent,
            'total_savings': sum(r.total_savings for r in self.receipts),
            'average_order': total_spent / len(self.receipts),
            'date_range': {
                'earliest': earliest,
                'latest': latest,
            },
        }

        # Units ordered and money spent per item name.
        item_counts = {}
        item_revenue = {}
        for receipt in self.receipts:
            for item in receipt.items:
                item_counts[item.name] = item_counts.get(item.name, 0) + item.quantity
                item_revenue[item.name] = (
                    item_revenue.get(item.name, 0.0) + item.price * item.quantity
                )

        popular_items = sorted(item_counts.items(), key=lambda x: x[1], reverse=True)
        stats['most_popular_items'] = popular_items[:10]

        revenue_items = sorted(item_revenue.items(), key=lambda x: x[1], reverse=True)
        stats['highest_revenue_items'] = revenue_items[:10]

        # Receipts without a parseable date are left out of the monthly view.
        monthly_spending = {}
        for receipt in self.receipts:
            parsed = self._parse_date(receipt.date)
            if parsed is None:
                continue
            month_key = parsed.strftime('%Y-%m')
            monthly_spending[month_key] = monthly_spending.get(month_key, 0.0) + receipt.total
        stats['monthly_spending'] = dict(sorted(monthly_spending.items()))

        payment_methods = {}
        for receipt in self.receipts:
            method = receipt.payment_method or 'Unknown'
            payment_methods[method] = payment_methods.get(method, 0) + 1
        stats['payment_methods'] = payment_methods

        return stats

    def print_summary_report(self):
        """Print a formatted summary of ``generate_statistics`` to stdout."""
        if not self.receipts:
            print("No receipts to analyze.")
            return

        stats = self.generate_statistics()
        print("\n" + "="*60)
        print("McDONALD'S RECEIPT ANALYSIS SUMMARY")
        print("="*60)

        print("\n📊 OVERVIEW:")
        print(f" Total Receipts: {stats['total_receipts']}")
        print(f" Date Range: {stats['date_range']['earliest']} to {stats['date_range']['latest']}")
        print(f" Total Spent: ${stats['total_spent']:.2f}")
        print(f" Total Savings: ${stats['total_savings']:.2f}")
        print(f" Average Order: ${stats['average_order']:.2f}")

        print("\n🍔 MOST POPULAR ITEMS:")
        for i, (item, count) in enumerate(stats['most_popular_items'][:5], 1):
            print(f" {i}. {item}: {count} orders")

        print("\n💰 HIGHEST REVENUE ITEMS:")
        for i, (item, revenue) in enumerate(stats['highest_revenue_items'][:5], 1):
            print(f" {i}. {item}: ${revenue:.2f}")

        print("\n📅 MONTHLY SPENDING:")
        for month, total in stats['monthly_spending'].items():
            print(f" {month}: ${total:.2f}")

        print("\n💳 PAYMENT METHODS:")
        for method, count in stats['payment_methods'].items():
            print(f" {method}: {count} times")

        print("\n" + "="*60)
def main(argv: Optional[List[str]] = None) -> None:
    """Run the receipt analyzer and write the statistics to JSON.

    Args:
        argv: Paths to analyze. Each entry is either an .eml file or a
            directory whose ``*.eml`` files are all included. Defaults to
            the command-line arguments; with none given, no receipts are
            processed (the behavior of the previously hard-coded empty
            list).
    """
    if argv is None:
        argv = sys.argv[1:]

    # Collect the receipt files named on the command line.
    receipt_files: List[Path] = []
    for raw_path in argv:
        path = Path(raw_path)
        if path.is_dir():
            # Sorted so runs over the same directory are reproducible.
            receipt_files.extend(sorted(path.glob("*.eml")))
        else:
            receipt_files.append(path)

    parser = McDonaldsReceiptParser()
    parser.process_all_receipts(receipt_files)
    parser.print_summary_report()

    # Save detailed data to JSON (an empty object when nothing was parsed).
    stats = parser.generate_statistics()
    output_file = Path("mcdonalds_analysis.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    print(f"\n📁 Detailed analysis saved to: {output_file}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment