Created
July 25, 2025 10:55
-
-
Save mtelvers/8383fb563e171778bfaf412f3119d50c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Docker Build Log Analyzer | |
| Analyzes Docker build logs to extract build times and create visualizations. | |
| """ | |
| import os | |
| import re | |
| import glob | |
| from datetime import datetime, timedelta | |
| import matplotlib.pyplot as plt | |
| import matplotlib.dates as mdates | |
| from collections import defaultdict | |
| import pandas as pd | |
class DockerLogAnalyzer:
    """
    Collect Docker build durations from timestamped logs and report on them.

    Logs are looked up as ``<log_directory>/20??-??-??/*docker-build*.log``.
    Each log yields one record whose duration is the difference between its
    first and last timestamped lines; a build counts as successful when its
    last timestamped line contains "Job succeeded".
    """

    # Prefix of a timestamped log line, e.g. '2024-09-24 14:45.02: ...'.
    _TIMESTAMP_RE = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}\.\d{2}):')

    # Accepted layouts: the log's 'minutes.seconds' form first, then the
    # conventional colon-separated form as a fallback.
    _TIMESTAMP_FORMATS = ("%Y-%m-%d %H:%M.%S", "%Y-%m-%d %H:%M:%S")

    # Plot formats used when the caller does not specify any.
    _DEFAULT_FORMATS = ('png', 'pdf')

    def __init__(self, log_directory, min_duration_seconds=30, exclude_failed=True):
        """
        Initialize the analyzer with the root directory containing date folders.

        Args:
            log_directory (str): Path to directory containing date folders (e.g., 2024-09-24/)
            min_duration_seconds (int): Minimum build duration to include (filters out cached builds)
            exclude_failed (bool): Whether to exclude failed builds from analysis
        """
        self.log_directory = log_directory
        self.min_duration_seconds = min_duration_seconds
        self.exclude_failed = exclude_failed
        self.build_data = []      # one dict per parsed log (see extract_build_time)
        self.filtered_data = []   # subset of build_data that passed filter_data()

    def parse_timestamp(self, timestamp_str):
        """
        Parse a timestamp in log format, e.g. '2024-09-24 14:45.02'.

        Args:
            timestamp_str (str): Timestamp string from log

        Returns:
            datetime: Parsed datetime object, or None if no known layout matches
        """
        for layout in self._TIMESTAMP_FORMATS:
            try:
                return datetime.strptime(timestamp_str, layout)
            except ValueError:
                continue
        return None

    def _find_timestamp(self, lines):
        """Return (datetime, stripped_line) for the first timestamped line, else (None, None)."""
        for line in lines:
            match = self._TIMESTAMP_RE.match(line)
            if match:
                return self.parse_timestamp(match.group(1)), line.strip()
        return None, None

    def extract_build_time(self, log_file_path):
        """
        Extract start and end times from a single log file.

        Args:
            log_file_path (str): Path to the log file

        Returns:
            dict: start_time, end_time, duration_seconds, duration_minutes,
            file_path, success, start_line and end_line; or None when the file
            cannot be read or has no usable timestamps.
        """
        try:
            # Build output may contain arbitrary bytes; replacing undecodable
            # ones keeps the timestamps usable instead of losing the whole log.
            with open(log_file_path, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.readlines()
        except OSError as e:
            print(f"Error processing {log_file_path}: {e}")
            return None

        # Start = first timestamped line, end = last timestamped line.
        start_time, start_line = self._find_timestamp(lines)
        end_time, end_line = self._find_timestamp(reversed(lines))
        if start_time is None or end_time is None:
            return None

        duration = (end_time - start_time).total_seconds()
        return {
            'start_time': start_time,
            'end_time': end_time,
            'duration_seconds': duration,
            'duration_minutes': duration / 60,
            'file_path': log_file_path,
            # Success is judged solely on the final timestamped line.
            'success': "Job succeeded" in end_line,
            'start_line': start_line,
            'end_line': end_line
        }

    def analyze_logs(self):
        """
        Analyze all log files in the directory structure.

        Results replace any data from a previous call, so running the analysis
        twice does not duplicate builds.
        """
        self.build_data = []
        self.filtered_data = []

        # Find all date directories
        date_pattern = os.path.join(self.log_directory, "20??-??-??")
        date_dirs = glob.glob(date_pattern)
        if not date_dirs:
            print(f"No date directories found in {self.log_directory}")
            return
        print(f"Found {len(date_dirs)} date directories")

        for date_dir in sorted(date_dirs):
            # Find all docker build log files in this date directory;
            # sorted because glob order is filesystem-dependent.
            log_pattern = os.path.join(date_dir, "*docker-build*.log")
            log_files = sorted(glob.glob(log_pattern))
            print(f"Processing {len(log_files)} log files in {os.path.basename(date_dir)}")
            for log_file in log_files:
                result = self.extract_build_time(log_file)
                if result:
                    self.build_data.append(result)

        print(f"Successfully analyzed {len(self.build_data)} builds")
        # Apply filters
        self.filter_data()

    def filter_data(self):
        """
        Filter the build data based on success status and minimum duration.

        Populates ``self.filtered_data`` and prints how many builds each
        filter removed.
        """
        original_count = len(self.build_data)
        remaining = list(self.build_data)

        failed_builds = 0
        if self.exclude_failed:
            succeeded = [build for build in remaining if build['success']]
            failed_builds = len(remaining) - len(succeeded)
            remaining = succeeded

        # Filter out very quick builds (likely cached)
        long_enough = [build for build in remaining
                       if build['duration_seconds'] >= self.min_duration_seconds]
        quick_builds = len(remaining) - len(long_enough)

        self.filtered_data = long_enough
        filtered_count = len(self.filtered_data)

        print("\nFiltering results:")
        print(f" Original builds: {original_count}")
        if self.exclude_failed:
            print(f" Failed builds excluded: {failed_builds}")
        print(f" Quick builds excluded (< {self.min_duration_seconds}s): {quick_builds}")
        print(f" Remaining builds for analysis: {filtered_count}")
        if filtered_count == 0:
            print("Warning: No builds remaining after filtering!")
        elif filtered_count < original_count * 0.1:
            print("Warning: Less than 10% of builds remain after filtering. Consider adjusting filter criteria.")

    def _save_figure(self, fig, output_dir, stem, formats, announce=False):
        """Write *fig* as ``<output_dir>/<stem>.<fmt>`` for each format, then close it."""
        for fmt in formats:
            output_file = os.path.join(output_dir, f'{stem}.{fmt}')
            fig.savefig(output_file, format=fmt, dpi=300, bbox_inches='tight')
            if announce:
                print(f"Saved plot: {output_file}")
        plt.close(fig)  # Close the figure to free memory

    def create_visualizations(self, output_dir='./output', formats=None):
        """
        Create various visualizations of the build data and save them to files.

        Args:
            output_dir (str): Directory to save the plots
            formats (list): List of formats to save ('png', 'pdf', 'svg', etc.);
                defaults to png and pdf when omitted
        """
        if not self.filtered_data:
            print("No filtered data to visualize")
            return
        if formats is None:
            formats = list(self._DEFAULT_FORMATS)

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Use Agg backend for headless environments (Docker)
        import matplotlib
        matplotlib.use('Agg')

        # Convert to DataFrame for easier manipulation
        df = pd.DataFrame(self.filtered_data)
        df = df.sort_values('start_time')

        # Create figure with subplots - simplified since we only have successful builds
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Docker Build Analysis (Successful Builds Only)', fontsize=16, fontweight='bold')

        # 1. Build times over time (scatter plot)
        ax1.scatter(df['start_time'], df['duration_minutes'],
                    alpha=0.6, color='green', label='Build duration', s=30)
        ax1.set_title('Build Duration Over Time')
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Duration (minutes)')
        ax1.grid(True, alpha=0.3)
        # Format x-axis dates. The tick interval is derived from the number of
        # days covered (aiming for roughly ten ticks), not the number of builds,
        # so a busy short period still gets tick labels.
        span_days = (df['start_time'].max() - df['start_time'].min()).days + 1
        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax1.xaxis.set_major_locator(mdates.DayLocator(interval=max(1, span_days // 10)))
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)

        # 2. Build duration histogram
        ax2.hist(df['duration_minutes'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        ax2.set_title('Build Duration Distribution')
        ax2.set_xlabel('Duration (minutes)')
        ax2.set_ylabel('Frequency')
        ax2.grid(True, alpha=0.3)
        # Add statistics
        mean_duration = df['duration_minutes'].mean()
        median_duration = df['duration_minutes'].median()
        ax2.axvline(mean_duration, color='red', linestyle='--', label=f'Mean: {mean_duration:.1f}m')
        ax2.axvline(median_duration, color='orange', linestyle='--', label=f'Median: {median_duration:.1f}m')
        ax2.legend()

        # 3. Daily build statistics
        df['date'] = df['start_time'].dt.date
        daily_stats = df.groupby('date').agg({
            'duration_minutes': ['count', 'mean', 'min', 'max', 'std']
        }).round(2)
        # Flatten the two-level column index into 'duration_minutes_<stat>'.
        daily_stats.columns = ['_'.join(col).strip() for col in daily_stats.columns]
        daily_stats = daily_stats.reset_index()
        ax3.plot(daily_stats['date'], daily_stats['duration_minutes_mean'],
                 marker='o', label='Average duration', linewidth=2)
        ax3.fill_between(daily_stats['date'],
                         daily_stats['duration_minutes_min'],
                         daily_stats['duration_minutes_max'],
                         alpha=0.3, label='Min-Max range')
        ax3.set_title('Daily Build Performance')
        ax3.set_xlabel('Date')
        ax3.set_ylabel('Duration (minutes)')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        plt.setp(ax3.xaxis.get_majorticklabels(), rotation=45)

        # 4. Builds per day
        ax4.bar(daily_stats['date'], daily_stats['duration_minutes_count'],
                alpha=0.7, color='lightblue', edgecolor='darkblue')
        ax4.set_title('Number of Builds Per Day')
        ax4.set_xlabel('Date')
        ax4.set_ylabel('Number of Builds')
        ax4.grid(True, alpha=0.3)
        plt.setp(ax4.xaxis.get_majorticklabels(), rotation=45)

        fig.tight_layout()
        # Save in multiple formats
        self._save_figure(fig, output_dir, 'docker_build_analysis', formats, announce=True)

        # Create individual plots for better readability
        self._create_individual_plots(df, daily_stats, output_dir, formats)
        # Print summary statistics
        self.print_summary(df)

    def _create_individual_plots(self, df, daily_stats, output_dir, formats):
        """
        Create individual plots for each metric.

        Args:
            df (DataFrame): Filtered builds with 'start_time' and 'duration_minutes'
            daily_stats (DataFrame): Per-day aggregates with 'date' and
                'duration_minutes_<stat>' columns
            output_dir (str): Existing directory to save the plots
            formats (list): Formats to save each plot in
        """
        # 1. Individual plot: Build times over time
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.scatter(df['start_time'], df['duration_minutes'],
                   alpha=0.6, color='green', label='Build duration', s=40)

        # Add trend line. scipy is optional, so the import itself must sit
        # inside the try for the fallback to work.
        try:
            from scipy import stats
        except ImportError:
            stats = None  # Skip trend line if scipy not available
        # A regression needs at least two distinct x values; linregress raises
        # ValueError otherwise.
        if stats is not None and df['start_time'].nunique() > 1:
            x_numeric = mdates.date2num(df['start_time'])
            slope, intercept, r_value, _p_value, _std_err = stats.linregress(
                x_numeric, df['duration_minutes'])
            trend_line = slope * x_numeric + intercept
            ax.plot(df['start_time'], trend_line, 'r--', alpha=0.8,
                    label=f'Trend (R²={r_value**2:.3f})')

        ax.set_title('Build Duration Over Time', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Duration (minutes)')
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        fig.tight_layout()
        self._save_figure(fig, output_dir, 'build_times_timeline', formats)

        # 2. Individual plot: Daily performance trends with error bars
        fig, ax = plt.subplots(figsize=(12, 6))
        # Add error bars using standard deviation
        ax.errorbar(daily_stats['date'], daily_stats['duration_minutes_mean'],
                    yerr=daily_stats['duration_minutes_std'],
                    marker='o', label='Average ± Std Dev', linewidth=2, markersize=6,
                    capsize=5, capthick=2)
        ax.fill_between(daily_stats['date'],
                        daily_stats['duration_minutes_min'],
                        daily_stats['duration_minutes_max'],
                        alpha=0.2, label='Min-Max range')
        ax.set_title('Daily Build Performance Trends', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Duration (minutes)')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        fig.tight_layout()
        self._save_figure(fig, output_dir, 'daily_performance_trends', formats)

        print(f"Individual plots also saved in {output_dir}")

    def print_summary(self, df):
        """
        Print summary statistics for filtered data.

        Args:
            df (DataFrame): Filtered builds with 'start_time',
                'duration_minutes' and 'file_path' columns
        """
        print("\n" + "="*50)
        print("BUILD ANALYSIS SUMMARY (FILTERED DATA)")
        print("="*50)

        # Show filtering info
        original_total = len(self.build_data)
        filtered_total = len(df)
        removed = original_total - filtered_total
        print(f"Original builds found: {original_total}")
        print(f"Builds after filtering: {filtered_total}")
        if original_total > 0:
            print(f"Filtered out: {removed} ({removed/original_total*100:.1f}%)")
        print(f"Filter criteria: min_duration >= {self.min_duration_seconds}s, exclude_failed = {self.exclude_failed}")
        print()

        if df.empty:
            print("No builds remaining after filtering!")
            return

        durations = df['duration_minutes']
        print("Duration Statistics (minutes):")
        print(f" Mean: {durations.mean():.2f}")
        print(f" Median: {durations.median():.2f}")
        print(f" Min: {durations.min():.2f}")
        print(f" Max: {durations.max():.2f}")
        print(f" Std Dev: {durations.std():.2f}")
        print()

        first_build = df['start_time'].min()
        last_build = df['start_time'].max()
        print("Date Range:")
        print(f" First build: {first_build}")
        print(f" Last build: {last_build}")
        print()

        # Build frequency; +1 so a single-day range counts as one day.
        total_days = (last_build.date() - first_build.date()).days + 1
        print(f"Analysis period: {total_days} days")
        print(f"Average builds per day: {len(df) / total_days:.1f}")
        print()

        columns = ['start_time', 'duration_minutes', 'file_path']
        # Top 5 longest builds
        longest_builds = df.nlargest(5, 'duration_minutes')[columns]
        print("Top 5 longest builds:")
        for _, build in longest_builds.iterrows():
            print(f" ✓ {build['start_time'].strftime('%Y-%m-%d %H:%M')} - {build['duration_minutes']:.2f}m - {os.path.basename(build['file_path'])}")

        # Top 5 shortest builds (but still above threshold)
        shortest_builds = df.nsmallest(5, 'duration_minutes')[columns]
        print("\nTop 5 shortest builds (above threshold):")
        for _, build in shortest_builds.iterrows():
            print(f" ✓ {build['start_time'].strftime('%Y-%m-%d %H:%M')} - {build['duration_minutes']:.2f}m - {os.path.basename(build['file_path'])}")

    def export_data(self, output_file='build_analysis.csv'):
        """
        Export the analyzed data to CSV.

        The filtered builds go to *output_file*; the unfiltered builds go to a
        sibling file with ``_raw`` inserted before the extension.

        Args:
            output_file (str): Full path for the output CSV file
        """
        if not self.filtered_data:
            print("No filtered data to export")
            return

        # Ensure output directory exists. A bare filename has no directory
        # part, and os.makedirs('') would raise.
        target_dir = os.path.dirname(output_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        df = pd.DataFrame(self.filtered_data)
        df.to_csv(output_file, index=False)
        print(f"Filtered data exported to {output_file}")

        # Also export raw data for reference
        if self.build_data:
            # splitext only touches the final extension, so directory names are
            # left alone and the raw path always differs from output_file.
            stem, extension = os.path.splitext(output_file)
            raw_output_file = f"{stem}_raw{extension}"
            raw_df = pd.DataFrame(self.build_data)
            raw_df.to_csv(raw_output_file, index=False)
            print(f"Raw data exported to {raw_output_file}")
def main(argv=None):
    """
    Main function to run the analysis.

    Parses command-line options (or prompts for them with --interactive),
    analyzes the logs, and writes plots and, unless disabled, CSV exports.

    Args:
        argv (list): Command-line arguments without the program name;
            None (the default) reads them from sys.argv.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Docker build logs')
    parser.add_argument('--log-dir', '-d', default='/data',
                        help='Path to log directory containing date folders (default: /data)')
    parser.add_argument('--output-dir', '-o', default='/data/output',
                        help='Output directory for plots and CSV (default: /data/output)')
    parser.add_argument('--formats', '-f', nargs='+', default=['png', 'pdf'],
                        help='Output formats for plots (default: png pdf)')
    parser.add_argument('--min-duration', '-m', type=int, default=30,
                        help='Minimum build duration in seconds to include (filters out cached builds, default: 30)')
    parser.add_argument('--include-failed', action='store_true', default=False,
                        help='Include failed builds in analysis (default: exclude failed builds)')
    parser.add_argument('--export-csv', dest='export_csv', action='store_true', default=True,
                        help='Export data to CSV (default: enabled)')
    # --export-csv is on by default, so a store_true flag alone can never turn
    # it off; this companion flag provides the off switch.
    parser.add_argument('--no-export-csv', dest='export_csv', action='store_false',
                        help='Do not export data to CSV')
    parser.add_argument('--interactive', '-i', action='store_true',
                        help='Run in interactive mode (ask for paths)')
    args = parser.parse_args(argv)

    # Interactive mode for non-Docker usage
    if args.interactive:
        # A blank answer falls back to the corresponding command-line value.
        log_directory = input("Enter the path to your log directory (containing date folders): ").strip() or args.log_dir
        output_directory = input(f"Enter output directory (default: {args.output_dir}): ").strip() or args.output_dir
        formats = input("Enter plot formats separated by spaces (default: png pdf): ").split() or args.formats
        duration_answer = input(f"Minimum build duration in seconds (default: {args.min_duration}): ").strip()
        min_duration = args.min_duration
        if duration_answer:
            try:
                min_duration = int(duration_answer)
            except ValueError:
                print(f"Invalid duration {duration_answer!r}; using {args.min_duration}s")
        include_failed = input("Include failed builds? (y/N): ").lower().startswith('y')
    else:
        log_directory = args.log_dir
        output_directory = args.output_dir
        formats = args.formats
        min_duration = args.min_duration
        include_failed = args.include_failed

    if not os.path.exists(log_directory):
        print(f"Directory not found: {log_directory}")
        return

    # Create analyzer and run analysis
    analyzer = DockerLogAnalyzer(
        log_directory,
        min_duration_seconds=min_duration,
        exclude_failed=not include_failed
    )
    print(f"Analyzing Docker build logs in: {log_directory}")
    print(f"Filters: min_duration={min_duration}s, exclude_failed={not include_failed}")
    analyzer.analyze_logs()

    if not analyzer.filtered_data:
        print("No valid build data found after filtering. Please check your log directory structure and file patterns.")
        print(f"Expected structure: {log_directory}/YYYY-MM-DD/*docker-build*.log")
        print("Consider adjusting filter criteria (--min-duration, --include-failed)")
        return

    print(f"Creating visualizations in: {output_directory}")
    analyzer.create_visualizations(output_dir=output_directory, formats=formats)
    if args.export_csv:
        csv_path = os.path.join(output_directory, 'build_analysis.csv')
        analyzer.export_data(csv_path)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run with