Skip to content

Instantly share code, notes, and snippets.

@mtelvers
Created July 25, 2025 10:55
Show Gist options
  • Select an option

  • Save mtelvers/8383fb563e171778bfaf412f3119d50c to your computer and use it in GitHub Desktop.

Select an option

Save mtelvers/8383fb563e171778bfaf412f3119d50c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Docker Build Log Analyzer
Analyzes Docker build logs to extract build times and create visualizations.
"""
import os
import re
import glob
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import defaultdict
import pandas as pd
class DockerLogAnalyzer:
    """
    Measure Docker build durations from timestamped build logs.

    Logs are looked up as ``<log_directory>/20??-??-??/*docker-build*.log``.
    A build's duration is the gap between the first and the last timestamped
    line of its log, and the build counts as successful when that last
    timestamped line contains "Job succeeded".
    """

    # A line is "timestamped" when it starts with e.g. "2024-09-24 14:45.02:".
    # Compiled once here instead of on every line of every log.
    _TIMESTAMP_LINE = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}\.\d{2}):')

    # Formats accepted by parse_timestamp, tried in order.
    _TIMESTAMP_FORMATS = ("%Y-%m-%d %H:%M.%S", "%Y-%m-%d %H:%M:%S")

    def __init__(self, log_directory, min_duration_seconds=30, exclude_failed=True):
        """
        Initialize the analyzer with the root directory containing date folders.
        Args:
            log_directory (str): Path to directory containing date folders (e.g., 2024-09-24/)
            min_duration_seconds (int): Minimum build duration to include (filters out cached builds)
            exclude_failed (bool): Whether to exclude failed builds from analysis
        """
        self.log_directory = log_directory
        self.min_duration_seconds = min_duration_seconds
        self.exclude_failed = exclude_failed
        self.build_data = []      # one dict per parsed log (see extract_build_time)
        self.filtered_data = []   # subset of build_data kept by filter_data

    def parse_timestamp(self, timestamp_str):
        """
        Parse a timestamp in log format ('2024-09-24 14:45.02') or in the
        standard format ('2024-09-24 14:45:02').
        Args:
            timestamp_str (str): Timestamp string from log
        Returns:
            datetime: Parsed datetime object, or None if neither format matches
        """
        for fmt in self._TIMESTAMP_FORMATS:
            try:
                return datetime.strptime(timestamp_str, fmt)
            except ValueError:
                continue
        return None

    def _first_timestamp(self, lines):
        """
        Return (datetime, stripped_line) for the first line in ``lines`` that
        carries a parseable timestamp, or None when there is none.
        """
        for line in lines:
            match = self._TIMESTAMP_LINE.match(line)
            if not match:
                continue
            parsed = self.parse_timestamp(match.group(1))
            # A line can look like a timestamp yet hold an impossible date
            # (e.g. month 13); skip it and keep scanning.
            if parsed is not None:
                return parsed, line.strip()
        return None

    def extract_build_time(self, log_file_path):
        """
        Extract start and end times from a single log file.
        Args:
            log_file_path (str): Path to the log file
        Returns:
            dict: Contains start_time, end_time, duration_seconds,
                duration_minutes, file_path, success, start_line and end_line;
                None if the file cannot be read or has no timestamped line
        """
        try:
            # errors='replace': build output may contain bytes that are not
            # valid UTF-8; that must not cause the whole log to be dropped.
            with open(log_file_path, 'r', encoding='utf-8', errors='replace') as f:
                lines = f.readlines()
        except OSError as e:
            print(f"Error processing {log_file_path}: {e}")
            return None

        first = self._first_timestamp(lines)
        last = self._first_timestamp(reversed(lines))
        if first is None or last is None:
            return None

        start_time, start_line = first
        end_time, end_line = last
        duration = (end_time - start_time).total_seconds()
        return {
            'start_time': start_time,
            'end_time': end_time,
            'duration_seconds': duration,
            'duration_minutes': duration / 60,
            'file_path': log_file_path,
            # Success is judged on the last timestamped line only.
            'success': "Job succeeded" in end_line,
            'start_line': start_line,
            'end_line': end_line
        }

    def analyze_logs(self):
        """
        Analyze all log files in the directory structure, then apply the
        filters. Results land in self.build_data and self.filtered_data.
        """
        date_pattern = os.path.join(self.log_directory, "20??-??-??")
        date_dirs = glob.glob(date_pattern)
        if not date_dirs:
            print(f"No date directories found in {self.log_directory}")
            return
        print(f"Found {len(date_dirs)} date directories")

        # Start from a clean slate so a second call does not duplicate builds.
        self.build_data = []
        for date_dir in sorted(date_dirs):
            log_pattern = os.path.join(date_dir, "*docker-build*.log")
            log_files = sorted(glob.glob(log_pattern))
            print(f"Processing {len(log_files)} log files in {os.path.basename(date_dir)}")
            for log_file in log_files:
                result = self.extract_build_time(log_file)
                if result:
                    self.build_data.append(result)
        print(f"Successfully analyzed {len(self.build_data)} builds")
        self.filter_data()

    def filter_data(self):
        """
        Filter the build data based on success status and minimum duration,
        storing the survivors in self.filtered_data and printing a report.
        """
        original_count = len(self.build_data)
        remaining = list(self.build_data)

        failed_builds = 0
        if self.exclude_failed:
            succeeded = [build for build in remaining if build['success']]
            failed_builds = len(remaining) - len(succeeded)
            remaining = succeeded

        # Very quick builds are most likely cache hits, not real builds.
        kept = [build for build in remaining
                if build['duration_seconds'] >= self.min_duration_seconds]
        quick_builds = len(remaining) - len(kept)

        self.filtered_data = kept
        filtered_count = len(kept)

        print(f"\nFiltering results:")
        print(f" Original builds: {original_count}")
        if self.exclude_failed:
            print(f" Failed builds excluded: {failed_builds}")
        print(f" Quick builds excluded (< {self.min_duration_seconds}s): {quick_builds}")
        print(f" Remaining builds for analysis: {filtered_count}")
        if filtered_count == 0:
            print("Warning: No builds remaining after filtering!")
        elif filtered_count < original_count * 0.1:
            print("Warning: Less than 10% of builds remain after filtering. Consider adjusting filter criteria.")

    def create_visualizations(self, output_dir='./output', formats=('png', 'pdf')):
        """
        Create various visualizations of the build data and save them to files.
        Args:
            output_dir (str): Directory to save the plots
            formats (list): List of formats to save ('png', 'pdf', 'svg', etc.);
                a single format string is accepted as well
        """
        if not self.filtered_data:
            print("No filtered data to visualize")
            return
        if isinstance(formats, str):
            # Otherwise 'png' would be iterated as 'p', 'n', 'g'.
            formats = [formats]

        os.makedirs(output_dir, exist_ok=True)

        # Use Agg backend for headless environments (Docker)
        import matplotlib
        matplotlib.use('Agg')

        df = pd.DataFrame(self.filtered_data).sort_values('start_time')

        scope = 'Successful Builds Only' if self.exclude_failed else 'All Builds'
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f'Docker Build Analysis ({scope})', fontsize=16, fontweight='bold')

        # 1. Build times over time (scatter plot)
        ax1.scatter(df['start_time'], df['duration_minutes'],
                    alpha=0.6, color='green', label='Build duration', s=30)
        ax1.set_title('Build Duration Over Time')
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Duration (minutes)')
        ax1.grid(True, alpha=0.3)
        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        # Space the ticks by the number of days covered (aiming for roughly
        # ten labels), not by the number of builds.
        span_days = (df['start_time'].max() - df['start_time'].min()).days + 1
        ax1.xaxis.set_major_locator(mdates.DayLocator(interval=max(1, span_days // 10)))
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)

        # 2. Build duration histogram with mean/median markers
        ax2.hist(df['duration_minutes'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        ax2.set_title('Build Duration Distribution')
        ax2.set_xlabel('Duration (minutes)')
        ax2.set_ylabel('Frequency')
        ax2.grid(True, alpha=0.3)
        mean_duration = df['duration_minutes'].mean()
        median_duration = df['duration_minutes'].median()
        ax2.axvline(mean_duration, color='red', linestyle='--', label=f'Mean: {mean_duration:.1f}m')
        ax2.axvline(median_duration, color='orange', linestyle='--', label=f'Median: {median_duration:.1f}m')
        ax2.legend()

        # 3. Daily build statistics
        df['date'] = df['start_time'].dt.date
        daily_stats = df.groupby('date').agg({
            'duration_minutes': ['count', 'mean', 'min', 'max', 'std']
        }).round(2)
        # Flatten the (column, statistic) MultiIndex to e.g. 'duration_minutes_mean'.
        daily_stats.columns = ['_'.join(col).strip() for col in daily_stats.columns]
        daily_stats = daily_stats.reset_index()
        ax3.plot(daily_stats['date'], daily_stats['duration_minutes_mean'],
                 marker='o', label='Average duration', linewidth=2)
        ax3.fill_between(daily_stats['date'],
                         daily_stats['duration_minutes_min'],
                         daily_stats['duration_minutes_max'],
                         alpha=0.3, label='Min-Max range')
        ax3.set_title('Daily Build Performance')
        ax3.set_xlabel('Date')
        ax3.set_ylabel('Duration (minutes)')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        plt.setp(ax3.xaxis.get_majorticklabels(), rotation=45)

        # 4. Builds per day
        ax4.bar(daily_stats['date'], daily_stats['duration_minutes_count'],
                alpha=0.7, color='lightblue', edgecolor='darkblue')
        ax4.set_title('Number of Builds Per Day')
        ax4.set_xlabel('Date')
        ax4.set_ylabel('Number of Builds')
        ax4.grid(True, alpha=0.3)
        plt.setp(ax4.xaxis.get_majorticklabels(), rotation=45)

        fig.tight_layout()
        for fmt in formats:
            output_file = os.path.join(output_dir, f'docker_build_analysis.{fmt}')
            fig.savefig(output_file, format=fmt, dpi=300, bbox_inches='tight')
            print(f"Saved plot: {output_file}")
        plt.close(fig)  # Close the figure to free memory

        self._create_individual_plots(df, daily_stats, output_dir, formats)
        self.print_summary(df)

    def _create_individual_plots(self, df, daily_stats, output_dir, formats):
        """
        Create individual plots for each metric: a build-duration timeline
        (with a linear trend line when scipy is available) and a daily
        performance chart with standard-deviation error bars.
        """
        # 1. Individual plot: Build times over time
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.scatter(df['start_time'], df['duration_minutes'],
                   alpha=0.6, color='green', label='Build duration', s=40)

        # The trend line is optional: the import must sit inside the try for
        # the "scipy not available" fallback to actually work.
        try:
            from scipy import stats
        except ImportError:
            stats = None
        # linregress raises ValueError when all x values are identical, so a
        # trend needs at least two distinct start times.
        if stats is not None and df['start_time'].nunique() >= 2:
            x_numeric = mdates.date2num(df['start_time'])
            slope, intercept, r_value, _p_value, _std_err = stats.linregress(
                x_numeric, df['duration_minutes'])
            trend_line = slope * x_numeric + intercept
            ax.plot(df['start_time'], trend_line, 'r--', alpha=0.8,
                    label=f'Trend (R²={r_value**2:.3f})')

        ax.set_title('Build Duration Over Time', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Duration (minutes)')
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        fig.tight_layout()
        for fmt in formats:
            output_file = os.path.join(output_dir, f'build_times_timeline.{fmt}')
            fig.savefig(output_file, format=fmt, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # 2. Individual plot: Daily performance trends with error bars
        fig, ax = plt.subplots(figsize=(12, 6))
        # The standard deviation is NaN for a day with a single build; draw
        # that day with a zero-length error bar instead.
        ax.errorbar(daily_stats['date'], daily_stats['duration_minutes_mean'],
                    yerr=daily_stats['duration_minutes_std'].fillna(0),
                    marker='o', label='Average ± Std Dev', linewidth=2, markersize=6,
                    capsize=5, capthick=2)
        ax.fill_between(daily_stats['date'],
                        daily_stats['duration_minutes_min'],
                        daily_stats['duration_minutes_max'],
                        alpha=0.2, label='Min-Max range')
        ax.set_title('Daily Build Performance Trends', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Duration (minutes)')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
        fig.tight_layout()
        for fmt in formats:
            output_file = os.path.join(output_dir, f'daily_performance_trends.{fmt}')
            fig.savefig(output_file, format=fmt, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"Individual plots also saved in {output_dir}")

    @staticmethod
    def _format_build_row(build):
        """Render one build (a DataFrame row) as a summary line."""
        # Only claim success for builds that actually succeeded; rows without
        # a 'success' field are treated as successful.
        mark = '✓' if build.get('success', True) else '✗'
        return (f" {mark} {build['start_time'].strftime('%Y-%m-%d %H:%M')} - "
                f"{build['duration_minutes']:.2f}m - {os.path.basename(build['file_path'])}")

    def print_summary(self, df):
        """
        Print summary statistics for filtered data.
        Args:
            df (pandas.DataFrame): Filtered builds with at least the
                start_time, duration_minutes and file_path columns
        """
        print("\n" + "="*50)
        print("BUILD ANALYSIS SUMMARY (FILTERED DATA)")
        print("="*50)

        original_total = len(self.build_data) if self.build_data else 0
        filtered_total = len(df)
        print(f"Original builds found: {original_total}")
        print(f"Builds after filtering: {filtered_total}")
        if original_total > 0:
            removed = original_total - filtered_total
            print(f"Filtered out: {removed} ({removed/original_total*100:.1f}%)")
        print(f"Filter criteria: min_duration >= {self.min_duration_seconds}s, exclude_failed = {self.exclude_failed}")
        print()

        if df.empty:
            print("No builds remaining after filtering!")
            return

        durations = df['duration_minutes']
        print("Duration Statistics (minutes):")
        print(f" Mean: {durations.mean():.2f}")
        print(f" Median: {durations.median():.2f}")
        print(f" Min: {durations.min():.2f}")
        print(f" Max: {durations.max():.2f}")
        print(f" Std Dev: {durations.std():.2f}")
        print()

        first_build = df['start_time'].min()
        last_build = df['start_time'].max()
        print("Date Range:")
        print(f" First build: {first_build}")
        print(f" Last build: {last_build}")
        print()

        # Inclusive day count, so a single-day data set counts as 1 day.
        total_days = (last_build.date() - first_build.date()).days + 1
        print(f"Analysis period: {total_days} days")
        print(f"Average builds per day: {len(df) / total_days:.1f}")
        print()

        print("Top 5 longest builds:")
        for _, build in df.nlargest(5, 'duration_minutes').iterrows():
            print(self._format_build_row(build))
        print("\nTop 5 shortest builds (above threshold):")
        for _, build in df.nsmallest(5, 'duration_minutes').iterrows():
            print(self._format_build_row(build))

    def export_data(self, output_file='build_analysis.csv'):
        """
        Export the analyzed data to CSV. The filtered builds go to
        ``output_file``; the unfiltered builds go next to it with ``_raw``
        inserted before the extension.
        Args:
            output_file (str): Path for the output CSV file
        """
        if not self.filtered_data:
            print("No filtered data to export")
            return

        # A bare filename has no directory part, and os.makedirs('') raises.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        pd.DataFrame(self.filtered_data).to_csv(output_file, index=False)
        print(f"Filtered data exported to {output_file}")

        if self.build_data:
            # Split on the real extension: str.replace would also rewrite a
            # '.csv' inside a directory name, and would leave the name
            # unchanged (overwriting the filtered export) without '.csv'.
            stem, ext = os.path.splitext(output_file)
            raw_output_file = f"{stem}_raw{ext}"
            pd.DataFrame(self.build_data).to_csv(raw_output_file, index=False)
            print(f"Raw data exported to {raw_output_file}")
def main(argv=None):
    """
    Command-line entry point: parse options, analyze the logs, then write
    the plots and (unless disabled) the CSV export.
    Args:
        argv (list): Command-line arguments without the program name;
            None (the default) makes argparse read sys.argv
    """
    import argparse
    parser = argparse.ArgumentParser(description='Analyze Docker build logs')
    parser.add_argument('--log-dir', '-d', default='/data',
                        help='Path to log directory containing date folders (default: /data)')
    parser.add_argument('--output-dir', '-o', default='/data/output',
                        help='Output directory for plots and CSV (default: /data/output)')
    parser.add_argument('--formats', '-f', nargs='+', default=['png', 'pdf'],
                        help='Output formats for plots (default: png pdf)')
    parser.add_argument('--min-duration', '-m', type=int, default=30,
                        help='Minimum build duration in seconds to include (filters out cached builds, default: 30)')
    parser.add_argument('--include-failed', action='store_true', default=False,
                        help='Include failed builds in analysis (default: exclude failed builds)')
    # --export-csv is on by default, so on its own it could never be turned
    # off; --no-export-csv writes False to the same destination.
    parser.add_argument('--export-csv', dest='export_csv', action='store_true', default=True,
                        help='Export data to CSV (default: enabled)')
    parser.add_argument('--no-export-csv', dest='export_csv', action='store_false',
                        help='Do not export data to CSV')
    parser.add_argument('--interactive', '-i', action='store_true',
                        help='Run in interactive mode (ask for paths)')
    args = parser.parse_args(argv)

    # Interactive mode for non-Docker usage; an empty answer keeps the
    # value that came from the command line (or its default).
    if args.interactive:
        log_directory = input("Enter the path to your log directory (containing date folders): ").strip() or args.log_dir
        output_directory = input(f"Enter output directory (default: {args.output_dir}): ").strip() or args.output_dir
        formats = input("Enter plot formats separated by spaces (default: png pdf): ").split() or args.formats
        duration_answer = input(f"Minimum build duration in seconds (default: {args.min_duration}): ").strip()
        try:
            min_duration = int(duration_answer) if duration_answer else args.min_duration
        except ValueError:
            # A typo should not abort the whole session.
            print(f"Invalid duration {duration_answer!r}; using {args.min_duration}")
            min_duration = args.min_duration
        include_failed = input("Include failed builds? (y/N): ").lower().startswith('y')
    else:
        log_directory = args.log_dir
        output_directory = args.output_dir
        formats = args.formats
        min_duration = args.min_duration
        include_failed = args.include_failed

    # isdir rather than exists: a regular file is not a usable log root.
    if not os.path.isdir(log_directory):
        print(f"Directory not found: {log_directory}")
        return

    analyzer = DockerLogAnalyzer(
        log_directory,
        min_duration_seconds=min_duration,
        exclude_failed=not include_failed
    )
    print(f"Analyzing Docker build logs in: {log_directory}")
    print(f"Filters: min_duration={min_duration}s, exclude_failed={not include_failed}")
    analyzer.analyze_logs()

    if not analyzer.filtered_data:
        print("No valid build data found after filtering. Please check your log directory structure and file patterns.")
        print(f"Expected structure: {log_directory}/YYYY-MM-DD/*docker-build*.log")
        print("Consider adjusting filter criteria (--min-duration, --include-failed)")
        return

    print(f"Creating visualizations in: {output_directory}")
    analyzer.create_visualizations(output_dir=output_directory, formats=formats)
    if args.export_csv:
        csv_path = os.path.join(output_directory, 'build_analysis.csv')
        analyzer.export_data(csv_path)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
@mtelvers

Copy link
Copy Markdown
Author

Run with

docker run -v $(pwd):/data -w /data -it python:latest bash -c "pip install matplotlib pandas scipy && python docker_log_analyzer.py --min-duration 100"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment