Skip to content

Instantly share code, notes, and snippets.

@recalde
Last active March 7, 2024 20:32
Show Gist options
  • Save recalde/585edfd205d07d26e90554344fb6e65c to your computer and use it in GitHub Desktop.
Save recalde/585edfd205d07d26e90554344fb6e65c to your computer and use it in GitHub Desktop.
import pandas as pd
import pyarrow.csv as pv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import calendar
from pathlib import Path
import os
def read_large_csv(filepath):
"""Read a large CSV file using pyarrow for efficiency."""
table = pv.read_csv(filepath)
return table.to_pandas()
def prepare_and_categorize_data(df):
"""Prepares and categorizes data."""
df[['input1', 'app', 'env', 'input4', 'guid']] = df['input'].str.split('/', expand=True)
df['calc_month'] = df['calc_dt'].str[4:6].astype(int).apply(lambda x: calendar.month_abbr[x])
df['calc_year'] = df['calc_dt'].str[:4]
duration_fields = ['requestQueuedDuration', 'requestDuration', 'businessTxnDuration', 'overallDuration']
for col in duration_fields:
df[col] = df[col] / 1000
conditions = [
(df['businessTxnDuration'] < 5),
(df['businessTxnDuration'] < 30),
(df['businessTxnDuration'] < 90),
(df['businessTxnDuration'] >= 90)
]
categories = ['1-Small-0-5', '2-Medium-5-30', '3-Large-30-90', '4-XLarge-90Plus']
df['category'] = np.select(conditions, categories)
return df
def calculate_statistics(df):
"""Calculates grouped statistics."""
stats = df.groupby(['app', 'cluster', 'businessTxn', 'category', 'calc_year', 'calc_month']).agg({
'requestQueuedDuration': ['min', 'max', 'mean'],
'requestDuration': ['min', 'max', 'mean'],
'overallDuration': ['min', 'max', 'mean'],
'businessTxnDuration': ['min', 'max', 'mean']
}).reset_index()
stats.columns = ['_'.join(col).strip() for col in stats.columns.values]
return stats
def generate_visualizations(df, visualizations_dir="visualizations"):
"""Generates year-over-year bar charts for each app and businessTxn combination."""
Path(visualizations_dir).mkdir(exist_ok=True)
combinations = df[['app', 'businessTxn']].drop_duplicates()
for index, row in combinations.iterrows():
app, business_txn = row['app'], row['businessTxn']
subset = df[(df['app'] == app) & (df['businessTxn'] == business_txn)]
pivot_table = subset.pivot_table(index='calc_month', columns='calc_year', values='businessTxnDuration', aggfunc='mean').fillna(0)
months_order = list(calendar.month_abbr[1:])
pivot_table = pivot_table.reindex(months_order)
pivot_table.plot(kind='bar', stacked=False, figsize=(10, 6))
plt.title(f'{app} - {business_txn} Year-over-Year')
plt.ylabel('Seconds')
plt.xlabel('Month')
plt.xticks(rotation=45)
plt.legend(title='Year')
fig_name = f"{app}_{business_txn}_YoY.png".replace('/', '_').replace(' ', '_')
plt.savefig(os.path.join(visualizations_dir, fig_name))
plt.close()
def generate_html_content(stats, image_html):
"""Generates HTML content with statistics and visualizations."""
stats_html = stats.to_html(index=False)
html_content = f"""
<html>
<head><title>Data Analysis Report</title></head>
<body>
<h1>Data Analysis Summary</h1>
<h2>Statistics</h2>
{stats_html}
<h2>Visualizations</h2>
{image_html}
</body>
</html>
"""
with open('data_analysis_report.html', 'w') as f:
f.write(html_content)
def update_html_for_visualizations(visualizations_dir="visualizations"):
"""Generates HTML for visualizations."""
image_html = ""
for image_name in os.listdir(visualizations_dir):
image_html += f'<img src="{visualizations_dir}/{image_name}" alt="{image_name}" style="width:100%;max-width:600px;"><br>\n'
return image_html
def main(filepath):
df = read_large_csv(filepath)
prepared_df = prepare_and_categorize_data(df)
stats = calculate_statistics(prepared_df)
generate_visualizations(prepared_df)
image_html = update_html_for_visualizations()
generate_html_content(stats, image_html)
if __name__ == "__main__":
csv_file_path = 'timing_info_report.csv' # Replace this with your actual file path
main(csv_file_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment