Last active
March 7, 2024 20:32
-
-
Save recalde/585edfd205d07d26e90554344fb6e65c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import pyarrow.csv as pv | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy as np | |
import calendar | |
from pathlib import Path | |
import os | |
def read_large_csv(filepath): | |
"""Read a large CSV file using pyarrow for efficiency.""" | |
table = pv.read_csv(filepath) | |
return table.to_pandas() | |
def prepare_and_categorize_data(df): | |
"""Prepares and categorizes data.""" | |
df[['input1', 'app', 'env', 'input4', 'guid']] = df['input'].str.split('/', expand=True) | |
df['calc_month'] = df['calc_dt'].str[4:6].astype(int).apply(lambda x: calendar.month_abbr[x]) | |
df['calc_year'] = df['calc_dt'].str[:4] | |
duration_fields = ['requestQueuedDuration', 'requestDuration', 'businessTxnDuration', 'overallDuration'] | |
for col in duration_fields: | |
df[col] = df[col] / 1000 | |
conditions = [ | |
(df['businessTxnDuration'] < 5), | |
(df['businessTxnDuration'] < 30), | |
(df['businessTxnDuration'] < 90), | |
(df['businessTxnDuration'] >= 90) | |
] | |
categories = ['1-Small-0-5', '2-Medium-5-30', '3-Large-30-90', '4-XLarge-90Plus'] | |
df['category'] = np.select(conditions, categories) | |
return df | |
def calculate_statistics(df): | |
"""Calculates grouped statistics.""" | |
stats = df.groupby(['app', 'cluster', 'businessTxn', 'category', 'calc_year', 'calc_month']).agg({ | |
'requestQueuedDuration': ['min', 'max', 'mean'], | |
'requestDuration': ['min', 'max', 'mean'], | |
'overallDuration': ['min', 'max', 'mean'], | |
'businessTxnDuration': ['min', 'max', 'mean'] | |
}).reset_index() | |
stats.columns = ['_'.join(col).strip() for col in stats.columns.values] | |
return stats | |
def generate_visualizations(df, visualizations_dir="visualizations"): | |
"""Generates year-over-year bar charts for each app and businessTxn combination.""" | |
Path(visualizations_dir).mkdir(exist_ok=True) | |
combinations = df[['app', 'businessTxn']].drop_duplicates() | |
for index, row in combinations.iterrows(): | |
app, business_txn = row['app'], row['businessTxn'] | |
subset = df[(df['app'] == app) & (df['businessTxn'] == business_txn)] | |
pivot_table = subset.pivot_table(index='calc_month', columns='calc_year', values='businessTxnDuration', aggfunc='mean').fillna(0) | |
months_order = list(calendar.month_abbr[1:]) | |
pivot_table = pivot_table.reindex(months_order) | |
pivot_table.plot(kind='bar', stacked=False, figsize=(10, 6)) | |
plt.title(f'{app} - {business_txn} Year-over-Year') | |
plt.ylabel('Seconds') | |
plt.xlabel('Month') | |
plt.xticks(rotation=45) | |
plt.legend(title='Year') | |
fig_name = f"{app}_{business_txn}_YoY.png".replace('/', '_').replace(' ', '_') | |
plt.savefig(os.path.join(visualizations_dir, fig_name)) | |
plt.close() | |
def generate_html_content(stats, image_html): | |
"""Generates HTML content with statistics and visualizations.""" | |
stats_html = stats.to_html(index=False) | |
html_content = f""" | |
<html> | |
<head><title>Data Analysis Report</title></head> | |
<body> | |
<h1>Data Analysis Summary</h1> | |
<h2>Statistics</h2> | |
{stats_html} | |
<h2>Visualizations</h2> | |
{image_html} | |
</body> | |
</html> | |
""" | |
with open('data_analysis_report.html', 'w') as f: | |
f.write(html_content) | |
def update_html_for_visualizations(visualizations_dir="visualizations"): | |
"""Generates HTML for visualizations.""" | |
image_html = "" | |
for image_name in os.listdir(visualizations_dir): | |
image_html += f'<img src="{visualizations_dir}/{image_name}" alt="{image_name}" style="width:100%;max-width:600px;"><br>\n' | |
return image_html | |
def main(filepath): | |
df = read_large_csv(filepath) | |
prepared_df = prepare_and_categorize_data(df) | |
stats = calculate_statistics(prepared_df) | |
generate_visualizations(prepared_df) | |
image_html = update_html_for_visualizations() | |
generate_html_content(stats, image_html) | |
if __name__ == "__main__": | |
csv_file_path = 'timing_info_report.csv' # Replace this with your actual file path | |
main(csv_file_path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment