Skip to content

Instantly share code, notes, and snippets.

@recalde
Last active February 28, 2024 20:20
Show Gist options
  • Save recalde/2ecccf076b5d01ce840cf1801695dc47 to your computer and use it in GitHub Desktop.
Save recalde/2ecccf076b5d01ce840cf1801695dc47 to your computer and use it in GitHub Desktop.
s3-stats
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import os
# Load every Parquet file found directly inside data_dir with Apache Arrow and
# stack them into one DataFrame.
data_dir = '/path/to/your/data/directory'
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined frame reproducible from run to run.
file_paths = sorted(
    os.path.join(data_dir, file_name)
    for file_name in os.listdir(data_dir)
    if file_name.endswith('.parquet')
)
if not file_paths:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the directory.
    raise ValueError(f"No .parquet files found in {data_dir!r}")
dfs = [pq.read_table(path).to_pandas() for path in file_paths]
# ignore_index=True discards the per-file indexes in favour of one 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
# Split the key on '/' and keep its first three segments as date, app and
# environment.
# reindex() pins the split result to exactly three columns: keys with extra
# segments (e.g. a trailing file name) no longer break the assignment with a
# column-count mismatch, and keys with fewer segments get NaN for the missing
# parts.
key_parts = df['key'].str.split('/', expand=True).reindex(columns=range(3))
df[['date', 'app', 'environment']] = key_parts
# Convert 'modified' column to datetime
df['modified'] = pd.to_datetime(df['modified'])
# Total size for every (date, app) pair
summary = df.groupby(['date', 'app']).agg(total_size=('size', 'sum')).reset_index()
# Month bucket = first seven characters of the date string
# NOTE(review): assumes 'date' looks like YYYY-MM-DD, so [:7] is YYYY-MM — confirm
summary['month'] = summary['date'].str[:7]
# Reshape the summary into a month x app grid of summed sizes (0 where an app
# has no data for a month), then draw it as a stacked bar chart.
pivot_table = pd.pivot_table(
    summary,
    values='total_size',
    index='month',
    columns='app',
    aggfunc='sum',
    fill_value=0,
)
# The plot call opens a new 10x6 figure and returns its Axes, which is also
# the current pyplot axes, so labels can be set on it directly.
axes = pivot_table.plot.bar(stacked=True, figsize=(10, 6))
axes.set_xlabel('Month')
axes.set_ylabel('Sum of Size')
axes.set_title('Sum of Size per Month by App')
# Tilt the month labels on the x axis by 45 degrees.
plt.xticks(rotation=45)
axes.figure.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment