Python script to parse and analyze the raw Backblaze Hard Drive Reliability Data CSV files
import argparse
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar

ProgressBar().register()

# Requires dask, pandas, and numpy.
# Run this script from the folder containing the CSV files, or pass --dir.
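#
# Example invocations (the script filename here is just illustrative):
#   python analyze_backblaze.py
#   python analyze_backblaze.py --dir /path/to/backblaze/csvs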
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process Backblaze data')
    parser.add_argument('--dir',
                        type=str,
                        default=None,
                        help='Directory location of CSV files.')
    args = parser.parse_args()
    if args.dir:
        os.chdir(args.dir)
    print(f"Processing in directory {os.getcwd()}")
    # Lazily load all daily CSVs (named by date, e.g. 2019-01-01.csv).
    df = dd.read_csv('20*.csv')
    # Tally failures and drive-days (one row per drive per day) for each
    # (model, capacity) pair.
    computed_groupby = (df.groupby(['model', 'capacity_bytes'])['failure']
                          .agg(['sum', 'count'])
                          .compute()
                          .reset_index())
    # Some drives report a capacity of -1, which seems to happen on a drive's
    # first day. Grouping by capacity above keeps those rows distinct; the
    # per-model sum below folds their drive-days back into each model's
    # totals, while max() over capacity_bytes recovers the real capacity.
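    # Illustrative sketch with made-up numbers: if model 'X' appears as
    #   (capacity_bytes=-1,            sum=0, count=5)     <- first-day rows
    #   (capacity_bytes=4000787030016, sum=2, count=1000)
    # the per-model sum gives sum=2, count=1005, and max() recovers the
    # real 4 TB capacity.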
    print(computed_groupby.shape)
    out = pd.concat([
        computed_groupby.groupby('model')[['sum', 'count']].sum(),
        computed_groupby.groupby('model')['capacity_bytes'].max()
    ], axis=1)
    out['capacity_TB'] = np.round(out['capacity_bytes'] / 1E12, 2)
    # AFR (%) = 100 * failures / drive-years, with drive-years = drive-days / 365.
    out['annual_failure_rate'] = 100.0 * out['sum'] / (out['count'] / 365.0)
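    # Worked example with made-up numbers: 2 failures over 7300 drive-days
    # is 2 / (7300 / 365) = 2 / 20 drive-years, i.e. an AFR of 10%.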
    # Drop models whose only reported capacity was -1, sort best-first, and
    # keep only models with enough drive-days for a meaningful rate.
    cleaned_data = (out.query('capacity_bytes > 0')
                       .rename({'sum': 'failure_count',
                                'count': 'drive_days'}, axis=1)
                       .sort_values(['annual_failure_rate', 'drive_days'],
                                    ascending=[True, False])
                       .query('drive_days > 10000'))
    cleaned_data.to_csv('stats_file.csv')
    cleaned_data.to_html('stats_summary.html')
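# The output files contain one row per model with failure_count, drive_days,
# capacity_bytes, capacity_TB, and annual_failure_rate (percent), with the
# lowest-AFR, highest-drive-day models listed first. A minimal sketch of
# reading the summary back for further analysis (a hypothetical follow-up,
# not part of the script itself):
#
#   stats = pd.read_csv('stats_file.csv', index_col='model')
#   print(stats.nsmallest(10, 'annual_failure_rate'))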