Created: November 29, 2020 19:03
Save astrowonk/32f02d127441c138c182744277ecfed5 to your computer and use it in GitHub Desktop.
Python script to parse and analyze Backblaze Hard Drive Reliability Data raw csv files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requires dask, pandas, and numpy.
# Run this script from the folder containing the csv files, or pass --dir.
import argparse
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar

# Show a progress bar during every dask .compute() call.
ProgressBar().register()
def summarize_drive_stats(grouped):
    """Collapse per-(model, capacity) failure tallies into per-model stats.

    Parameters
    ----------
    grouped : pandas.DataFrame
        One row per (model, capacity_bytes) pair with columns
        'model', 'capacity_bytes', 'sum' (failure count) and
        'count' (drive-days observed).

    Returns
    -------
    pandas.DataFrame
        Indexed by model with columns failure_count, drive_days,
        capacity_bytes, capacity_TB and annual_failure_rate, sorted by
        failure rate ascending then drive_days descending. Models with
        no positive reported capacity, or with 10000 or fewer
        drive-days, are dropped.
    """
    # Some drives report a -1 capacity (apparently on their first day).
    # Summing over the capacity groups folds those -1-capacity days back
    # into each model's totals, while max() recovers the real capacity.
    out = pd.concat([
        grouped.groupby('model')[['sum', 'count']].sum(),
        grouped.groupby('model')['capacity_bytes'].max(),
    ], axis=1)
    out['capacity_TB'] = np.round(out['capacity_bytes'] / 1E12, 2)
    # 'count' is drive-days; dividing by 365 gives drive-years, so this
    # is failures per 100 drive-years (the usual AFR definition).
    out['annual_failure_rate'] = 100.0 * out['sum'] / (out['count'] / 365.0)
    return (out.query('capacity_bytes > 0')
               .rename({'sum': 'failure_count',
                        'count': 'drive_days'}, axis=1)
               .sort_values(['annual_failure_rate', 'drive_days'],
                            ascending=[True, False])
               .query('drive_days > 10000'))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process BB data')
    # BUG FIX: the original used action='store_const' with const=str, which
    # made --dir a no-value flag that set args.dir to the builtin str *type*;
    # os.chdir(args.dir) would then fail. --dir now takes a directory path.
    parser.add_argument('--dir',
                        type=str,
                        default=None,
                        help='Directory location of csv files.')
    args = parser.parse_args()
    if args.dir:
        os.chdir(args.dir)
    print(f"Processing in directory {os.getcwd()}")
    # Lazily read every Backblaze daily csv (files named 20xx-...) and tally
    # failures ('sum') and observation days ('count') per (model, capacity).
    df = dd.read_csv('20*.csv')
    computed_groupby = df.groupby(
        ['model', 'capacity_bytes'])['failure'].agg(
            ['sum', 'count']).compute().reset_index()
    print(computed_groupby.shape)
    cleaned_data = summarize_drive_stats(computed_groupby)
    cleaned_data.to_csv('stats_file.csv')
    cleaned_data.to_html('stats_summary.html')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.