
@astrowonk
Created November 29, 2020 19:03
Python script to parse and analyze the raw CSV files from Backblaze's hard drive reliability data releases
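
The script expects the daily CSV files from a Backblaze drive-stats download, which are named by date (e.g. 2020-01-01.csv) and therefore match the 20*.csv glob below. Run it from the folder containing the files, or point it at them with --dir, e.g. (the filename parse_backblaze.py is just a placeholder, not from the gist):

    python parse_backblaze.py --dir /path/to/drive_stats
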
import dask.dataframe as dd
import numpy as np
import pandas as pd
import os
import argparse
from dask.diagnostics import ProgressBar

ProgressBar().register()

# requires dask, pandas, and numpy
# run this script from the folder with the csv files, or pass the folder via --dir
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process Backblaze drive data')
    parser.add_argument('--dir',
                        type=str,
                        default=None,
                        help='Directory location of csv files.')
    args = parser.parse_args()
    if args.dir:
        os.chdir(args.dir)
    print(f"Processing in directory {os.getcwd()}")
    # lazily read every daily csv file; dask only materializes them at .compute()
    df = dd.read_csv('20*.csv')
    # per (model, capacity) totals: 'sum' counts failures, 'count' counts drive-days
    computed_groupby = df.groupby(
        ['model', 'capacity_bytes'])['failure'].agg(
            ['sum', 'count']).compute().reset_index()
    # some drives report a -1 capacity, which seems to happen on their first day.
    # grouping by capacity preserved those rows, so the second groupby below adds
    # their drive-days back into each model's totals
    print(computed_groupby.shape)
    out = pd.concat([
        computed_groupby.groupby('model')[['sum', 'count']].sum(),
        computed_groupby.groupby('model')['capacity_bytes'].max()
    ], axis=1)
    out['capacity_TB'] = np.round(out['capacity_bytes'] / 1e12, 2)
    # annualized failure rate: failures per drive-year, expressed as a percentage
    out['annual_failure_rate'] = 100.0 * out['sum'] / (out['count'] / 365.0)
    cleaned_data = out.query('capacity_bytes > 0').rename(
        {
            'sum': 'failure_count',
            'count': 'drive_days'
        }, axis=1).sort_values(
            ['annual_failure_rate', 'drive_days'],
            ascending=[True, False]).query('drive_days > 10000')
    cleaned_data.to_csv('stats_file.csv')
    cleaned_data.to_html('stats_summary.html')
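
Once it finishes, stats_file.csv holds one row per drive model with failure_count, drive_days, capacity_bytes, capacity_TB, and annual_failure_rate. As a sanity check on the formula: a model with 5 failures over 400,000 drive-days works out to 100 × 5 / (400000 / 365) ≈ 0.46% per year. A minimal sketch of reading the summary back with pandas (the index column name model follows from the groupby above):

    import pandas as pd

    stats = pd.read_csv('stats_file.csv', index_col='model')
    # ten lowest annualized failure rates among models that survived the filters
    print(stats.sort_values('annual_failure_rate').head(10))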