Skip to content

Instantly share code, notes, and snippets.

@zeulb
Created April 14, 2017 05:04
Show Gist options
  • Save zeulb/6d265885c5248dfb5244e17a600f383c to your computer and use it in GitHub Desktop.
Save zeulb/6d265885c5248dfb5244e17a600f383c to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('agg',warn=False, force=True)
from matplotlib import pyplot as plt
import os
from collections import defaultdict
# Data location and column list; neither name is referenced by the rest of
# this script as shown.
CSV = 'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/data/*.csv'
COLUMNS = ['date', 'serial_number', 'model', 'failure', 'smart_1_raw', 'smart_9_raw']

# Header name of the CSV column whose day-to-day delta is being bucketed.
column = 'smart_1_raw'

# Inclusive upper bounds of the histogram buckets, with a final +Inf catch-all.
# Built as a real list (not a bare map()) so that len(), indexing and repeated
# iteration behave the same on Python 2 and Python 3.
splitter = '0,50000000,100000000,150000000,200000000,300000000'
splitter = [float(bound) for bound in (splitter + ',Inf').split(',')]

# Fixed positions of the serial-number and failure-flag fields in each CSV row.
serial_idx = 1
failure_idx = 4
# Position of `column` in a row; resolved from the first header line read.
idx = None

# Readings (serial -> value) from the previously processed file.
prev_data = {}
prev_failed_data = {}

# Bucket index -> number of deltas that fell into that bucket.
failed_freq = defaultdict(int)
freq = defaultdict(int)
def add_failed(value):
    """Record one delta in the `failed_freq` histogram.

    The absolute value of `value` is placed in the first bucket of `splitter`
    whose upper bound is greater than or equal to it, and that bucket's
    counter in `failed_freq` is incremented.

    Raises StopIteration if no bound in `splitter` compares >= the magnitude.
    """
    magnitude = abs(value)
    for bucket, upper_bound in enumerate(splitter):
        if magnitude <= upper_bound:
            failed_freq[bucket] += 1
            return
    raise StopIteration
def add(value):
    """Record one delta in the overall `freq` histogram.

    The absolute value of `value` is placed in the first bucket of `splitter`
    whose upper bound is greater than or equal to it, and that bucket's
    counter in `freq` is incremented.

    Raises StopIteration if no bound in `splitter` compares >= the magnitude.
    """
    magnitude = abs(value)
    for bucket, upper_bound in enumerate(splitter):
        if magnitude <= upper_bound:
            freq[bucket] += 1
            return
    raise StopIteration
def label(index):
    """Return the display label for the histogram bucket at `index`.

    - Bucket 0 is labelled with the string form of its own bound.
    - The last bucket is labelled '> <N>m', where N is the previous bound
      divided by one million and truncated to an int.
    - Any other bucket is labelled '<P>m - <S>m' from its lower and upper
      bounds in millions, collapsing to '<P>m' when both render the same.
    """
    if index == 0:
        return str(splitter[index])

    lower = str(int(splitter[index - 1] / 1000000))
    if index == len(splitter) - 1:
        # The final bound is not converted: only the lower edge is shown.
        return '> {0}m'.format(lower)

    upper = str(int(splitter[index] / 1000000))
    return '{0}m'.format(lower) if lower == upper else '{0}m - {1}m'.format(lower, upper)
# Walk the files under 'data' in reverse-sorted name order. While a file is
# being read, prev_data / prev_failed_data hold the readings collected from
# the file processed just before it (presumably the following day's snapshot,
# if files are named by date -- confirm against the data layout).
for filename in sorted(os.listdir('data'), reverse=True):
    data = {}         # serial -> value of `column` for every row with a value
    failed_data = {}  # serial -> value for rows whose failure flag is '1'
    with open(os.path.join('data', filename)) as snapshot:
        for line_no, line in enumerate(snapshot):
            # Strip the line terminator so the last field of a header or row
            # does not carry a trailing newline into the lookup or the value.
            stripped = line.rstrip('\r\n')
            if line_no == 0:
                # Header row: resolve the column position once, from the
                # first header encountered; later headers are ignored.
                if idx is None:
                    idx = stripped.split(',').index(column)
                continue
            if not stripped:
                # Blank line (e.g. trailing newline at end of file).
                continue

            entry = stripped.split(',')
            serial = entry[serial_idx]
            failure = entry[failure_idx]
            if not entry[idx]:
                # No reading for this drive in this file.
                continue

            value = float(entry[idx])
            data[serial] = value
            if failure == '1':
                failed_data[serial] = value
                continue

            # Drive not flagged as failed here: bucket the change between the
            # previously processed file's reading and this one.
            if serial in prev_data:
                add(prev_data[serial] - value)
            # Same delta, counted separately when the drive was flagged as
            # failed in the previously processed file.
            if serial in prev_failed_data:
                add_failed(prev_failed_data[serial] - value)
    prev_data = data
    prev_failed_data = failed_data
def save(title, data, filename):
    """Print a bucket histogram and write it as a bar chart to `filename`.

    title    -- chart title; also printed before the table.
    data     -- mapping of bucket index -> count (e.g. `freq`).
    filename -- output path handed to the figure's savefig().
    """
    # Index the series by every bucket label, in bucket order, so the bars
    # follow `splitter` rather than dict order. A concrete list is used (not
    # a lazy map object) so the index is valid on Python 2 and Python 3.
    bucket_labels = [label(position) for position in range(len(splitter))]
    counts = pd.Series({label(k): v for k, v in data.items()}, bucket_labels)

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print(title)
    print(counts)

    axes = counts.plot(kind='bar', title=title)
    fig = axes.get_figure()
    fig.tight_layout()
    fig.savefig(filename)
    # Clear the figure so the next chart does not draw on top of this one.
    fig.clf()
# Emit one chart per histogram: all observed deltas first, then the deltas
# recorded for drives that were subsequently flagged as failed.
for chart_title, counts, out_path in (
    ('Frequency of Delta in SMART 1 (Read Error Rate) Value', freq, 'smart1.svg'),
    ('Frequency of Delta Before Disk Failure in SMART 1 (Read Error Rate) Value', failed_freq, 'smart1_failure.svg'),
):
    save(chart_title, counts, out_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment