Created
April 14, 2017 05:04
-
-
Save zeulb/6d265885c5248dfb5244e17a600f383c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib | |
matplotlib.use('agg',warn=False, force=True) | |
from matplotlib import pyplot as plt | |
import os | |
from collections import defaultdict | |
# HDFS glob for the raw CSV files (not referenced by the code visible here).
CSV = 'hdfs://ec2-34-204-54-226.compute-1.amazonaws.com:9000/data/*.csv'
# Column names of interest (not referenced by the code visible here).
COLUMNS = ['date', 'serial_number', 'model', 'failure', 'smart_1_raw', 'smart_9_raw']

# Header name of the column whose day-to-day delta is histogrammed.
column = 'smart_1_raw'

# Upper edges of the histogram buckets, with +Inf appended as a catch-all.
# Materialized as a list: the edges are indexed, measured with len() and
# iterated once per observation, which a lazy Python 3 ``map`` cannot support.
splitter = '0,50000000,100000000,150000000,200000000,300000000'
splitter = list(map(float, (splitter + ',Inf').split(',')))

# Positions of the serial number and failure flag within a CSV row.
# NOTE(review): failure_idx is 4 although 'failure' sits at position 3 in
# COLUMNS -- presumably the raw files carry an extra column; confirm.
serial_idx = 1
failure_idx = 4

# Position of ``column`` within a CSV row; resolved from the first header read.
idx = None

# serial -> value maps carried over from the previously processed file.
prev_data = {}
prev_failed_data = {}

# bucket index -> number of observations.
failed_freq = defaultdict(int)
freq = defaultdict(int)
def add_failed(value):
    """Count one observation in the pre-failure histogram ``failed_freq``.

    The sign of ``value`` is ignored. The observation lands in the first
    bucket whose upper edge in ``splitter`` is greater than or equal to it.
    """
    magnitude = abs(value)
    bucket = next(
        position
        for position, upper_edge in enumerate(splitter)
        if magnitude <= upper_edge
    )
    failed_freq[bucket] += 1
def add(value):
    """Count one observation in the overall histogram ``freq``.

    The sign of ``value`` is ignored. The observation lands in the first
    bucket whose upper edge in ``splitter`` is greater than or equal to it.
    """
    magnitude = abs(value)
    bucket = next(
        position
        for position, upper_edge in enumerate(splitter)
        if magnitude <= upper_edge
    )
    freq[bucket] += 1
def label(index):
    """Return the axis label for histogram bucket ``index``.

    Bucket 0 is labelled with its edge as-is. The last bucket is labelled
    ``'> <lower>m'``. Every other bucket is labelled ``'<lower>m - <upper>m'``,
    collapsing to ``'<lower>m'`` when both edges truncate to the same number
    of millions.
    """
    if index == 0:
        return str(splitter[index])

    def in_millions(edge):
        # Truncate towards zero, matching int() on the scaled edge.
        return str(int(edge / 1000000))

    lower = in_millions(splitter[index - 1])
    if index == len(splitter) - 1:
        # Open-ended bucket: its own edge is never formatted.
        return '> ' + lower + 'm'

    upper = in_millions(splitter[index])
    if lower == upper:
        return lower + 'm'
    return lower + 'm - ' + upper + 'm'
# Walk the files under data/ in reverse name order. While one file is being
# read, prev_data / prev_failed_data hold the values collected from the file
# processed just before it (presumably the following day's snapshot -- the
# file names look like dates; confirm).
for filename in sorted(os.listdir('data'), reverse=True):
    data = {}
    failed_data = {}
    with open(os.path.join('data', filename)) as snapshot:
        for index, line in enumerate(snapshot):
            # Drop the line terminator so the last column compares and
            # parses cleanly.
            entry = line.rstrip('\r\n').split(',')

            if index == 0:
                # Resolve the column position once, from the first header
                # seen; raises ValueError if the column is missing.
                if idx is None:
                    idx = entry.index(column)
                continue

            # Skip blank or truncated rows instead of failing on them.
            if len(entry) <= max(idx, serial_idx, failure_idx):
                continue

            raw_value = entry[idx]
            if not raw_value:
                continue

            series = entry[serial_idx]
            value = float(raw_value)
            data[series] = value

            if entry[failure_idx] == '1':
                failed_data[series] = value
                continue

            # Only rows not flagged as failed contribute deltas against the
            # previously processed file.
            if series in prev_data:
                add(prev_data[series] - value)
            if series in prev_failed_data:
                add_failed(prev_failed_data[series] - value)

    prev_data = data
    prev_failed_data = failed_data
def save(title, data, filename):
    """Print a bucket histogram and write it out as a bar chart.

    Args:
        title: Chart title; also printed ahead of the table.
        data: Mapping of bucket index -> count.
        filename: Output path passed to ``Figure.savefig``.
    """
    # A concrete list pins the row order to bucket order; buckets missing
    # from ``data`` appear as NaN rows.
    bucket_labels = [label(position) for position in range(len(splitter))]
    df = pd.Series({label(k): v for k, v in data.items()}, bucket_labels)

    # Parenthesized single-argument prints behave the same on Python 2 and 3.
    print(title)
    print(df)

    bar = df.plot(kind='bar', title=title)
    fig = bar.get_figure()
    fig.tight_layout()
    fig.savefig(filename)
    # Clear the figure so the next chart does not draw over this one.
    fig.clf()
# Render both histograms: every observed delta first, then only the deltas
# recorded ahead of a failure.
for chart_title, histogram, output_path in (
    ('Frequency of Delta in SMART 1 (Read Error Rate) Value',
     freq,
     'smart1.svg'),
    ('Frequency of Delta Before Disk Failure in SMART 1 (Read Error Rate) Value',
     failed_freq,
     'smart1_failure.svg'),
):
    save(chart_title, histogram, output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment