Last active
May 26, 2018 11:03
-
-
Save parastuffs/209501951964494f04dc800ea604d671 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: cp1252 -*- | |
from __future__ import division # http://stackoverflow.com/questions/1267869/how-can-i-force-division-to-be-floating-point-division-keeps-rounding-down-to-0 | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from scipy.stats import linregress | |
from matplotlib.dates import date2num | |
from datetime import datetime, date | |
EPS = 1e-7  # numerical guard: avoids taking log(0) in compute_ln
LOGEPS = -2  # threshold on the log curve below which cycles are truncated
# Default values (overridden below according to the input file name)
MAXLEN = 300  # maximum number of samples kept per cycle
MINLEN = 150  # minimum cycle length for a cycle to be analysed
DELTA = 50  # minimum amplitude (max - min) for a cycle to be kept
start = 1 # 0: dataset begins with a positive slope
          # 1: dataset begins with a negative slope
num = 5  # dataset number; only used in the commented-out CSV dump at the end
# Parse data | |
def date2int(date_str):
    """Parse a timestamp string and return its proleptic Gregorian ordinal.

    Two formats are tried in order: '%d-%m-%y %H:%M' (e.g. '25-12-17 08:30')
    and '%Y/%m/%d %H:%M:%S' (e.g. '2017/12/25 08:30:00').  The time of day
    is discarded by toordinal().

    :param date_str: timestamp string in one of the supported formats
    :return: integer day ordinal
    :raises ValueError: if the string matches none of the formats
    """
    # Local name `dt` avoids shadowing the `date` class imported above.
    for fmt in ('%d-%m-%y %H:%M', '%Y/%m/%d %H:%M:%S'):
        try:
            dt = datetime.strptime(date_str, fmt)
            return dt.toordinal()
        except ValueError:
            pass
    raise ValueError("Unsupported date format: %s" % date_str)
# no empty value when parsing | |
# Input data: CSV with columns timestamp, galery level, atmospheric
# pressure, and their difference.  No empty values are expected.
input_file = 'data-ht-7.csv'
timestamp, galery, atm, diff = np.loadtxt(input_file, delimiter=',',
                                          dtype=float, unpack=True,
                                          converters={0: date2int})

# Per-dataset tuning: dataset digit -> (MAXLEN, MINLEN, DELTA, start).
# NOTE(review): matching a single digit anywhere in the file name is
# fragile (e.g. '1' would also match 'data-ht-12.csv'); the original
# elif chain's first-match order is preserved here.
_DATASET_PARAMS = {
    '1': (300, 150, 200, 0),
    '2': (300, 150, 150, 0),
    '5': (300, 150, 50, 1),
    '6': (300, 100, 100, 0),
    '7': (300, 100, 100, 1),
}
for _key in ('1', '2', '5', '6', '7'):
    if _key in input_file:
        MAXLEN, MINLEN, DELTA, start = _DATASET_PARAMS[_key]
        break
# Order the data to extract the production and pumping cycles.
# Concatenate cycles c1, c2, c3 when len(c2) < EPS and Delta(c2) < x, or when all elements of c2 lie within [0.9, 1.1] * (mean of the 10 last elements).
def concatenation_cycles(cycles, label_cycles):
    """Merge spuriously short cycles into their neighbours.

    A cycle shorter than 8 samples is treated as noise: it is appended to
    the previous kept cycle, together with the cycle that follows it, so
    the three form a single cycle again.

    NOTE(review): the loop stops at len(cycles) - 1, so the last cycle is
    always discarded (presumably because it may be incomplete) — confirm.
    Labels of merged cycles are not removed, so label_cycles may end up
    longer than the returned cycle list.

    :param cycles: list of lists of measurements, one inner list per cycle
    :param label_cycles: list of timestamps, one per cycle
    :return: tuple (new_cycles, new_label_cycles)
    """
    new_cycles = [cycles[0]]
    new_label_cycles = [label_cycles[0]]
    i = 1
    while i < len(cycles) - 1:
        if len(cycles[i]) < 8:
            # Too short: fold this cycle and the next one into the
            # previous kept cycle.
            print("Need to merge cycles")
            print("Merge the current cycle with the previous one.")
            print(cycles[i])
            new_cycles[-1].extend(cycles[i])
            print("Also merge the next one.")
            new_cycles[-1].extend(cycles[i + 1])
            i += 2
        else:
            new_cycles.append(cycles[i])
            new_label_cycles.append(label_cycles[i])
            i += 1
    return new_cycles, new_label_cycles
def cut_data(galery, timestamp):
    """Split the head measurements into alternating rising/falling cycles.

    A cycle ends when the signal leaves a 1% band around the running
    reference `ref` in the current direction (`asc`); the reference moves
    only when a new extremum in the current direction is observed.

    :param galery: sequence of head values (at least 2 elements)
    :param timestamp: sequence of timestamps, same length as galery
    :return: (cycles, labels) as post-processed by concatenation_cycles()
    """
    galery_cycles = []
    label_cycles = [timestamp[0]]
    ref = galery[0]
    cycle = []
    asc = galery[1] > galery[0]
    for i in range(len(galery)):
        value = galery[i]
        # Stay in the current cycle while the value keeps tracking ref
        # within a 1% tolerance in the current direction.
        if (value >= 0.99 * ref and asc) or (value <= 1.01 * ref and not asc):
            cycle.append(value)
        else:
            # Direction change: close the current cycle, start a new one.
            # BUGFIX: the original tested `cycle is not []`, which is an
            # identity comparison against a fresh literal and is always
            # True; the intent was clearly a non-emptiness check.
            if cycle:
                galery_cycles.append(cycle)
                label_cycles.append(timestamp[i])
            cycle = [value]
            asc = not asc
        # Replace ref only when the new value is a better extremum.
        if (asc and ref < value) or (not asc and ref > value):
            ref = value
    if cycle:  # append the trailing cycle
        galery_cycles.append(cycle)
    return concatenation_cycles(galery_cycles, label_cycles)
def create_corresponding_time(cycles):
    """Return, for each cycle, an integer abscissa array 0..len(cycle)-1."""
    return [np.arange(len(single_cycle)) for single_cycle in cycles]
# Head difference between the galery level and atmospheric pressure.
ht = np.array(galery) - np.array(atm)
cycles, label_cycles = cut_data(ht, timestamp)
# Parenthesized print: identical behavior on Python 2, valid on Python 3.
print(label_cycles)
times = create_corresponding_time(cycles)

# Figure 1: the whole dataset.
plt.figure(1)
plt.plot(np.array(range(1, len(ht) + 1)), ht, label="Ht")
####### ### ### ####### | |
## ## ## ## ## ## ## ## | |
## ## ## ## ## ## | |
####### ## ## ## ## ####### | |
## ######### ######### ## | |
## ## ## ## ## ## ## ## | |
####### ## ## ## ## ####### | |
# Control theory method: for a first-order step response, the output
# reaches ~63%, ~87% and ~95% of its final value after 1T, 2T and 3T
# respectively (T = time constant).  Each level yields an estimate of 1/T.
percentages = {95:3, 87:2, 63:1}  # threshold % -> number of time constants
coef = {95:[], 87:[], 63:[]}  # per-threshold estimates of 1/T
coefMerged = []  # all 1/T estimates pooled across thresholds
coefMeans = []  # mean estimate per threshold
coefStd = []  # standard deviation per threshold
# Every other cycle (starting at `start`) is a recovery cycle to analyse.
for i in range(start,len(cycles),2) :
    print("Cycle {}".format(i))
    cycle = cycles[i]
    # Keep only cycles that are long enough and swing widely enough.
    if len(cycle) > MINLEN and max(cycle) - min(cycle) > DELTA :
        maxCycle = max(cycle)
        minCycle = min(cycle)
        rangeCycle = maxCycle - minCycle
        for p in percentages:
            ratio = percentages[p]
            # First, find the threshold at 95, 87 or 63% of the range.
            threshold = p * rangeCycle / 100
            # The target is minCycle (the minimum value in the cycle) + threshold.
            # The lambda finds the closest actual value in the list; its index
            # gives the abscissa of that value.  Since all cycles begin at
            # zero, this abscissa is the T, 2T or 3T of our system.
            time_carac = cycle.index(min(cycle, key=lambda x:abs(minCycle + threshold-x))) / ratio
            if time_carac > 0:
                coef[p].append(1/time_carac)
# Aggregate the per-threshold estimates and report.
for c in coef:
    coefMeans.append(np.mean(coef[c]))
    coefStd.append(np.std(coef[c]))
    print("{} mean: {}+/-{}".format(c, coefMeans[-1], coefStd[-1]))
    for i in coef[c]:
        coefMerged.append(i)
print("SAAS method: {}+/-{}".format(np.mean(coefMeans), np.std(coefMerged)))
# Compute the ln | |
def compute_ln(ht, eps=1e-7, logeps=-2):
    """Log-linearise a recovery cycle.

    Computes ln(|(h - hf + eps) / (h0 - hf)|), where h0 and hf are the
    min and max of the cycle, plus the index of the first sample whose
    log value drops below `logeps` (the caller truncates there).

    The defaults mirror the module-level EPS and LOGEPS constants; they
    are parameters so the function is self-contained and tunable.

    :param ht: numpy array (or list of numpy scalars) of head values
    :param eps: numerical guard avoiding log(0)
    :param logeps: truncation threshold on the log curve
    :return: (log_ht, idx); idx is 0 when no sample falls below `logeps`
    """
    h0 = min(ht)
    hf = max(ht)
    log_ht = np.log(abs((ht - hf + eps) / (h0 - hf)))
    # argmax returns the first True; it also returns 0 when the condition
    # never holds — the caller treats idx == 0 as "do not truncate".
    idx = np.argmax(log_ht < logeps)
    return log_ht, idx
####### ## ##### ######## ######### ####### | |
## ## ## ## ## ## ## ## ## ## | |
## ## ## ## ## ## ## ## | |
####### ## ## ## ####### ###### ####### | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## ## ## | |
####### ######### ##### ## ######### ####### | |
# Per-cycle results of the log-linearisation method, filled by the loop below.
ln_cycles = []  # truncated log curves
len_cycles = []  # length of each kept log curve
times_ln = []  # abscissa array of each kept log curve
slopes = []  # fitted slope of each kept cycle
intercepts = []  # fitted intercept of each kept cycle
std_errors = []  # standard error of each fit
date_strings = []  # start date of each kept cycle
# Non-discarded cycles
valid_cycles = []
# Tableau-style palette so each plotted cycle gets a distinct colour.
colors = ["#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"]
fig, ax = plt.subplots()
ax.set_prop_cycle('color', colors)
# Every other cycle (starting at `start`) is a recovery cycle:
# log-linearise it, fit a line, and keep the fit if it passes the filters.
for i in range(start,len(cycles),2) :
    cycle = cycles[i]
    # Filter 1: cycle must be long enough and swing widely enough.
    if len(cycle) > MINLEN and max(cycle) - min(cycle) > DELTA :
        date_string = date.fromordinal(int(label_cycles[i]))
        time_ln = times[i]
        # Compute the log curve and where it first crosses LOGEPS.
        ln, idx = compute_ln(cycle)
        if idx != 0 : # otherwise array is empty
            # Truncate everything at the LOGEPS crossing.
            ln = ln[:idx]
            time_ln = time_ln[:idx]
            cycle = cycle[:idx]
            # Cap the cycle at MAXLEN samples.
            if len(cycle) > MAXLEN :
                ln = ln[:MAXLEN]
                time_ln = time_ln[:MAXLEN]
                cycle = cycle[:MAXLEN]
            r_value = 0
            offset = 0
            # Trim samples off the tail until the linear fit is good
            # (|R| >= 0.99) or almost nothing remains.
            while abs(r_value) < 0.99 and offset < len(ln) - 1:
                offset += 1
                slope, intercept, r_value, p_value, std_err = linregress(time_ln[:-offset], ln[:-offset])
            print("Playtime is over --------------------")
            # Keep only the portion the fit converged on.
            time_ln = time_ln[:-offset]
            ln = ln[:-offset]
            cycle = cycle[:-offset]
            # Filter 2: the trimmed cycle must still exceed DELTA.
            if max(cycle) - min(cycle) > DELTA:
                # Refit on the trimmed data (same span as the last loop fit).
                slope, intercept, r_value, p_value, std_err = linregress(time_ln, ln)
                print("slope R value: {}".format(r_value))
                print("Dataset length: {}".format(len(ln)))
                print("Slope: {}, date: {}".format(slope, date_string))
                # Save the results for the outlier filtering step below.
                valid_cycles.append(cycle)
                ln_cycles.append(ln)
                len_cycles.append(len(ln))
                times_ln.append(time_ln)
                slopes.append(slope)
                std_errors.append(std_err)
                intercepts.append(intercept)
                date_strings.append(date_string)
            else:
                print("Discarding cycle from {}, delta too low: {}/{}".format(date_string, max(cycle) - min(cycle), DELTA))
    else:
        print("Dropping a cycle of length {} and delta {}".format(len(cycle), max(cycle) - min(cycle)))
######### ######## ## ########## ######### ######## | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## | |
###### ## ## ## ###### ######## | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## | |
## ######## ######### ## ######### ## ## | |
# Now that everything has been processed, look for abnormalities and
# discard them: any slope further than two standard deviations from the
# mean is rejected; the remaining cycles are plotted.
slopeMean = np.mean(slopes)  # mean slope
slopeStd = np.std(slopes)  # slope standard deviation
print("Discarding abnormalities. Original mean and std: {}, {}".format(slopeMean, slopeStd))
# If a slope is further than twice the stdev from the mean, throw it the fuck away.
slopesCpy = slopes[:]  # iterate over a frozen copy; mutate the live lists
idxSlopes = 0  # index into the (shrinking) live lists
for i in range(len(slopesCpy)):
    if abs(slopeMean - slopesCpy[i]) > 2 * slopeStd:
        print("Slope is too shitty. Slope: {} from {}, mean: {}, stdev: {}".format(slopesCpy[i], date_strings[idxSlopes], slopeMean, slopeStd))
        del slopes[idxSlopes]
        del std_errors[idxSlopes]
        del intercepts[idxSlopes]
        del ln_cycles[idxSlopes]
        del len_cycles[idxSlopes]
        del date_strings[idxSlopes]
        # BUGFIX: times_ln and valid_cycles were not trimmed here, so
        # after a deletion the plots below used data from the wrong cycle.
        del times_ln[idxSlopes]
        del valid_cycles[idxSlopes]
    else:
        # Figure 2: fitted regression lines; figure 3: the raw cycles.
        plt.figure(2)
        plt.plot(times_ln[idxSlopes], slopes[idxSlopes] * times_ln[idxSlopes] + intercepts[idxSlopes], linestyle='--', label="AL " + str(date_strings[idxSlopes]))
        plt.legend()
        plt.figure(3)
        plt.plot(times_ln[idxSlopes], valid_cycles[idxSlopes], label="Cycle " + str(date_strings[idxSlopes]))
        # If we do not delete an element from the lists, we can increment
        # the index.  Otherwise we don't.
        idxSlopes += 1
plt.legend()
# BUGFIX: the format placeholder was missing, so the lengths never printed.
print("Cycles length: {}".format(len_cycles))
print("Slopes: {}".format(slopes))
print("Mean slope: {}".format(-1 * np.mean(slopes)))
print("Std error : " + str(np.std(slopes)))
print("std/mean: {}".format(-1 * np.std(slopes) / np.mean(slopes)))
print("Min slope : " + str(min(slopes)))
print("Max slope : " + str(max(slopes)))
plt.show()
# TODO
# If we discard too many values in a cycle, trash it: max(cycle) - min(cycle) > DELTA still needs to hold.
# Add a feature allowing to tune the time between two measures. By default it is 60 seconds, but this value could change.
# Add the time of day to the date string.
# Exact coefficients: 63.2, 86.5, 95.
# Linear approximation to determine the correct abscissa for the SAAS coefficients.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment