Last active
May 26, 2018 11:03
-
-
Save parastuffs/209501951964494f04dc800ea604d671 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: cp1252 -*- | |
from __future__ import division # http://stackoverflow.com/questions/1267869/how-can-i-force-division-to-be-floating-point-division-keeps-rounding-down-to-0 | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from scipy.stats import linregress | |
from matplotlib.dates import date2num | |
from datetime import datetime, date | |
EPS = 1e-7  # numerical guard: avoids taking log(0) in compute_ln
LOGEPS = -2  # threshold on the log curve below which cycles are truncated
# Default values (overridden below according to the input file name)
MAXLEN = 300  # maximum number of samples kept per cycle
MINLEN = 150  # minimum cycle length for a cycle to be analysed
DELTA = 50  # minimum amplitude (max - min) for a cycle to be kept
start = 1 # 0: dataset begins with a positive slope
          # 1: dataset begins with a negative slope
num = 5  # dataset number; only used in the commented-out CSV dump at the end
# Parse data | |
def date2int(date_str):
    """Parse a timestamp string and return its proleptic Gregorian ordinal.

    Two formats are tried in order: '%d-%m-%y %H:%M' (e.g. '25-12-17 08:30')
    and '%Y/%m/%d %H:%M:%S' (e.g. '2017/12/25 08:30:00').  The time of day
    is discarded by toordinal().

    :param date_str: timestamp string in one of the supported formats
    :return: integer day ordinal
    :raises ValueError: if the string matches none of the formats
    """
    # Local name `dt` avoids shadowing the `date` class imported above.
    for fmt in ('%d-%m-%y %H:%M', '%Y/%m/%d %H:%M:%S'):
        try:
            dt = datetime.strptime(date_str, fmt)
            return dt.toordinal()
        except ValueError:
            pass
    raise ValueError("Unsupported date format: %s" % date_str)
# no empty value when parsing | |
# Input data: CSV with columns timestamp, galery level, atmospheric
# pressure, and their difference.  No empty values are expected.
input_file = 'data-ht-7.csv'
timestamp, galery, atm, diff = np.loadtxt(input_file, delimiter=',',
                                          dtype=float, unpack=True,
                                          converters={0: date2int})

# Per-dataset tuning: dataset digit -> (MAXLEN, MINLEN, DELTA, start).
# NOTE(review): matching a single digit anywhere in the file name is
# fragile (e.g. '1' would also match 'data-ht-12.csv'); the original
# elif chain's first-match order is preserved here.
_DATASET_PARAMS = {
    '1': (300, 150, 200, 0),
    '2': (300, 150, 150, 0),
    '5': (300, 150, 50, 1),
    '6': (300, 100, 100, 0),
    '7': (300, 100, 100, 1),
}
for _key in ('1', '2', '5', '6', '7'):
    if _key in input_file:
        MAXLEN, MINLEN, DELTA, start = _DATASET_PARAMS[_key]
        break
# Order the data to extract the production and pumping cycles.
# Concatenate cycles c1, c2, c3 when len(c2) < EPS and Delta(c2) < x, or when all elements of c2 lie within [0.9, 1.1] * (mean of the 10 last elements).
def concatenation_cycles(cycles, label_cycles):
    """Merge spuriously short cycles into their neighbours.

    A cycle shorter than 8 samples is treated as noise: it is appended to
    the previous kept cycle, together with the cycle that follows it, so
    the three form a single cycle again.

    NOTE(review): the loop stops at len(cycles) - 1, so the last cycle is
    always discarded (presumably because it may be incomplete) — confirm.
    Labels of merged cycles are not removed, so label_cycles may end up
    longer than the returned cycle list.

    :param cycles: list of lists of measurements, one inner list per cycle
    :param label_cycles: list of timestamps, one per cycle
    :return: tuple (new_cycles, new_label_cycles)
    """
    new_cycles = [cycles[0]]
    new_label_cycles = [label_cycles[0]]
    i = 1
    while i < len(cycles) - 1:
        if len(cycles[i]) < 8:
            # Too short: fold this cycle and the next one into the
            # previous kept cycle.
            print("Need to merge cycles")
            print("Merge the current cycle with the previous one.")
            print(cycles[i])
            new_cycles[-1].extend(cycles[i])
            print("Also merge the next one.")
            new_cycles[-1].extend(cycles[i + 1])
            i += 2
        else:
            new_cycles.append(cycles[i])
            new_label_cycles.append(label_cycles[i])
            i += 1
    return new_cycles, new_label_cycles
def cut_data(galery, timestamp):
    """Split the head measurements into alternating rising/falling cycles.

    A cycle ends when the signal leaves a 1% band around the running
    reference `ref` in the current direction (`asc`); the reference moves
    only when a new extremum in the current direction is observed.

    :param galery: sequence of head values (at least 2 elements)
    :param timestamp: sequence of timestamps, same length as galery
    :return: (cycles, labels) as post-processed by concatenation_cycles()
    """
    galery_cycles = []
    label_cycles = [timestamp[0]]
    ref = galery[0]
    cycle = []
    asc = galery[1] > galery[0]
    for i in range(len(galery)):
        value = galery[i]
        # Stay in the current cycle while the value keeps tracking ref
        # within a 1% tolerance in the current direction.
        if (value >= 0.99 * ref and asc) or (value <= 1.01 * ref and not asc):
            cycle.append(value)
        else:
            # Direction change: close the current cycle, start a new one.
            # BUGFIX: the original tested `cycle is not []`, which is an
            # identity comparison against a fresh literal and is always
            # True; the intent was clearly a non-emptiness check.
            if cycle:
                galery_cycles.append(cycle)
                label_cycles.append(timestamp[i])
            cycle = [value]
            asc = not asc
        # Replace ref only when the new value is a better extremum.
        if (asc and ref < value) or (not asc and ref > value):
            ref = value
    if cycle:  # append the trailing cycle
        galery_cycles.append(cycle)
    return concatenation_cycles(galery_cycles, label_cycles)
def create_corresponding_time(cycles):
    """Return, for each cycle, an integer abscissa array 0..len(cycle)-1."""
    return [np.arange(len(single_cycle)) for single_cycle in cycles]
# Head difference between the galery level and atmospheric pressure.
ht = np.array(galery) - np.array(atm)
cycles, label_cycles = cut_data(ht, timestamp)
# Parenthesized print: identical behavior on Python 2, valid on Python 3.
print(label_cycles)
times = create_corresponding_time(cycles)

# Figure 1: the whole dataset.
plt.figure(1)
plt.plot(np.array(range(1, len(ht) + 1)), ht, label="Ht")
####### ### ### ####### | |
## ## ## ## ## ## ## ## | |
## ## ## ## ## ## | |
####### ## ## ## ## ####### | |
## ######### ######### ## | |
## ## ## ## ## ## ## ## | |
####### ## ## ## ## ####### | |
# Control theory method: for a first-order step response, the output
# reaches ~63%, ~87% and ~95% of its final value after 1T, 2T and 3T
# respectively (T = time constant).  Each level yields an estimate of 1/T.
percentages = {95:3, 87:2, 63:1}  # threshold % -> number of time constants
coef = {95:[], 87:[], 63:[]}  # per-threshold estimates of 1/T
coefMerged = []  # all 1/T estimates pooled across thresholds
coefMeans = []  # mean estimate per threshold
coefStd = []  # standard deviation per threshold
# Every other cycle (starting at `start`) is a recovery cycle to analyse.
for i in range(start,len(cycles),2) :
    print("Cycle {}".format(i))
    cycle = cycles[i]
    # Keep only cycles that are long enough and swing widely enough.
    if len(cycle) > MINLEN and max(cycle) - min(cycle) > DELTA :
        maxCycle = max(cycle)
        minCycle = min(cycle)
        rangeCycle = maxCycle - minCycle
        for p in percentages:
            ratio = percentages[p]
            # First, find the threshold at 95, 87 or 63% of the range.
            threshold = p * rangeCycle / 100
            # The target is minCycle (the minimum value in the cycle) + threshold.
            # The lambda finds the closest actual value in the list; its index
            # gives the abscissa of that value.  Since all cycles begin at
            # zero, this abscissa is the T, 2T or 3T of our system.
            time_carac = cycle.index(min(cycle, key=lambda x:abs(minCycle + threshold-x))) / ratio
            if time_carac > 0:
                coef[p].append(1/time_carac)
# Aggregate the per-threshold estimates and report.
for c in coef:
    coefMeans.append(np.mean(coef[c]))
    coefStd.append(np.std(coef[c]))
    print("{} mean: {}+/-{}".format(c, coefMeans[-1], coefStd[-1]))
    for i in coef[c]:
        coefMerged.append(i)
print("SAAS method: {}+/-{}".format(np.mean(coefMeans), np.std(coefMerged)))
# Compute the ln | |
def compute_ln(ht, eps=1e-7, logeps=-2):
    """Log-linearise a recovery cycle.

    Computes ln(|(h - hf + eps) / (h0 - hf)|), where h0 and hf are the
    min and max of the cycle, plus the index of the first sample whose
    log value drops below `logeps` (the caller truncates there).

    The defaults mirror the module-level EPS and LOGEPS constants; they
    are parameters so the function is self-contained and tunable.

    :param ht: numpy array (or list of numpy scalars) of head values
    :param eps: numerical guard avoiding log(0)
    :param logeps: truncation threshold on the log curve
    :return: (log_ht, idx); idx is 0 when no sample falls below `logeps`
    """
    h0 = min(ht)
    hf = max(ht)
    log_ht = np.log(abs((ht - hf + eps) / (h0 - hf)))
    # argmax returns the first True; it also returns 0 when the condition
    # never holds — the caller treats idx == 0 as "do not truncate".
    idx = np.argmax(log_ht < logeps)
    return log_ht, idx
####### ## ##### ######## ######### ####### | |
## ## ## ## ## ## ## ## ## ## | |
## ## ## ## ## ## ## ## | |
####### ## ## ## ####### ###### ####### | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## ## ## | |
####### ######### ##### ## ######### ####### | |
# Per-cycle results of the log-linearisation method, filled by the loop below.
ln_cycles = []  # truncated log curves
len_cycles = []  # length of each kept log curve
times_ln = []  # abscissa array of each kept log curve
slopes = []  # fitted slope of each kept cycle
intercepts = []  # fitted intercept of each kept cycle
std_errors = []  # standard error of each fit
date_strings = []  # start date of each kept cycle
# Non-discarded cycles
valid_cycles = []
# Tableau-style palette so each plotted cycle gets a distinct colour.
colors = ["#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"]
fig, ax = plt.subplots()
ax.set_prop_cycle('color', colors)
# Every other cycle (starting at `start`) is a recovery cycle:
# log-linearise it, fit a line, and keep the fit if it passes the filters.
for i in range(start,len(cycles),2) :
    cycle = cycles[i]
    # Filter 1: cycle must be long enough and swing widely enough.
    if len(cycle) > MINLEN and max(cycle) - min(cycle) > DELTA :
        date_string = date.fromordinal(int(label_cycles[i]))
        time_ln = times[i]
        # Compute the log curve and where it first crosses LOGEPS.
        ln, idx = compute_ln(cycle)
        if idx != 0 : # otherwise array is empty
            # Truncate everything at the LOGEPS crossing.
            ln = ln[:idx]
            time_ln = time_ln[:idx]
            cycle = cycle[:idx]
            # Cap the cycle at MAXLEN samples.
            if len(cycle) > MAXLEN :
                ln = ln[:MAXLEN]
                time_ln = time_ln[:MAXLEN]
                cycle = cycle[:MAXLEN]
            r_value = 0
            offset = 0
            # Trim samples off the tail until the linear fit is good
            # (|R| >= 0.99) or almost nothing remains.
            while abs(r_value) < 0.99 and offset < len(ln) - 1:
                offset += 1
                slope, intercept, r_value, p_value, std_err = linregress(time_ln[:-offset], ln[:-offset])
            print("Playtime is over --------------------")
            # Keep only the portion the fit converged on.
            time_ln = time_ln[:-offset]
            ln = ln[:-offset]
            cycle = cycle[:-offset]
            # Filter 2: the trimmed cycle must still exceed DELTA.
            if max(cycle) - min(cycle) > DELTA:
                # Refit on the trimmed data (same span as the last loop fit).
                slope, intercept, r_value, p_value, std_err = linregress(time_ln, ln)
                print("slope R value: {}".format(r_value))
                print("Dataset length: {}".format(len(ln)))
                print("Slope: {}, date: {}".format(slope, date_string))
                # Save the results for the outlier filtering step below.
                valid_cycles.append(cycle)
                ln_cycles.append(ln)
                len_cycles.append(len(ln))
                times_ln.append(time_ln)
                slopes.append(slope)
                std_errors.append(std_err)
                intercepts.append(intercept)
                date_strings.append(date_string)
            else:
                print("Discarding cycle from {}, delta too low: {}/{}".format(date_string, max(cycle) - min(cycle), DELTA))
    else:
        print("Dropping a cycle of length {} and delta {}".format(len(cycle), max(cycle) - min(cycle)))
######### ######## ## ########## ######### ######## | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## | |
###### ## ## ## ###### ######## | |
## ## ## ## ## ## ## | |
## ## ## ## ## ## ## | |
## ######## ######### ## ######### ## ## | |
# Now that everything has been processed, look for abnormalities and
# discard them: any slope further than two standard deviations from the
# mean is rejected; the remaining cycles are plotted.
slopeMean = np.mean(slopes)  # mean slope
slopeStd = np.std(slopes)  # slope standard deviation
print("Discarding abnormalities. Original mean and std: {}, {}".format(slopeMean, slopeStd))
# If a slope is further than twice the stdev from the mean, throw it the fuck away.
slopesCpy = slopes[:]  # iterate over a frozen copy; mutate the live lists
idxSlopes = 0  # index into the (shrinking) live lists
for i in range(len(slopesCpy)):
    if abs(slopeMean - slopesCpy[i]) > 2 * slopeStd:
        print("Slope is too shitty. Slope: {} from {}, mean: {}, stdev: {}".format(slopesCpy[i], date_strings[idxSlopes], slopeMean, slopeStd))
        del slopes[idxSlopes]
        del std_errors[idxSlopes]
        del intercepts[idxSlopes]
        del ln_cycles[idxSlopes]
        del len_cycles[idxSlopes]
        del date_strings[idxSlopes]
        # BUGFIX: times_ln and valid_cycles were not trimmed here, so
        # after a deletion the plots below used data from the wrong cycle.
        del times_ln[idxSlopes]
        del valid_cycles[idxSlopes]
    else:
        # Figure 2: fitted regression lines; figure 3: the raw cycles.
        plt.figure(2)
        plt.plot(times_ln[idxSlopes], slopes[idxSlopes] * times_ln[idxSlopes] + intercepts[idxSlopes], linestyle='--', label="AL " + str(date_strings[idxSlopes]))
        plt.legend()
        plt.figure(3)
        plt.plot(times_ln[idxSlopes], valid_cycles[idxSlopes], label="Cycle " + str(date_strings[idxSlopes]))
        # If we do not delete an element from the lists, we can increment
        # the index.  Otherwise we don't.
        idxSlopes += 1
plt.legend()
# BUGFIX: the format placeholder was missing, so the lengths never printed.
print("Cycles length: {}".format(len_cycles))
print("Slopes: {}".format(slopes))
print("Mean slope: {}".format(-1 * np.mean(slopes)))
print("Std error : " + str(np.std(slopes)))
print("std/mean: {}".format(-1 * np.std(slopes) / np.mean(slopes)))
print("Min slope : " + str(min(slopes)))
print("Max slope : " + str(max(slopes)))
plt.show()
# TODO
# If we discard too many values in a cycle, trash it: max(cycle) - min(cycle) > DELTA still needs to hold.
# Add a feature allowing to tune the time between two measures. By default it is 60 seconds, but this value could change.
# Add the time of day to the date string.
# Exact coefficients: 63.2, 86.5, 95.
# Linear approximation to determine the correct abscissa for the SAAS coefficients.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment