terjehaukaas/VariableInference.py

## VariableInference.py
# ------------------------------------------------------------------------
# The following Python code is implemented by Professor Terje Haukaas at
# the University of British Columbia in Vancouver, Canada. It is made
# freely available online at terje.civil.ubc.ca together with notes,
# examples, and additional Python code. Please be cautious when using
# this code; it may contain bugs and comes without warranty of any form.
# ------------------------------------------------------------------------


# Import useful libraries
import numpy as np
from scipy.stats import norm, lognorm, uniform
import matplotlib.pyplot as plt


# Input data (realization of one random variable in each column)
data = np.array([(146517, 205.4),
                 (153074.5, 204.5),
                 (132614.9, 190.9),
                 (149219.6, 203.6),
                 (141182.6, 196.3),
                 (142407.4, 191.1),
                 (143403.5, 192.8),
                 (135465.1, 191.3),
                 (154740.3, 204.5),
                 (133172.4, 187.9),
                 (138539.4, 193.3),
                 (149498.5, 201.2),
                 (143589.6, 201.2),
                 (152134.4, 203.0),
                 (128544.3, 195.5),
                 (147239.6, 203.0),
                 (151755.2, 194.0),
                 (136670.6, 198.0),
                 (173146.0, 214.6),
                 (146297.5, 203.4),
                 (148197.5, 203.9),
                 (160348.1, 212.7),
                 (158164.2, 211.5),
                 (143375.3, 194.0),
                 (156667.0, 206.0),
                 (139503.7, 188.4),
                 (138317.7, 197.4),
                 (149375.6, 200.8),
                 (162411.9, 214.1),
                 (143337.1, 200.3),
                 (118184.1, 181.9),
                 (155273.8, 209.6),
                 (148961.0, 200.6),
                 (146463.4, 192.3),
                 (145491.6, 198.7),
                 (146639.7, 196.5),
                 (145159.5, 207.4),
                 (141255.9, 197.4),
                 (133833.3, 196.4),
                 (151665.2, 197.3),
                 (133124.5, 195.0),
                 (171142.9, 213.1),
                 (163021.5, 209.7),
                 (152944.9, 207.8),
                 (150891.1, 208.5),
                 (141719.6, 200.2),
                 (122068.6, 178.9),
                 (142461.7, 196.2),
                 (131519.7, 196.3),
                 (165271.3, 209.3)])


# Get number of random variables (numRVs) and number of observations (N)
N = data.shape[0]
numRVs = len(data[0])


# Initialize vectors and matrices
variableVector = np.zeros(N)
R = np.eye(numRVs)
means = []
stdvs = []
plotCounter = 0


# Loop over the random variables, i.e., the columns of the data
for j in range(numRVs):

    # Collect the realizations for this random variable
    for i in range(N):
        variableVector[i] = data[i, j]


    # Sort the elements of the vector
    variableVector.sort()


    # Compute the sum, the sum squared, and record points for the cumulative frequency diagram
    vectorSum = 0.0
    vectorSumSquared = 0.0
    xValues = []
    yValues = []
    for i in range(N):
        value = variableVector[i]
        vectorSum += value
        vectorSumSquared += value * value
        xValues.append(value)
        yValues.append((i + 1) / (float (N)))


    # Compute mean, variance, and standard deviation
    mean = vectorSum / (float (N))
    means.append(mean)
    variance = 1.0 / (float(N) - 1.0) * (vectorSumSquared - float(N) * mean*mean)
    stdv = np.sqrt(variance)
    stdvs.append(stdv)


    # Print results to the screen
    print('\n'"Variable %i: Mean=%.2f, Stdv=%.2f, cov=%.2f percent" % (j+1, mean, stdv, 100.0 * stdv / mean))


    # Get min and max realizations and lay out the x-axis
    vectorMin = np.min(variableVector)
    vectorMax = np.max(variableVector)
    myStep = (vectorMax-vectorMin)/30.0
    x = np.arange(start=vectorMin, stop=vectorMax+0.01*myStep, step=myStep)


    # Normal distribution
    normalCDF = norm.cdf(x, mean, stdv)

    # Lognormal distribution
    meanY = np.log(mean) - 0.5 * np.log(1.0 + (stdv / mean) ** 2)
    stdvY = np.sqrt(np.log((stdv / mean) ** 2 + 1.0))
    lognormalCDF = lognorm.cdf(x, stdvY, 0.0, np.exp(meanY))


    # Plot the cumulative frequency diagram and CDFs
    plotCounter += 1
    plt.ion()
    plt.figure(plotCounter)
    plt.plot(xValues, yValues, 'k-', label='Data')
    plt.plot(x, normalCDF, 'b-', label='Normal')
    plt.plot(x, lognormalCDF, 'r-', label='Lognormal')
    plt.title('Cumulative Curves')
    plt.legend(loc='upper left')
    plt.show()
    print('\n'"Press any key to continue...")
    plt.waitforbuttonpress()


# Compute correlation matrix
if (numRVs > 1):


    # Initialize vectors and the correlation matrix
    variableVectori = np.zeros(N)
    variableVectorj = np.zeros(N)
    R = np.eye(numRVs)

    for i in range(numRVs):

        # Pick up the i'th vector
        for k in range(N):
            variableVectori[k] = data[k, i]

        for j in range(i):

            # Pick up the j'th vector
            for k in range(N):
                variableVectorj[k] = data[k, j]

            # Create scatter plot
            plotCounter += 1
            plt.ion()
            plt.figure(plotCounter)
            plt.plot(variableVectori, variableVectorj, 'ko')
            plt.title('Scatter Diagram')
            plt.show()
            print('\n'"Press any key to continue...")
            plt.waitforbuttonpress()


            # Compute the correlation coefficient
            productSum = 0.0
            for k in range(N):
                productSum += variableVectori[k] * variableVectorj[k]

            value = 1.0 / (float(N) - 1.0) * (productSum - N * means[i] * means[j]) / (stdvs[i] * stdvs[j])
            R[i, j] = value
            R[j, i] = value


print('\n'"The correlation matrix is:")
print(R)
	# ------------------------------------------------------------------------
	# The following Python code is implemented by Professor Terje Haukaas at
	# the University of British Columbia in Vancouver, Canada. It is made
	# freely available online at terje.civil.ubc.ca together with notes,
	# examples, and additional Python code. Please be cautious when using
	# this code; it may contain bugs and comes without warranty of any form.
	# ------------------------------------------------------------------------


	# Import useful libraries
	import numpy as np
	from scipy.stats import norm, lognorm, uniform
	import matplotlib.pyplot as plt


	# Input data (realization of one random variable in each column)
	data = np.array([(146517, 205.4),
	(153074.5, 204.5),
	(132614.9, 190.9),
	(149219.6, 203.6),
	(141182.6, 196.3),
	(142407.4, 191.1),
	(143403.5, 192.8),
	(135465.1, 191.3),
	(154740.3, 204.5),
	(133172.4, 187.9),
	(138539.4, 193.3),
	(149498.5, 201.2),
	(143589.6, 201.2),
	(152134.4, 203.0),
	(128544.3, 195.5),
	(147239.6, 203.0),
	(151755.2, 194.0),
	(136670.6, 198.0),
	(173146.0, 214.6),
	(146297.5, 203.4),
	(148197.5, 203.9),
	(160348.1, 212.7),
	(158164.2, 211.5),
	(143375.3, 194.0),
	(156667.0, 206.0),
	(139503.7, 188.4),
	(138317.7, 197.4),
	(149375.6, 200.8),
	(162411.9, 214.1),
	(143337.1, 200.3),
	(118184.1, 181.9),
	(155273.8, 209.6),
	(148961.0, 200.6),
	(146463.4, 192.3),
	(145491.6, 198.7),
	(146639.7, 196.5),
	(145159.5, 207.4),
	(141255.9, 197.4),
	(133833.3, 196.4),
	(151665.2, 197.3),
	(133124.5, 195.0),
	(171142.9, 213.1),
	(163021.5, 209.7),
	(152944.9, 207.8),
	(150891.1, 208.5),
	(141719.6, 200.2),
	(122068.6, 178.9),
	(142461.7, 196.2),
	(131519.7, 196.3),
	(165271.3, 209.3)])


	# Get number of random variables (numRVs) and number of observations (N)
	N = data.shape[0]
	numRVs = len(data[0])


	# Initialize vectors and matrices
	variableVector = np.zeros(N)
	R = np.eye(numRVs)
	means = []
	stdvs = []
	plotCounter = 0


	# Loop over the random variables, i.e., the columns of the data
	for j in range(numRVs):

	# Collect the realizations for this random variable
	for i in range(N):
	variableVector[i] = data[i, j]


	# Sort the elements of the vector
	variableVector.sort()


	# Compute the sum, the sum squared, and record points for the cumulative frequency diagram
	vectorSum = 0.0
	vectorSumSquared = 0.0
	xValues = []
	yValues = []
	for i in range(N):
	value = variableVector[i]
	vectorSum += value
	vectorSumSquared += value * value
	xValues.append(value)
	yValues.append((i + 1) / (float (N)))


	# Compute mean, variance, and standard deviation
	mean = vectorSum / (float (N))
	means.append(mean)
	variance = 1.0 / (float(N) - 1.0) * (vectorSumSquared - float(N) * mean*mean)
	stdv = np.sqrt(variance)
	stdvs.append(stdv)


	# Print results to the screen
	print('\n'"Variable %i: Mean=%.2f, Stdv=%.2f, cov=%.2f percent" % (j+1, mean, stdv, 100.0 * stdv / mean))


	# Get min and max realizations and lay out the x-axis
	vectorMin = np.min(variableVector)
	vectorMax = np.max(variableVector)
	myStep = (vectorMax-vectorMin)/30.0
	x = np.arange(start=vectorMin, stop=vectorMax+0.01*myStep, step=myStep)


	# Normal distribution
	normalCDF = norm.cdf(x, mean, stdv)

	# Lognormal distribution
	meanY = np.log(mean) - 0.5 * np.log(1.0 + (stdv / mean) ** 2)
	stdvY = np.sqrt(np.log((stdv / mean) ** 2 + 1.0))
	lognormalCDF = lognorm.cdf(x, stdvY, 0.0, np.exp(meanY))


	# Plot the cumulative frequency diagram and CDFs
	plotCounter += 1
	plt.ion()
	plt.figure(plotCounter)
	plt.plot(xValues, yValues, 'k-', label='Data')
	plt.plot(x, normalCDF, 'b-', label='Normal')
	plt.plot(x, lognormalCDF, 'r-', label='Lognormal')
	plt.title('Cumulative Curves')
	plt.legend(loc='upper left')
	plt.show()
	print('\n'"Press any key to continue...")
	plt.waitforbuttonpress()


	# Compute correlation matrix
	if (numRVs > 1):


	# Initialize vectors and the correlation matrix
	variableVectori = np.zeros(N)
	variableVectorj = np.zeros(N)
	R = np.eye(numRVs)

	for i in range(numRVs):

	# Pick up the i'th vector
	for k in range(N):
	variableVectori[k] = data[k, i]

	for j in range(i):

	# Pick up the j'th vector
	for k in range(N):
	variableVectorj[k] = data[k, j]

	# Create scatter plot
	plotCounter += 1
	plt.ion()
	plt.figure(plotCounter)
	plt.plot(variableVectori, variableVectorj, 'ko')
	plt.title('Scatter Diagram')
	plt.show()
	print('\n'"Press any key to continue...")
	plt.waitforbuttonpress()


	# Compute the correlation coefficient
	productSum = 0.0
	for k in range(N):
	productSum += variableVectori[k] * variableVectorj[k]

	value = 1.0 / (float(N) - 1.0) * (productSum - N * means[i] * means[j]) / (stdvs[i] * stdvs[j])
	R[i, j] = value
	R[j, i] = value


	print('\n'"The correlation matrix is:")
	print(R)