LukasMosser/k_fold_training_seg

## k_fold_training_seg
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from pandas import set_option
from sklearn import preprocessing

"""
    More or less unchanged code from original contest notebook.
    Changes:
        Removed dropping any wells
"""
def label_facies(row, labels):
    """Look up the text label for the facies code stored in a row.

    ``row['Facies']`` is treated as a 1-based code, so it is shifted down
    by one before being used as an index into ``labels``.
    """
    position = row['Facies'] - 1
    return labels[position]

# Display / warning configuration: show at most 10 rows when a DataFrame is
# printed and switch off pandas' chained-assignment warning.
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

filename = 'facies_vectors.csv'
training_data = pd.read_csv(filename)

training_data['Well Name'] = training_data['Well Name'].astype('category')
training_data['Formation'] = training_data['Formation'].astype('category')
# Bare expression: its result is discarded unless it is the last line of an
# interactive cell.
training_data['Well Name'].unique()

# 1=sandstone  2=c_siltstone   3=f_siltstone
# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
# 8=packstone 9=bafflestone
facies_colors = [
    '#F4D03F', '#F5B041', '#DC7633', '#6E2C00',
    '#1B4F72', '#2E86C1', '#AED6F1', '#A569BD', '#196F3D',
]

facies_labels = [
    'SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
    'WS', 'D', 'PS', 'BS',
]

# facies_color_map is a dictionary that maps facies labels to their
# respective colors; labels and colors are paired by position.
facies_color_map = dict(zip(facies_labels, facies_colors))

# Add a text label column derived from the numeric 'Facies' code of each row.
training_data.loc[:, 'FaciesLabels'] = training_data.apply(
    lambda row: label_facies(row, facies_labels), axis=1)
training_data.describe()

# Keep only the rows whose 'PE' value is not null.
PE_mask = training_data['PE'].notnull().values
training_data = training_data[PE_mask]

correct_facies_labels = training_data['Facies'].values

"""
    End of original tutorial code
"""

# Collect the distinct well names found in the training data.
names = list(set(training_data["Well Name"]))

# Build a dictionary of per-well datasets, continuing from the original
# contest notebook, but drop the non-feature columns for each well
# individually (this may not be strictly necessary).
non_feature_columns = ['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels']

well_datas = {}
for name in names:
    belongs_to_well = training_data["Well Name"] == name
    well = training_data[belongs_to_well]
    # Grab the labels before the 'Facies' column is dropped below.
    well_labels = well['Facies'].values.astype(np.int64)
    well = well.drop(non_feature_columns, axis=1).values
    well_datas[name] = [well, well_labels]


# Split the per-well [features, labels] pairs into two dictionaries keyed by
# well name, converting each array to an explicit dtype.
X_data = {}
Y_data = {}
# dict.items() exists on both Python 2 and Python 3, whereas dict.iteritems()
# is Python 2 only.
for name, (data, labels) in well_datas.items():
    Y_data[name] = np.array(labels, dtype=np.int64)
    X_data[name] = np.array(data, dtype=np.float32)

# Leave-one-well-out splits: every fold holds one well out as the test set
# and trains on all remaining wells.
training_sets = []
test_sets = []  # never filled below; kept so the name stays defined

for held_out_name in names:
    X_train = []
    Y_train = []

    X_test = []
    Y_test = []

    # Walk the wells once and fetch the labels by well name, so features and
    # labels are paired by key instead of by two dicts iterating in the
    # same order.
    for name, data in X_data.items():
        labels = Y_data[name]
        # Compare well names by value: an identity test ("is not") on strings
        # only gives the right answer when both sides are the same object.
        if name != held_out_name:
            X_train.extend(data)
            Y_train.extend(labels)
        else:
            X_test.extend(data)
            Y_test.extend(labels)

    X_train = np.array(X_train, dtype=np.float32)
    Y_train = np.array(Y_train, dtype=np.int64).reshape(len(Y_train), 1)

    # Fit the scaler on the training wells only, then apply that same
    # transformation to the held-out well.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    X_test = np.array(X_test, dtype=np.float32)
    X_test = scaler.transform(X_test)

    # NOTE(review): Y_test is a flat int32 array while Y_train is an int64
    # column vector -- confirm this asymmetry is intended.
    Y_test = np.array(Y_test, dtype=np.int32)
    training_sets.append([X_train, Y_train, X_test, Y_test])

#Use as follows:
scores = []
for i, (X_train, Y_train, X_test, Y_test) in enumerate(training_sets):
    #classifier = some_classifier()
    #classifier.train(X_train, Y_train)
    #Y_Predict = classifier.predict(X_test)
    #Scoring
    #scores.append(score)
    # A single pre-formatted argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s %s %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape))
#print(np.mean(scores))
	%matplotlib inline
	import pandas as pd
	import numpy as np
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import matplotlib.colors as colors
	from mpl_toolkits.axes_grid1 import make_axes_locatable
	from pandas import set_option
	from sklearn import preprocessing

	"""
	More or less unchanged code from original contest notebook.
	Changes:
	Removed dropping any wells
	"""
	def label_facies(row, labels):
		"""Look up the text label for the facies code stored in a row.

		``row['Facies']`` is treated as a 1-based code, so it is shifted down
		by one before being used as an index into ``labels``.
		"""
		return labels[row['Facies'] - 1]

	# Display / warning configuration: show at most 10 rows when a DataFrame is
	# printed and switch off pandas' chained-assignment warning.
	set_option("display.max_rows", 10)
	pd.options.mode.chained_assignment = None

	filename = 'facies_vectors.csv'
	training_data = pd.read_csv(filename)

	training_data['Well Name'] = training_data['Well Name'].astype('category')
	training_data['Formation'] = training_data['Formation'].astype('category')
	# Bare expression: its result is discarded unless it is the last line of an
	# interactive cell.
	training_data['Well Name'].unique()

	# 1=sandstone 2=c_siltstone 3=f_siltstone
	# 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite
	# 8=packstone 9=bafflestone
	facies_colors = [
		'#F4D03F', '#F5B041', '#DC7633', '#6E2C00',
		'#1B4F72', '#2E86C1', '#AED6F1', '#A569BD', '#196F3D',
	]

	facies_labels = [
		'SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
		'WS', 'D', 'PS', 'BS',
	]

	# facies_color_map is a dictionary that maps facies labels to their
	# respective colors; labels and colors are paired by position.
	facies_color_map = dict(zip(facies_labels, facies_colors))

	# Add a text label column derived from the numeric 'Facies' code of each row.
	training_data.loc[:, 'FaciesLabels'] = training_data.apply(
		lambda row: label_facies(row, facies_labels), axis=1)
	training_data.describe()

	# Keep only the rows whose 'PE' value is not null.
	PE_mask = training_data['PE'].notnull().values
	training_data = training_data[PE_mask]

	correct_facies_labels = training_data['Facies'].values

	"""
	End of original tutorial code
	"""

	# Collect the distinct well names found in the training data.
	names = list(set(training_data["Well Name"]))

	# Build a dictionary of per-well datasets, continuing from the original
	# contest notebook, but drop the non-feature columns for each well
	# individually (this may not be strictly necessary).
	well_datas = {}
	for name in names:
		well = training_data[training_data["Well Name"] == name]
		# Grab the labels before the 'Facies' column is dropped below.
		well_labels = well['Facies'].values.astype(np.int64)
		well = well.drop(['Formation', 'Well Name', 'Depth', 'Facies', 'FaciesLabels'], axis=1).values
		well_datas[name] = [well, well_labels]


	# Split the per-well [features, labels] pairs into two dictionaries keyed by
	# well name, converting each array to an explicit dtype.
	X_data = {}
	Y_data = {}
	# dict.items() exists on both Python 2 and Python 3, whereas
	# dict.iteritems() is Python 2 only.
	for name, (data, labels) in well_datas.items():
		Y_data[name] = np.array(labels, dtype=np.int64)
		X_data[name] = np.array(data, dtype=np.float32)

	# Leave-one-well-out splits: every fold holds one well out as the test set
	# and trains on all remaining wells.
	training_sets = []
	test_sets = []  # never filled below; kept so the name stays defined

	for held_out_name in names:
		X_train = []
		Y_train = []

		X_test = []
		Y_test = []

		# Walk the wells once and fetch the labels by well name, so features
		# and labels are paired by key instead of by two dicts iterating in
		# the same order.
		for name, data in X_data.items():
			labels = Y_data[name]
			# Compare well names by value: an identity test ("is not") on
			# strings only gives the right answer when both sides are the
			# same object.
			if name != held_out_name:
				X_train.extend(data)
				Y_train.extend(labels)
			else:
				X_test.extend(data)
				Y_test.extend(labels)

		X_train = np.array(X_train, dtype=np.float32)
		Y_train = np.array(Y_train, dtype=np.int64).reshape(len(Y_train), 1)

		# Fit the scaler on the training wells only, then apply that same
		# transformation to the held-out well.
		scaler = preprocessing.StandardScaler().fit(X_train)
		X_train = scaler.transform(X_train)

		X_test = np.array(X_test, dtype=np.float32)
		X_test = scaler.transform(X_test)

		# NOTE(review): Y_test is a flat int32 array while Y_train is an
		# int64 column vector -- confirm this asymmetry is intended.
		Y_test = np.array(Y_test, dtype=np.int32)
		training_sets.append([X_train, Y_train, X_test, Y_test])

	#Use as follows:
	scores = []
	for i, (X_train, Y_train, X_test, Y_test) in enumerate(training_sets):
		#classifier = some_classifier()
		#classifier.train(X_train, Y_train)
		#Y_Predict = classifier.predict(X_test)
		#Scoring
		#scores.append(score)
		# A single pre-formatted argument prints the same text under the
		# Python 2 print statement and the Python 3 print function.
		print("%s %s %s %s" % (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape))
	#print(np.mean(scores))