# crearo/fastai-classify-max-in-list.py

## fastai-classify-max-in-list.py
"""
Note: I still need to work on the fastai api more to code this without a tabularlist.
-
I've been doing the fastai course and all the examples put up there are on complicated data - images, text, tables.
I wanted to take a seemingly simple problem of finding the maximum in a list.
I've only been able to get a 94% accuracy with whatever I've learnt so far. I thought it'd be an accurate 100%,
but maybe I'm just not deep enough into the course yet.

Note: I originally wanted to treat this as a regression problem rather than classification, but I wasn't able to do
that with the fastai API. There's this `FloatList` option, but I can't seem to get it to work.

Any help would be great!
"""

from fastai.tabular import *


def get_data_and_labels(n):
    """Generate ``n`` random rows of 10 integers plus the argmax of each row.

    Every value is drawn as ``int(random.random() * 100) - 50``, i.e. an
    integer in the range [-50, 49].

    Returns:
        A ``(rows, targets)`` tuple. ``rows`` is a list of ``n`` lists of 10
        ints. ``targets`` is a list of ``n`` one-element lists, each holding
        the index of the first occurrence of the row's maximum.
    """
    # Rows are generated one after another so the sequence of random draws
    # is consumed row by row.
    rows = [
        [int(random.random() * 100) - 50 for _ in range(10)]
        for _ in range(n)
    ]
    # list.index returns the first match, so ties resolve to the lowest index.
    targets = [[row.index(max(row))] for row in rows]
    return rows, targets


def write_to_file(n):
    """Generate ``n`` labelled rows and store them in ``out.csv``.

    The file begins with a header line naming the columns "1" to "11".
    Every following line holds 11 comma-separated integers: first the label
    (index of the maximum), then the 10 values the label was computed from.
    """
    with open('out.csv', 'w') as out:
        rows, targets = get_data_and_labels(n)
        header = ','.join(map(str, range(1, 12)))
        out.write(header)
        out.write('\n')
        for target, row in zip(targets, rows):
            values = ','.join(map(str, row))
            # Each target is a one-element list; write its single entry.
            out.write('%d,%s\n' % (target[0], values))

# Generate the synthetic dataset on disk (out.csv) before loading it back.
write_to_file(1000)

path = Path('./')
df = pd.read_csv(path / 'out.csv')

# Column "1" holds the label; columns "2".."11" hold the ten input values.
dep_var = "1"
cont_names = [str(col) for col in range(2, 12)]
procs = [Normalize, Categorify, FillMissing]

# Rows 800-999 are held out for validation; the same rows are reused as the
# test set attached to the databunch below.
valid_idx = list(range(800, 1000))
test = TabularList.from_df(
    df.iloc[800:1000].copy(),
    path=path,
    cat_names=None,
    cont_names=cont_names,
)

# Labels are treated as classes. The fastai docs suggest passing
# label_cls=FloatList (optionally with log=True) to force regression, but the
# author could not get that to work, hence classification.
data = (
    TabularList.from_df(df, path=path, cat_names=None, cont_names=cont_names, procs=procs)
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var)
    .add_test(test)
    .databunch()
)

# Two hidden layers (200 and 100 units), tracked with the accuracy metric.
learn = tabular_learner(data, layers=[200, 100], metrics=accuracy)
learn.fit_one_cycle(10, 3e-2)

## keras-classify-max-between-two.py
"""
The same classifier but in keras.
Keras doesn't do the abstract-things-away nonsense that fastai was doing, and so I know exactly what I've written.
But, this still doesn't give me an accuracy of 100%. And I don't quite know why yet.
"""

import random
import keras
import numpy as np
from keras import Sequential
from keras.layers import Dense


def get_data_and_labels(n):
    """Build ``n`` random integer pairs and the position of the larger value.

    Both entries of a pair are drawn with ``random.randint(-50, 50)``. The
    label is the index of the first occurrence of the pair's maximum, so a
    tie is labelled 0.

    Returns:
        A ``(pairs, targets)`` tuple of numpy arrays built from a list of
        ``n`` two-element lists and a list of ``n`` one-element lists.
    """
    pairs = [
        [random.randint(-50, 50), random.randint(-50, 50)]
        for _ in range(n)
    ]
    targets = [[pair.index(max(pair))] for pair in pairs]
    return np.array(pairs), np.array(targets)


# Draw independent random training and validation sets (train first, then
# validation, so the random stream is consumed in the same order as before).
X_train, Y_train = get_data_and_labels(1000)
X_valid, Y_valid = get_data_and_labels(500)

dims = X_train.shape[1]
# Every label is a column index of its own row, so the number of classes
# equals the number of input columns.
nb_classes = dims
print(nb_classes, "classes")

# Pin the one-hot width explicitly. Without num_classes, to_categorical infers
# the width from the largest label present in each array, so the training and
# validation encodings (and the model's output size) could disagree.
Y_train = keras.utils.to_categorical(Y_train, num_classes=nb_classes)
Y_valid = keras.utils.to_categorical(Y_valid, num_classes=nb_classes)

# A single dense layer with softmax output; the activation is applied by the
# Dense layer itself, so no separate Activation layer is added.
model = Sequential()
model.add(Dense(nb_classes, input_shape=(dims,), activation='softmax'))

model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=10, validation_data=(X_valid, Y_valid))

# Smoke test: print the predicted class probabilities for the pair (0, 0.1).
print('test!')
print(model.predict(np.array([[0., .1]])))
	"""
	Note: I still need to work on the fastai api more to code this without a tabularlist.
	-
	I've been doing the fastai course and all the examples put up there are on complicated data - images, text, tables.
	I wanted to take a seemingly simple problem of finding the maximum in a list.
	I've only been able to get a 94% accuracy with whatever I've learnt so far. I thought it'd be an accurate 100%,
	but maybe I'm just not deep enough into the course yet.

	Note: I originally wanted to treat this as a regression problem rather than classification, but I wasn't able to do
	that with the fastai API. There's this `FloatList` option, but I can't seem to get it to work.

	Any help would be great!
	"""

	from fastai.tabular import *


	def get_data_and_labels(n):
		"""Generate ``n`` random rows of 10 integers plus the argmax of each row.

		Every value is ``-50 + int(100 * random.random())``, i.e. an integer in
		the range [-50, 49].

		Returns:
			A ``(data, labels)`` tuple. ``data`` is a list of ``n`` lists of 10
			ints. ``labels`` is a list of ``n`` one-element lists, each holding
			the index of the first occurrence of the row's maximum.
		"""
		data = []
		labels = []
		for _ in range(n):
			row = [-50 + int(100 * random.random()) for _ in range(10)]
			data.append(row)
			# list.index returns the first match, so ties pick the lowest index.
			labels.append([row.index(max(row))])
		return data, labels


	def write_to_file(n):
		"""Generate ``n`` labelled rows and store them in ``out.csv``.

		The file begins with a header line naming the columns "1" to "11".
		Every following line holds 11 comma-separated integers: first the label
		(index of the maximum), then the 10 values the label was computed from.
		"""
		with open('out.csv', 'w') as f:
			data, labels = get_data_and_labels(n)
			f.write(','.join(str(x) for x in range(1, 12)))
			f.write('\n')
			for label, row in zip(labels, data):
				# Each label is a one-element list; write its single entry.
				f.write('%d,%s\n' % (label[0], ','.join(str(x) for x in row)))

	# Generate the synthetic dataset on disk (out.csv) before loading it back.
	write_to_file(1000)

	path = Path('./')
	df = pd.read_csv(path / 'out.csv')

	# Column "1" holds the label; columns "2".."11" hold the ten input values.
	dep_var = "1"
	cont_names = [str(col) for col in range(2, 12)]
	procs = [Normalize, Categorify, FillMissing]

	# Rows 800-999 are held out for validation; the same rows are reused as the
	# test set attached to the databunch below.
	valid_idx = list(range(800, 1000))
	test = TabularList.from_df(
		df.iloc[800:1000].copy(),
		path=path,
		cat_names=None,
		cont_names=cont_names,
	)

	# Labels are treated as classes. The fastai docs suggest passing
	# label_cls=FloatList (optionally with log=True) to force regression, but the
	# author could not get that to work, hence classification.
	data = (
		TabularList.from_df(df, path=path, cat_names=None, cont_names=cont_names, procs=procs)
		.split_by_idx(valid_idx)
		.label_from_df(cols=dep_var)
		.add_test(test)
		.databunch()
	)

	# Two hidden layers (200 and 100 units), tracked with the accuracy metric.
	learn = tabular_learner(data, layers=[200, 100], metrics=accuracy)
	learn.fit_one_cycle(10, 3e-2)
	"""
	The same classifier but in keras.
	Keras doesn't do the abstract-things-away nonsense that fastai was doing, and so I know exactly what I've written.
	But, this still doesn't give me an accuracy of 100%. And I don't quite know why yet.
	"""

	import random
	import keras
	import numpy as np
	from keras import Sequential
	from keras.layers import Dense


	def get_data_and_labels(n):
		"""Build ``n`` random integer pairs and the position of the larger value.

		Both entries of a pair are drawn with ``random.randint(-50, 50)``. The
		label is the index of the first occurrence of the pair's maximum, so a
		tie is labelled 0.

		Returns:
			A ``(data, labels)`` tuple of numpy arrays built from a list of
			``n`` two-element lists and a list of ``n`` one-element lists.
		"""
		data = []
		labels = []
		for _ in range(n):
			pair = [random.randint(-50, 50), random.randint(-50, 50)]
			data.append(pair)
			labels.append([pair.index(max(pair))])
		return np.array(data), np.array(labels)


	# Draw independent random training and validation sets (train first, then
	# validation, so the random stream is consumed in the same order as before).
	X_train, Y_train = get_data_and_labels(1000)
	X_valid, Y_valid = get_data_and_labels(500)

	dims = X_train.shape[1]
	# Every label is a column index of its own row, so the number of classes
	# equals the number of input columns.
	nb_classes = dims
	print(nb_classes, "classes")

	# Pin the one-hot width explicitly. Without num_classes, to_categorical infers
	# the width from the largest label present in each array, so the training and
	# validation encodings (and the model's output size) could disagree.
	Y_train = keras.utils.to_categorical(Y_train, num_classes=nb_classes)
	Y_valid = keras.utils.to_categorical(Y_valid, num_classes=nb_classes)

	# A single dense layer with softmax output; the activation is applied by the
	# Dense layer itself, so no separate Activation layer is added.
	model = Sequential()
	model.add(Dense(nb_classes, input_shape=(dims,), activation='softmax'))

	model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
	model.fit(X_train, Y_train, epochs=10, validation_data=(X_valid, Y_valid))

	# Smoke test: print the predicted class probabilities for the pair (0, 0.1).
	print('test!')
	print(model.predict(np.array([[0., .1]])))