Skip to content

Instantly share code, notes, and snippets.

@caiogranero
Last active June 3, 2018 13:02
Show Gist options
  • Save caiogranero/6863e9621fa99ddb8be3d74e07da5935 to your computer and use it in GitHub Desktop.
Save caiogranero/6863e9621fa99ddb8be3d74e07da5935 to your computer and use it in GitHub Desktop.
neural-network
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil
import os
# ---------------------------------------------------------------------------
# Script configuration
# ---------------------------------------------------------------------------

# Fraction of the rows used for training; the remainder becomes the test set.
# A value of .8 gives an 80/20 split.
training_set_size_portion = .8
# Set to True to shuffle the rows before splitting into training and test
# sets, so that any ordering in the CSV does not leak into the split.
do_shuffle = True
# Placeholder for the accuracy score; it is overwritten after evaluation.
accuracy_score = 0
# Hidden-layer sizes of the DNN: a layer of 10 neurons feeding a layer of
# 20 neurons, which in turn feeds a third layer of 10 neurons.
hidden_units_spec = [10, 20, 10]
# Number of target classes. The label column is reduced to the two values
# 0 and 1 further down in this script, so the classifier must be binary
# (the previous value of 3 declared a class that can never occur).
n_classes_spec = 2
# Directory in which the estimator keeps the model and its checkpoints.
tmp_dir_spec = "tmp/model"
# Upper bound on the number of training steps.
steps_spec = 2000
# Number of passes over the data handed to the input functions.
epochs_spec = 10
# Name of the CSV file to load -- change this if you use another data set.
file_name = "NasaExoplanetDataMay2018.csv"
# Feature columns fed to the network. Each entry is a column name in the CSV.
features = [
    'st_mass',   # stellar mass
    'pl_masse',  # planet mass
    'pl_rade',   # planet radius
]
# The label we want to predict -- also a column name in the CSV.
labels = ['pl_eqt']
# Start from a clean slate: remove whatever a previous run left in ./tmp.
# os.path.isdir() is already False for a path that does not exist, so a
# separate os.path.exists() check adds nothing.
if os.path.isdir("./tmp"):
    shutil.rmtree("./tmp")

# The CSV is expected next to the script and uses ';' as the field separator.
my_data = pd.read_csv('./' + file_name, delimiter=';')

# A DataFrame can be shuffled by reindexing it with a random permutation of
# its own index.
# Docs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html#pandas.DataFrame.reindex
# Shuffling is recommended so that trends in the row order do not affect
# learning, but it stays optional through the do_shuffle flag.
if do_shuffle:
    randomized_data = my_data.reindex(np.random.permutation(my_data.index))
else:
    randomized_data = my_data

# Keep only the columns the model uses and drop every row that has a missing
# value in any of them. The explicit copy makes the result an independent
# frame, so the column assignments that follow modify it directly.
randomized_data = randomized_data[features + labels].dropna().copy()
# Make sure every feature column is numeric.
for column in features:
    randomized_data[column] = randomized_data[column].astype(float)

# Turn each label column into a binary class: 1 when the value lies strictly
# between 273 and 373, 0 otherwise.
# NOTE(review): 273/373 look like the freezing and boiling points of water in
# Kelvin -- confirm the unit of the label column.
# The class is derived from a single boolean mask and written back as a whole
# column. This avoids chained-indexing assignment (df[col][mask] = ...), which
# may write to a temporary copy, and it no longer treats a raw value of
# exactly 1 as a positive label.
for column in labels:
    raw_values = randomized_data[column]
    in_range = (raw_values > 273) & (raw_values < 373)
    randomized_data[column] = in_range.astype(int)
# Work out how many rows go to each side of the split: the training set takes
# the configured share (rounded down) and the test set takes the rest.
total_records = len(randomized_data)
training_set_size = int(training_set_size_portion * total_records)
test_set_size = total_records - training_set_size

print("")
print("- Size of traning set: ", training_set_size)
print("")
print("- Size of test set: ", test_set_size)

# The first training_set_size rows form the training set. Features and labels
# are copied out separately so later changes cannot touch randomized_data.
training_rows = randomized_data.head(training_set_size)
training_features = training_rows[features].copy()
training_labels = training_rows[labels].copy()

# Show a sample of what the network will be trained on.
print("")
print("- - Features - -")
print(training_features.head())
print("")
print("- - Labels - -")
print(training_labels.head(50))

# The last test_set_size rows form the test set.
testing_rows = randomized_data.tail(test_set_size)
testing_features = testing_rows[features].copy()
testing_labels = testing_rows[labels].copy()
# The neural-network classifier expects its inputs to be described as
# tf.feature_column objects; every feature here is a plain numeric column.
feature_columns = [tf.feature_column.numeric_column(key) for key in features]

classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=hidden_units_spec,
    n_classes=n_classes_spec,
    model_dir=tmp_dir_spec,
)

# Training input function. The labels are passed as a single Series (the one
# label column) rather than as a one-column DataFrame.
# NOTE(review): a Series target is the form documented for pandas_input_fn;
# how DataFrame targets are handled appears to vary between TensorFlow
# releases -- confirm against the installed version.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=training_features,
    y=training_labels[labels[0]],
    num_epochs=epochs_spec,
    shuffle=True,
)

# Train the model using the classifier.
classifier.train(input_fn=train_input_fn, steps=steps_spec)

# Test input function. One pass over the test set is enough: accuracy is an
# average over the examples, so replaying the same rows several times only
# costs time without changing the score.
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=testing_features,
    y=testing_labels[labels[0]],
    num_epochs=1,
    shuffle=False,
)

# Evaluate accuracy on the held-out rows.
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
print("Accuracy = {}".format(accuracy_score))
# Build the prediction set: one row of input features to classify, keyed by
# the same column names the model was trained on.
sample_planet = {
    'st_mass': [2.78],
    'pl_masse': [468481420],
    'pl_rade': [16141],
}
prediction_set = pd.DataFrame(sample_planet)

# Feed the row through the model exactly once, in its given order.
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=prediction_set,
    num_epochs=1,
    shuffle=False,
)

# Materialise the predictions and keep only the predicted class of each row.
predictions = list(classifier.predict(input_fn=predict_input_fn))
predicted_classes = [prediction["classes"] for prediction in predictions]

# Flatten the per-row class arrays into one array and show it.
results = np.concatenate(predicted_classes)
print(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment