Last active
June 3, 2018 13:02
-
-
Save caiogranero/6863e9621fa99ddb8be3d74e07da5935 to your computer and use it in GitHub Desktop.
neural-network
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
import pandas as pd | |
import numpy as np | |
import shutil | |
import os | |
# ---------------------------------------------------------------------------
# Script configuration. Everything below is a plain module-level setting that
# the rest of the script reads by name.
# ---------------------------------------------------------------------------

# Fraction of the cleaned rows used for training; the remainder is the test
# set. 0.8 gives an 80/20 split.
training_set_size_portion = .8

# Set to True to shuffle the rows before splitting into training and test sets.
do_shuffle = True

# Placeholder for the evaluation accuracy; overwritten after evaluation.
accuracy_score = 0

# Hidden-layer layout of the DNN: three fully connected hidden layers with
# 10, 20 and 10 neurons respectively.
hidden_units_spec = [10, 20, 10]

# Number of label classes. The label column is reduced to a binary 0/1 flag
# before training, so there are exactly two classes (was 3, which left the
# classifier with an output class that can never occur in the data).
n_classes_spec = 2

# Directory where the estimator stores the model and its checkpoints.
tmp_dir_spec = "tmp/model"

# Upper bound on the number of training steps.
steps_spec = 2000

# Number of passes over the data made by the input functions.
epochs_spec = 10

# Input CSV file name -- change this if you use a different data file.
file_name = "NasaExoplanetDataMay2018.csv"

# Feature columns; each entry must be the name of a column in the CSV.
features = [
    'st_mass',   # stellar mass
    'pl_masse',  # planet mass
    'pl_rade',   # planet radius
    # 'area'
]

# Label column(s) to predict -- also column names in the CSV.
labels = ['pl_eqt']
# Delete the ./tmp tree left over from an earlier run, if there is one
# (isdir() is False for a missing path, so no separate existence check is
# needed).
if os.path.isdir("./tmp"):
    shutil.rmtree("./tmp")

# Read the semicolon-delimited CSV from the current working directory.
my_data = pd.read_csv("./{}".format(file_name), delimiter=";")
# Optionally shuffle the rows so that any ordering in the file does not leak
# into the train/test split. The shuffle is done by reindexing the frame with
# a random permutation of its own index
# (see pandas.DataFrame.reindex); when do_shuffle is False the data is used
# in file order.
if not do_shuffle:
    randomized_data = my_data
else:
    shuffled_index = np.random.permutation(my_data.index)
    randomized_data = my_data.reindex(shuffled_index)
# Keep only the columns the model uses and drop every row that has a missing
# value in any of them.
randomized_data = randomized_data[features + labels].dropna()

# The feature columns are fed to numeric feature columns, so force them to
# float.
for column in features:
    randomized_data[column] = randomized_data[column].astype(float)

# Reduce each label column to a binary class: 1 when the value lies strictly
# between 273 and 373 (presumably Kelvin -- TODO confirm against the data
# source), 0 otherwise.
# The whole column is assigned from one boolean mask. The previous chained
# form `df[col][mask] = ...` writes through an intermediate object, which
# triggers SettingWithCopyWarning and is silently ignored when pandas
# copy-on-write is active; its second pass (`!= 1` -> 0) also left a raw
# value of exactly 1 labelled as class 1.
for column in labels:
    values = randomized_data[column].astype(float)
    in_range = (values > 273) & (values < 373)
    randomized_data[column] = in_range.astype(int)
# Work out how many rows go to each set: the training set takes the configured
# portion (rounded down), the test set takes whatever is left.
total_records = len(randomized_data)
training_set_size = int(total_records * training_set_size_portion)
test_set_size = total_records - training_set_size

print("")
print("- Size of traning set: ", training_set_size)
print("")
print("- Size of test set: ", test_set_size)

# Training data: the first `training_set_size` rows, separated into the
# feature columns and the label column(s).
_training_rows = randomized_data.head(training_set_size)
training_features = _training_rows[features].copy()
training_labels = _training_rows[labels].copy()

# Show a sample of what the model will be trained on.
print("")
print("- - Features - -")
print(training_features.head())
print("")
print("- - Labels - -")
print(training_labels.head(50))

# Testing data: the last `test_set_size` rows, separated the same way.
_testing_rows = randomized_data.tail(test_set_size)
testing_features = _testing_rows[features].copy()
testing_labels = _testing_rows[labels].copy()
# The DNN classifier expects its inputs to be declared as tf.feature_column
# objects; every feature here is a plain numeric column.
feature_columns = [tf.feature_column.numeric_column(key) for key in features]

# Build the classifier from the configured layer layout, class count and
# checkpoint directory.
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=hidden_units_spec,
    n_classes=n_classes_spec,
    model_dir=tmp_dir_spec)

# Training input: shuffled batches, cycling over the training rows
# `epochs_spec` times.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=training_features,
    y=training_labels,
    num_epochs=epochs_spec,
    shuffle=True)

# Train the model; `steps_spec` caps the number of training steps.
classifier.train(input_fn=train_input_fn, steps=steps_spec)

# Evaluation input: a single, unshuffled pass over the test rows. Repeating
# the same rows for several epochs (as before, with num_epochs=epochs_spec)
# only multiplies the evaluation work without changing the accuracy.
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=testing_features,
    y=testing_labels,
    num_epochs=1,
    shuffle=False)

# Evaluate accuracy on the held-out rows.
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
print("Accuracy = {}".format(accuracy_score))
# Prediction set: one row of feature values to classify. The keys must match
# the feature column names the classifier was built with.
prediction_set = pd.DataFrame({
    'st_mass': [2.78],
    'pl_masse': [468481420],
    'pl_rade': [16141],
})

# Feed the row through exactly once, in order.
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=prediction_set,
    num_epochs=1,
    shuffle=False)

# Materialise the predictions, pull the "classes" entry out of each one and
# flatten them into a single array for printing.
predictions = list(classifier.predict(input_fn=predict_input_fn))
predicted_classes = [entry["classes"] for entry in predictions]
results = np.concatenate(predicted_classes)
print(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment