Skip to content

Instantly share code, notes, and snippets.

@alexandremcosta
Created August 30, 2017 18:57
Show Gist options
  • Save alexandremcosta/3914dc6fa5229883fd78c6226a5c58f8 to your computer and use it in GitHub Desktop.
Save alexandremcosta/3914dc6fa5229883fd78c6226a5c58f8 to your computer and use it in GitHub Desktop.
import pandas as pd
import scipy.stats as st
import pylab as pl
import math
# Fetch the Statlog (Shuttle) test split: space-separated values, no header row,
# so pandas labels the columns 0..N-1.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst"
df = pd.read_csv(url, sep=' ', header=None)

# Class handling follows http://odds.cs.stonybrook.edu/shuttle-dataset/ :
# the five smallest classes (2, 3, 5, 6, 7) together form the outlier class,
# class 1 is the inlier class, and class 4 is dropped entirely.
# The filter below treats column 9 as the class label.
is_class_4 = df[9] == 4
df = df.loc[~is_class_4]

# Prepare the data whose per-column distribution and kurtosis are plotted:
# centre each column on its mean and scale it by its range (max - min).
centered = df - df.mean()
value_range = df.max() - df.min()
df_norm = centered / value_range
def plot_normal(data, mu, var):
    """Plot a normal PDF over a density histogram of ``data`` and print its kurtosis.

    Draws two things on the current pylab figure, prints the kurtosis of the
    sample, and then shows the figure:

    * the PDF of a normal distribution with mean ``mu`` and variance ``var``,
      evaluated at every point in ``data`` and drawn as a marked line;
    * a histogram of ``data`` normalised to a probability density so it is
      on the same scale as the PDF.

    Args:
        data: Sequence of numeric samples. The PDF line connects the points in
            the order given, so pass the samples sorted to get a clean curve.
        mu: Mean of the reference normal distribution.
        var: Variance of the reference normal distribution; must be
            non-negative because its square root is used as the scale.

    Raises:
        ValueError: If ``var`` is negative (raised by ``math.sqrt``).
    """
    sigma = math.sqrt(var)
    pl.plot(data, st.norm.pdf(data, mu, sigma), "-o")
    # `normed=True` was deprecated in Matplotlib 2.1 and removed in 3.1;
    # `density=True` is the replacement and scales the histogram to unit area.
    pl.hist(data, density=True)
    # scipy.stats.kurtosis defaults to Fisher's (excess) definition.
    print("Kurtosis: " + str(st.kurtosis(data)))
    pl.show()
# Walk the normalised columns, skipping the first (index 0) and the last one,
# and plot each against a normal curve fitted to its own mean and variance.
n_columns = len(df_norm.iloc[0])
for col_idx in range(1, n_columns - 1):
    print("Column: " + str(col_idx))
    column = df_norm.iloc[:, col_idx].values
    # Sorted so the PDF line in plot_normal is drawn left to right.
    ordered = sorted(column)
    col_mean = column.mean()
    col_var = column.var()
    plot_normal(ordered, col_mean, col_var)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment