Skip to content

Instantly share code, notes, and snippets.

@logasja
Created October 24, 2018 16:07
Show Gist options
  • Save logasja/0134b362d0372dacd1bb0d08b00aaa72 to your computer and use it in GitHub Desktop.
Save logasja/0134b362d0372dacd1bb0d08b00aaa72 to your computer and use it in GitHub Desktop.
Useful data utils that I use across several of my projects.
__author__ = "j.logas"
from numpy.random import seed
from seaborn import load_dataset
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
def moons_data():
    """Build a reproducible two-moons toy dataset as a DataFrame.

    NumPy's global random state is seeded with 0 before sampling, so
    repeated calls yield the same 200 points (noise level 0.20).

    Returns:
        A DataFrame with the two feature coordinates in columns ``x`` and
        ``y`` and the class assignment in column ``label``.
    """
    # Seeding the global RNG is a deliberate, caller-visible side effect.
    seed(0)
    points, labels = make_moons(200, noise=0.20)
    columns = {
        "x": points[:, 0],
        "y": points[:, 1],
        "label": labels,
    }
    return pd.DataFrame(columns)
def iris_data():
    """Return seaborn's 'iris' example dataset as a pandas DataFrame."""
    iris = load_dataset('iris')
    return iris
def planets_data():
    """Return seaborn's 'planets' example dataset."""
    planets = load_dataset('planets')
    return planets
def tips_data():
    """Return seaborn's 'tips' example dataset."""
    tips = load_dataset('tips')
    return tips
def titanic_data():
    """Return seaborn's 'titanic' example dataset."""
    titanic = load_dataset('titanic')
    return titanic
def flights_data():
    """Return seaborn's 'flights' example dataset."""
    flights = load_dataset('flights')
    return flights
def split_training_test(df, test_size=0.2):
    """Split a DataFrame into training and testing subsets.

    Args:
        df: The DataFrame to split.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2,
            i.e. a 20% test split.

    Returns:
        Whatever ``train_test_split`` returns for a single input: the
        training part followed by the testing part.
    """
    partitions = train_test_split(df, test_size=test_size)
    return partitions
def enumerate_strings(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    The DataFrame is modified in place: each column whose dtype is not
    numeric is overwritten with the output of a ``LabelEncoder`` fitted
    on that column.

    Args:
        df: The DataFrame whose non-numeric columns should be encoded.

    Returns:
        A dict mapping each encoded column name to its fitted
        ``LabelEncoder``, which can be used to reverse the transform.
    """
    encoders = {}
    for name in df.columns:
        # Numeric columns are already usable as-is; leave them untouched.
        if pd.api.types.is_numeric_dtype(df[name]):
            continue
        encoder = preprocessing.LabelEncoder().fit(df[name])
        df[name] = encoder.transform(df[name])
        encoders[name] = encoder
    return encoders
def normalize_data(df, target):
    """Standardize every column of ``df`` except ``target``.

    A ``StandardScaler`` is fitted on all columns whose name differs from
    ``target`` and the scaled values are written back into those columns.
    Note that ``df`` itself is modified and returned; no copy is made.

    Args:
        df: The DataFrame holding the features and the target column.
        target: Name of the column to exclude from scaling.

    Returns:
        The same ``df`` object, with the non-target columns scaled.
    """
    # Boolean mask selecting every column other than the target.
    feature_mask = df.columns != target
    features = df.loc[:, feature_mask]
    scaler = preprocessing.StandardScaler().fit(features)
    df.loc[:, feature_mask] = scaler.transform(features)
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment