Skip to content

Instantly share code, notes, and snippets.

View otaviomguerra's full-sized avatar

Otávio Guerra otaviomguerra

  • CE, Brasil
View GitHub Profile
@otaviomguerra
otaviomguerra / fill_missing_data_bygroup.py
Last active July 19, 2018 16:07
Fill in missing data by group with pandas
# EXAMPLE USING TITANIC DATASET
# Create a groupby object: by_sex_class
# NOTE(review): `titanic` is not defined in this snippet — presumably a pandas
# DataFrame with 'sex' and 'pclass' columns; confirm against the full gist.
by_sex_class = titanic.groupby(['sex', 'pclass'])
# Write a function that imputes median
def impute_median(series):
    """Return ``series`` with its missing values replaced by its median.

    The median is computed by ``series.median()`` and passed to
    ``series.fillna``; the result of ``fillna`` is returned, so the input
    object itself is not modified.
    """
    return series.fillna(series.median())
# Impute age and assign to titanic['age']
@otaviomguerra
otaviomguerra / ECDF.py
Created July 28, 2018 03:51
computes ECDF for 1D array
def ecdf(data):
"""Compute ECDF for a one-dimensional array of measurements."""
# Number of data points: n
n = len(data)
# x-data for the ECDF: x
x = np.sort(data)
# y-data for the ECDF: y
y = np.arange(1, n+1) / n
@otaviomguerra
otaviomguerra / bootstrap_replicate.py
Created July 29, 2018 01:50
Resample a given array with replacement and compute a given statistic (mean, median, etc.) on the resample
def bootstrap_replicate_1d(data, func):
    """Compute one bootstrap replicate of a statistic.

    Draws ``len(data)`` values from ``data`` with ``np.random.choice``
    (which samples with replacement by default) and returns ``func``
    applied to that resample.
    """
    resample = np.random.choice(data, size=len(data))
    return func(resample)
@otaviomguerra
otaviomguerra / draw_bootstrap_replicates.py
Created July 29, 2018 01:58
Draw multiple bootstrap replicates using the function from bootstrap_replicate.py
def draw_bs_reps(data, func, size=1):
"""Draw bootstrap replicates."""
# Initialize array of replicates: bs_replicates
bs_replicates = np.empty(size)
# Generate replicates
for i in range(size):
bs_replicates[i] = bootstrap_replicate_1d(data, func)
@otaviomguerra
otaviomguerra / simple_regression_with_RMSE.py
Created July 31, 2018 01:27
Simple regression using RMSE and R² to evaluate
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Create training and test sets
# NOTE(review): X and y are not defined in this snippet — presumably the
# feature matrix and target vector; confirm against the full gist.
# test_size=0.3 holds out 30% of the samples for testing, and the fixed
# random_state makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
# Create the regressor: reg_all
reg_all = LinearRegression()
@otaviomguerra
otaviomguerra / decision_tree_with_RandomizedSearch.py
Created July 31, 2018 03:41
Simple decision tree classifier with hyperparameter tuning using RandomizedSearchCV
# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
# Setup the parameters and distributions to sample from: param_dist
# List values are candidate settings to choose from; randint(1, 9) is a
# discrete uniform distribution over the integers 1..8 (upper bound exclusive).
# NOTE(review): max_features up to 8 presumably matches the number of features
# in the dataset this was written for — confirm before reusing.
param_dist = {"max_depth": [3, None],
"max_features": randint(1, 9),
"min_samples_leaf": randint(1, 9),
"criterion": ["gini", "entropy"]}
@otaviomguerra
otaviomguerra / pipeline_example.py
Created July 31, 2018 05:32
A machine learning pipeline example using an imputer and SVC
# Import the imputation transformer and the classifier
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
# Setup the Imputation transformer: imp
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer is its replacement. It always imputes column-wise
# (the old axis=0) and takes np.nan, not the string 'NaN', as the marker for
# missing values.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# Instantiate the SVC classifier: clf
clf = SVC()
@otaviomguerra
otaviomguerra / multi_label_train_test_split.py
Created July 31, 2018 21:42
Multi Label train_test_split
from warnings import warn
import numpy as np
import pandas as pd
def multilabel_sample(y, size=1000, min_count=5, seed=None):
""" Takes a matrix of binary labels `y` and returns
the indices for a sample of size `size` if
`size` > 1 or `size` * len(y) if size =< 1.
The sample is guaranteed to have > `min_count` of
@otaviomguerra
otaviomguerra / BFS_networkx.py
Created August 4, 2018 02:31
BFS algorithm implemented with networkx library
import networkx as nx
def path_exists(G, node1, node2):
"""
This function checks whether a path exists between two nodes (node1, node2) in graph G.
"""
visited_nodes = set()
queue = [node1]
for node in queue:
neighbors = G.neighbors(node)
@otaviomguerra
otaviomguerra / select_k_rows.py
Created December 6, 2018 15:53
Selecionar algumas linhas de dataset grande
import pandas as pd
import random

filename = "data.csv"

# Count the data rows (total lines minus the header line). The context
# manager closes the file handle as soon as the count is done.
with open(filename) as csv_file:
    n = sum(1 for _ in csv_file) - 1

s = 10000  # desired number of sampled rows

# Randomly choose which line numbers to skip so that s data rows remain.
# Line 0 (the header) is outside the range and is therefore always kept.
# When the file has at most s data rows nothing is skipped, instead of
# random.sample raising ValueError on a negative sample size.
skip = sorted(random.sample(range(1, n + 1), max(n - s, 0)))
df = pd.read_csv(filename, skiprows=skip)