Otávio Guerra otaviomguerra

## fill_missing_data_bygroup.py
#EXAMPLE USING TITANIC DATASET

# Create a groupby object: by_sex_class
# Rows of `titanic` are grouped by each distinct ('sex', 'pclass') combination,
# so later per-group operations act within one sex/class group at a time.
# NOTE(review): `titanic` is not defined in this snippet; presumably a pandas
# DataFrame loaded elsewhere -- confirm.
by_sex_class = titanic.groupby(['sex', 'pclass'])

# Write a function that imputes median
def impute_median(series):
    """Return `series` with its missing values replaced by the series median.

    The input is left untouched; `fillna` hands back a new object.
    """
    median_value = series.median()
    filled = series.fillna(median_value)
    return filled

# Impute age and assign to titanic['age']

## ECDF.py
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements.

    Parameters
    ----------
    data : array-like
        One-dimensional sequence of measurements.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions ``1/n, 2/n, ..., 1``, one per entry of `x`.
    """
    # Number of data points: n
    n = len(data)

    # x-data for the ECDF: x
    x = np.sort(data)

    # y-data for the ECDF: y
    y = np.arange(1, n + 1) / n

    # Return both axes so callers can plot or inspect the ECDF
    return x, y

## bootstrap_replicate.py
def bootstrap_replicate_1d(data, func):
    """Generate one bootstrap replicate of one-dimensional data.

    `data` is resampled (numpy's default is with replacement) to a sample of
    the same length, and `func` applied to that resample is returned.
    """
    n_points = len(data)
    bs_sample = np.random.choice(data, size=n_points)
    return func(bs_sample)

## draw_bootstrap_replicates.py
def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates.

    Parameters
    ----------
    data : array-like
        One-dimensional data to resample.
    func : callable
        Statistic applied to each resample; it must return a scalar because
        each result is stored in one slot of a float array.
    size : int, optional
        Number of replicates to draw (default 1).

    Returns
    -------
    numpy.ndarray
        Array of length `size` holding one replicate per entry.
    """
    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    # Hand the filled array back to the caller
    return bs_replicates

## simple_regression_with_RMSE.py
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Create training and test sets
# test_size=0.3 holds out 30% of the samples for testing; the fixed
# random_state makes the shuffled split reproducible between runs.
# NOTE(review): `X` and `y` are not defined in this snippet -- presumably the
# feature matrix and target loaded elsewhere; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

# Create the regressor: reg_all
# NOTE(review): the snippet stops here; fitting on the training split and
# scoring with mean_squared_error are not shown.
reg_all = LinearRegression()

## decision_tree_with_RandomizedSearch.py
# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
# Keys are DecisionTreeClassifier hyper-parameter names. List values are
# candidate settings; scipy's randint(low, high) draws integers from the
# half-open range [low, high), i.e. 1 through 8 here.
# NOTE(review): presumably passed to RandomizedSearchCV, which is imported
# above but not called in this snippet.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

## pipeline_example.py
# Import the imputation transformer and the classifier.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. The
# legacy class is used only when the new module is unavailable.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    SimpleImputer = None
from sklearn.svm import SVC

# Setup the Imputation transformer: imp
if SimpleImputer is not None:
    # missing_values defaults to NaN, and SimpleImputer always imputes
    # column-wise, which matches the legacy axis=0 behaviour.
    imp = SimpleImputer(strategy='most_frequent')
else:
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# Instantiate the SVC classifier: clf
clf = SVC()

## multi_label_train_test_split.py
from warnings import warn

import numpy as np
import pandas as pd

def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Takes a matrix of binary labels `y` and returns
        the indices for a sample of size `size` if
        `size` > 1 or `size` * len(y) if size =< 1.
        The sample is guaranteed to have > `min_count` of

## BFS_networkx.py
import networkx as nx
def path_exists(G, node1, node2):
    """
    Check whether a path exists between two nodes (node1, node2) in graph G.

    The graph is explored breadth-first starting from node1.

    Parameters
    ----------
    G : graph object
        Must provide ``G.neighbors(node)`` returning an iterable of the nodes
        adjacent to ``node`` (as networkx graphs do).
    node1 : hashable
        Node where the search starts.
    node2 : hashable
        Node being looked for.

    Returns
    -------
    bool
        True if node2 can be reached from node1 (a node always reaches
        itself), False otherwise.
    """
    from collections import deque

    if node1 == node2:
        return True

    # Nodes already queued; prevents revisiting nodes and looping on cycles
    visited_nodes = {node1}
    queue = deque([node1])

    while queue:
        node = queue.popleft()
        # Iterate the neighbours once: G.neighbors may return a one-shot iterator
        for neighbor in G.neighbors(node):
            if neighbor == node2:
                return True
            if neighbor not in visited_nodes:
                visited_nodes.add(neighbor)
                queue.append(neighbor)

    # Everything reachable from node1 was explored without meeting node2
    return False

## select_k_rows.py
import pandas as pd
import random

filename = "data.csv"
# Count the data rows (total lines minus the header line). The context manager
# closes the file handle as soon as counting is done.
with open(filename) as csv_file:
    n = sum(1 for _ in csv_file) - 1  # number of data rows in the file
s = 10000  # desired sample size (number of dataset rows to keep)
# Row numbers to drop are drawn from 1..n, so the 0-indexed header is never
# skipped. max(..., 0) keeps every row instead of raising ValueError when the
# file holds fewer than `s` data rows.
skip = sorted(random.sample(range(1, n + 1), max(n - s, 0)))
df = pd.read_csv(filename, skiprows=skip)
	#EXAMPLE USING TITANIC DATASET

	# Create a groupby object: by_sex_class
	# Rows of `titanic` are grouped by each distinct ('sex', 'pclass') combination.
	# NOTE(review): `titanic` is not defined in this snippet; presumably a pandas
	# DataFrame loaded elsewhere -- confirm.
	by_sex_class = titanic.groupby(['sex', 'pclass'])

	# Write a function that imputes median
	def impute_median(series):
		"""Return `series` with its missing values replaced by the series median."""
		return series.fillna(series.median())

	# Impute age and assign to titanic['age']
	def ecdf(data):
		"""Compute ECDF for a one-dimensional array of measurements.

		Returns a pair ``(x, y)``: `x` is the data sorted in ascending order and
		`y` holds the cumulative fractions ``1/n, 2/n, ..., 1``.
		"""
		# Number of data points: n
		n = len(data)

		# x-data for the ECDF: x
		x = np.sort(data)

		# y-data for the ECDF: y
		y = np.arange(1, n + 1) / n

		# Return both axes so callers can plot or inspect the ECDF
		return x, y
	def bootstrap_replicate_1d(data, func):
		"""Generate one bootstrap replicate of one-dimensional data.

		`data` is resampled (numpy's default is with replacement) to a sample
		of the same length, and `func` applied to that resample is returned.
		"""
		return func(np.random.choice(data, size=len(data)))
	def draw_bs_reps(data, func, size=1):
		"""Draw bootstrap replicates.

		Calls `bootstrap_replicate_1d(data, func)` `size` times and returns the
		results as a numpy array of length `size`. `func` must return a scalar
		because each result is stored in one slot of a float array.
		"""
		# Initialize array of replicates: bs_replicates
		bs_replicates = np.empty(size)

		# Generate replicates
		for i in range(size):
			bs_replicates[i] = bootstrap_replicate_1d(data, func)

		# Hand the filled array back to the caller
		return bs_replicates
	# Import necessary modules
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error
	from sklearn.model_selection import train_test_split

	# Create training and test sets
	# test_size=0.3 holds out 30% of the samples for testing; the fixed
	# random_state makes the shuffled split reproducible between runs.
	# NOTE(review): `X` and `y` are not defined in this snippet -- confirm where
	# they are loaded.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

	# Create the regressor: reg_all
	# NOTE(review): the snippet stops here; fitting and scoring are not shown.
	reg_all = LinearRegression()
	# Import necessary modules
	from scipy.stats import randint
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.model_selection import RandomizedSearchCV

	# Setup the parameters and distributions to sample from: param_dist
	# Keys are DecisionTreeClassifier hyper-parameter names. List values are
	# candidate settings; scipy's randint(low, high) draws integers from the
	# half-open range [low, high), i.e. 1 through 8 here.
	param_dist = {"max_depth": [3, None],
	"max_features": randint(1, 9),
	"min_samples_leaf": randint(1, 9),
	"criterion": ["gini", "entropy"]}
	# Import the imputation transformer and the classifier.
	# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
	# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. The
	# legacy class is used only when the new module is unavailable.
	try:
		from sklearn.impute import SimpleImputer
	except ImportError:
		from sklearn.preprocessing import Imputer
		SimpleImputer = None
	from sklearn.svm import SVC

	# Setup the Imputation transformer: imp
	if SimpleImputer is not None:
		# missing_values defaults to NaN, and SimpleImputer always imputes
		# column-wise, which matches the legacy axis=0 behaviour.
		imp = SimpleImputer(strategy='most_frequent')
	else:
		imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

	# Instantiate the SVC classifier: clf
	clf = SVC()
	from warnings import warn

	import numpy as np
	import pandas as pd

	def multilabel_sample(y, size=1000, min_count=5, seed=None):
	""" Takes a matrix of binary labels `y` and returns
	the indices for a sample of size `size` if
	`size` > 1 or `size` * len(y) if size =< 1.
	The sample is guaranteed to have > `min_count` of
	import networkx as nx
	def path_exists(G, node1, node2):
		"""
		Check whether a path exists between two nodes (node1, node2) in graph G.

		The graph is explored breadth-first starting from node1. G must provide
		``G.neighbors(node)`` returning an iterable of adjacent nodes.
		Returns True if node2 can be reached from node1 (a node always reaches
		itself), False otherwise.
		"""
		from collections import deque

		if node1 == node2:
			return True

		# Nodes already queued; prevents revisiting nodes and looping on cycles
		visited_nodes = {node1}
		queue = deque([node1])

		while queue:
			node = queue.popleft()
			# Iterate the neighbours once: G.neighbors may return a one-shot iterator
			for neighbor in G.neighbors(node):
				if neighbor == node2:
					return True
				if neighbor not in visited_nodes:
					visited_nodes.add(neighbor)
					queue.append(neighbor)

		# Everything reachable from node1 was explored without meeting node2
		return False
	import pandas as pd
	import random

	filename = "data.csv"
	# Count the data rows (total lines minus the header line). The context
	# manager closes the file handle as soon as counting is done.
	with open(filename) as csv_file:
		n = sum(1 for _ in csv_file) - 1  # number of data rows in the file
	s = 10000  # desired sample size (number of dataset rows to keep)
	# Row numbers to drop are drawn from 1..n, so the 0-indexed header is never
	# skipped. max(..., 0) keeps every row instead of raising ValueError when the
	# file holds fewer than `s` data rows.
	skip = sorted(random.sample(range(1, n + 1), max(n - s, 0)))
	df = pd.read_csv(filename, skiprows=skip)