Andreas Mueller amueller

## sklearn_tutorial_draft.rst

      
              1 file
            
          
              0 forks
            
          
              2 comments
            
          
              0 stars
            
          
                amueller
                / sklearn_tutorial_draft.rst
            
            
              Last active
              August 29, 2015 14:16
            
              
                scipy scikit-learn tutorial draft
              
          
    Tutorial Topic

This tutorial aims to provide an introduction to machine learning and scikit-learn "from the ground up". We will start with the basic concepts of machine learning and show how to implement them using scikit-learn. Going through the characteristics of several methods in detail, we will discuss how to pick an algorithm for your application, how to set its parameters, and how to evaluate performance.
Please provide a more detailed abstract of your tutorial (again, see last year's tutorials).

Machine learning is the task of extracting knowledge from data, often with the goal of generalizing to new, unseen data. Applications of machine learning now touch nearly every aspect of everyday life, from the face detection in our

  
## elkan_bench.py
from sklearn.cluster import KMeans
from time import time
from sklearn.datasets import load_digits, fetch_mldata, load_iris, fetch_20newsgroups_vectorized

def bench_kmeans(data, n_clusters=5, init='random', n_init=1):
    """Fit KMeans with the 'lloyd' and then the 'elkan' algorithm, timing the runs.

    Both estimators are built with the same ``n_clusters``, ``init`` and
    ``n_init`` and with ``random_state=0``. The elapsed wall-clock time and
    the ``inertia_`` of the 'lloyd' fit are printed.

    NOTE(review): the visible body fits on ``X``, a name that is not defined
    inside this function, and never reads ``data`` -- presumably ``X`` was
    meant to be ``data`` (or derived from it); confirm against the full file.
    NOTE(review): this listing appears to be cut off after the 'elkan' fit;
    ``km2`` and the second ``start`` are not used in the visible lines.
    """
    # Time the 'lloyd' fit.
    start = time()
    km1 = KMeans(algorithm='lloyd', n_clusters=n_clusters, random_state=0, init=init, n_init=n_init).fit(X)
    print("lloyd time: %f inertia: %f" % (time() - start, km1.inertia_))
    # Restart the clock for the 'elkan' fit.
    start = time()
    km2 = KMeans(algorithm='elkan', n_clusters=n_clusters, random_state=0, init=init, n_init=n_init).fit(X)

## magic_constructor_estimator.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                amueller
                / magic_constructor_estimator.ipynb
            
            
              Created
              April 14, 2015 00:33
            
              
                No more double underscores in sklearn.
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## knn_imputation_speed.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                amueller
                / knn_imputation_speed.ipynb
            
            
              Created
              August 25, 2015 15:52
            
              
                np.multiply test for knn imputation
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## colormap_extraction.py
from colorspacious import cspace_convert
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import euclidean_distances
import scipy.sparse as sp

from colorspacious import cspace_convert
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import euclidean_distances
import scipy.sparse as sp

## abomination.py
from sklearn.base import BaseEstimator

def piper(self, other):
    """Chain ``self`` and ``other`` into a pipeline built by ``make_pipeline``.

    If ``self`` is already a ``Pipeline``, its estimators (step names are
    dropped) are reused and ``other`` is appended as the final step.
    Otherwise a new two-step pipeline of ``self`` followed by ``other`` is
    returned.
    """
    # Imported lazily, inside the function, exactly as the original did.
    from sklearn.pipeline import make_pipeline, Pipeline

    # Guard clause: a plain estimator simply becomes the first of two steps.
    if not isinstance(self, Pipeline):
        return make_pipeline(self, other)

    # Keep only the estimator of every (name, estimator) step, then extend.
    chained = [estimator for _, estimator in self.steps]
    chained.append(other)
    return make_pipeline(*chained)

## curving.py
import numpy as np
import matplotlib.pyplot as plt

class Curve(object):
    """Holds a collection of scores together with a target letter grade.

    NOTE(review): only the beginning of ``__init__`` is visible in this
    listing; the rest of the class is cut off.
    """
    def __init__(self, scores, to="B+", std_adjust=0):
        """Store the scores and look up the target grade.

        Parameters
        ----------
        scores : stored unchanged on ``self.scores``.
        to : letter grade; must be one of ``self.letters``, otherwise
            ``list.index`` raises ``ValueError``.
        std_adjust : not used in the visible lines -- presumably consumed
            by the part of the method that is cut off.
        """
        self.to = to
        self.scores = scores
        # Letter grades, ordered from best ("A+") to worst ("F").
        self.letters = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]
        # Position of the target grade within ``self.letters``.
        idx = self.letters.index(to)
        # +3 is because we do D and F manually

## constant_values.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                amueller
                / constant_values.ipynb
            
            
              Created
              June 7, 2017 15:52
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## printer.py
class Formatter(object):
    """Formatter object holding layout settings and a ``types`` registry.

    NOTE(review): ``set_formater`` and ``format_object`` are used below but
    are not visible in this listing; the class is cut off after ``__init__``.
    """
    def __init__(self, indent_est='step'):
        """Initialise layout defaults and register the formatter for ``object``.

        Parameters
        ----------
        indent_est : stored unchanged on ``self.indent_est``; its meaning is
            not shown in the visible code.
        """
        self.indent_est = indent_est
        # Starts empty; presumably filled by ``set_formater`` -- TODO confirm.
        self.types = {}
        self.htchar = ' '   # presumably the character used for indentation
        self.lfchar = '\n'  # presumably the line separator
        self.indent = 0
        self.step = 4       # presumably the indentation increment
        self.width = 79     # presumably the maximum line width
        # Register the class's ``format_object`` function for ``object``.
        self.set_formater(object, self.__class__.format_object)

## bench_feat_agg.py
"""
Benchmarks np.bincount method vs np.mean for feature agglomeration in
../sklearn/cluster/_feature_agglomeration. Use of np.bincount provides
a significant speed up if the pooling function is np.mean.

np.bincount performs better especially as the size of X and n_clusters
increase.
"""
import matplotlib.pyplot as plt
import numpy as np
	from sklearn.cluster import KMeans
	from time import time
	from sklearn.datasets import load_digits, fetch_mldata, load_iris, fetch_20newsgroups_vectorized

	def bench_kmeans(data, n_clusters=5, init='random', n_init=1):
		"""Fit KMeans with the 'lloyd' and then the 'elkan' algorithm, timing the runs.

		Both estimators share ``n_clusters``, ``init``, ``n_init`` and
		``random_state=0``. The elapsed time and ``inertia_`` of the 'lloyd'
		fit are printed.

		NOTE(review): the body fits on ``X``, which is not defined inside this
		function, and never reads ``data`` -- presumably ``X`` was meant to be
		``data``; confirm against the full file.
		NOTE(review): the listing appears cut off after the 'elkan' fit.
		"""
		# The statements below were flattened to the same indent as the
		# ``def``; they are nested here so the function has a body again.
		start = time()
		km1 = KMeans(algorithm='lloyd', n_clusters=n_clusters, random_state=0, init=init, n_init=n_init).fit(X)
		print("lloyd time: %f inertia: %f" % (time() - start, km1.inertia_))
		# Restart the clock for the 'elkan' fit.
		start = time()
		km2 = KMeans(algorithm='elkan', n_clusters=n_clusters, random_state=0, init=init, n_init=n_init).fit(X)
	from colorspacious import cspace_convert
	from scipy.sparse.csgraph import minimum_spanning_tree
	from sklearn.metrics import euclidean_distances
	import scipy.sparse as sp

	from colorspacious import cspace_convert
	from scipy.sparse.csgraph import minimum_spanning_tree
	from sklearn.metrics import euclidean_distances
	import scipy.sparse as sp
	from sklearn.base import BaseEstimator

	def piper(self, other):
		"""Chain ``self`` and ``other`` into a pipeline built by ``make_pipeline``.

		If ``self`` is already a ``Pipeline``, its estimators (step names are
		dropped) are reused and ``other`` is appended; otherwise a new
		two-step pipeline of ``self`` followed by ``other`` is returned.
		"""
		# The original lines had lost their nesting (function body and both
		# branches sat at the same indent as the ``def``); restored here.
		from sklearn.pipeline import make_pipeline, Pipeline
		if isinstance(self, Pipeline):
			steps = ([estimator for (name, estimator) in self.steps] + [other])
			return make_pipeline(*steps)
		else:
			return make_pipeline(self, other)
	import numpy as np
	import matplotlib.pyplot as plt

	class Curve(object):
		"""Holds a collection of scores together with a target letter grade.

		NOTE(review): only the beginning of ``__init__`` is visible in this
		listing; the rest of the class is cut off.
		"""
		def __init__(self, scores, to="B+", std_adjust=0):
			"""Store the scores and look up the target grade.

			``to`` must be one of ``self.letters``; otherwise ``list.index``
			raises ``ValueError``. ``std_adjust`` is not used in the visible
			lines.
			"""
			# Class and method bodies had been flattened to a single indent
			# level; the nesting is restored here.
			self.to = to
			self.scores = scores
			# Letter grades, ordered from best ("A+") to worst ("F").
			self.letters = ["A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D", "F"]
			idx = self.letters.index(to)
			# +3 is because we do D and F manually
	class Formatter(object):
		"""Formatter object holding layout settings and a ``types`` registry.

		NOTE(review): ``set_formater`` and ``format_object`` are used below
		but are not visible in this listing; the class is cut off after
		``__init__``.
		"""
		def __init__(self, indent_est='step'):
			"""Initialise layout defaults and register the formatter for ``object``."""
			# Class and method bodies had been flattened to a single indent
			# level; the nesting is restored here.
			self.indent_est = indent_est
			# Starts empty; presumably filled by ``set_formater`` -- TODO confirm.
			self.types = {}
			self.htchar = ' '
			self.lfchar = '\n'
			self.indent = 0
			self.step = 4
			self.width = 79
			# Register the class's ``format_object`` function for ``object``.
			self.set_formater(object, self.__class__.format_object)
	"""
	Benchmarks np.bincount method vs np.mean for feature agglomeration in
	../sklearn/cluster/_feature_agglomeration. Use of np.bincount provides
	a significant speed up if the pooling function is np.mean.

	np.bincount performs better especially as the size of X and n_clusters
	increase.
	"""
	import matplotlib.pyplot as plt
	import numpy as np