KNN algorithm implemented with scikit-learn
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### k Nearest Neighbors in scikit learn"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# getting our data playground ready\n",
"from sklearn import datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# getting KNN related artillery ready\n",
"from sklearn import neighbors"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"target_names\n",
"['setosa' 'versicolor' 'virginica']\n",
"\n",
"data\n",
"[[ 5.1 3.5 1.4 0.2]\n",
" [ 4.9 3. 1.4 0.2]\n",
" [ 4.7 3.2 1.3 0.2]\n",
" [ 4.6 3.1 1.5 0.2]\n",
" [ 5. 3.6 1.4 0.2]\n",
" [ 5.4 3.9 1.7 0.4]\n",
" [ 4.6 3.4 1.4 0.3]\n",
" [ 5. 3.4 1.5 0.2]\n",
" [ 4.4 2.9 1.4 0.2]\n",
" [ 4.9 3.1 1.5 0.1]\n",
" [ 5.4 3.7 1.5 0.2]\n",
" [ 4.8 3.4 1.6 0.2]\n",
" [ 4.8 3. 1.4 0.1]\n",
" [ 4.3 3. 1.1 0.1]\n",
" [ 5.8 4. 1.2 0.2]\n",
" [ 5.7 4.4 1.5 0.4]\n",
" [ 5.4 3.9 1.3 0.4]\n",
" [ 5.1 3.5 1.4 0.3]\n",
" [ 5.7 3.8 1.7 0.3]\n",
" [ 5.1 3.8 1.5 0.3]\n",
" [ 5.4 3.4 1.7 0.2]\n",
" [ 5.1 3.7 1.5 0.4]\n",
" [ 4.6 3.6 1. 0.2]\n",
" [ 5.1 3.3 1.7 0.5]\n",
" [ 4.8 3.4 1.9 0.2]\n",
" [ 5. 3. 1.6 0.2]\n",
" [ 5. 3.4 1.6 0.4]\n",
" [ 5.2 3.5 1.5 0.2]\n",
" [ 5.2 3.4 1.4 0.2]\n",
" [ 4.7 3.2 1.6 0.2]\n",
" [ 4.8 3.1 1.6 0.2]\n",
" [ 5.4 3.4 1.5 0.4]\n",
" [ 5.2 4.1 1.5 0.1]\n",
" [ 5.5 4.2 1.4 0.2]\n",
" [ 4.9 3.1 1.5 0.1]\n",
" [ 5. 3.2 1.2 0.2]\n",
" [ 5.5 3.5 1.3 0.2]\n",
" [ 4.9 3.1 1.5 0.1]\n",
" [ 4.4 3. 1.3 0.2]\n",
" [ 5.1 3.4 1.5 0.2]\n",
" [ 5. 3.5 1.3 0.3]\n",
" [ 4.5 2.3 1.3 0.3]\n",
" [ 4.4 3.2 1.3 0.2]\n",
" [ 5. 3.5 1.6 0.6]\n",
" [ 5.1 3.8 1.9 0.4]\n",
" [ 4.8 3. 1.4 0.3]\n",
" [ 5.1 3.8 1.6 0.2]\n",
" [ 4.6 3.2 1.4 0.2]\n",
" [ 5.3 3.7 1.5 0.2]\n",
" [ 5. 3.3 1.4 0.2]\n",
" [ 7. 3.2 4.7 1.4]\n",
" [ 6.4 3.2 4.5 1.5]\n",
" [ 6.9 3.1 4.9 1.5]\n",
" [ 5.5 2.3 4. 1.3]\n",
" [ 6.5 2.8 4.6 1.5]\n",
" [ 5.7 2.8 4.5 1.3]\n",
" [ 6.3 3.3 4.7 1.6]\n",
" [ 4.9 2.4 3.3 1. ]\n",
" [ 6.6 2.9 4.6 1.3]\n",
" [ 5.2 2.7 3.9 1.4]\n",
" [ 5. 2. 3.5 1. ]\n",
" [ 5.9 3. 4.2 1.5]\n",
" [ 6. 2.2 4. 1. ]\n",
" [ 6.1 2.9 4.7 1.4]\n",
" [ 5.6 2.9 3.6 1.3]\n",
" [ 6.7 3.1 4.4 1.4]\n",
" [ 5.6 3. 4.5 1.5]\n",
" [ 5.8 2.7 4.1 1. ]\n",
" [ 6.2 2.2 4.5 1.5]\n",
" [ 5.6 2.5 3.9 1.1]\n",
" [ 5.9 3.2 4.8 1.8]\n",
" [ 6.1 2.8 4. 1.3]\n",
" [ 6.3 2.5 4.9 1.5]\n",
" [ 6.1 2.8 4.7 1.2]\n",
" [ 6.4 2.9 4.3 1.3]\n",
" [ 6.6 3. 4.4 1.4]\n",
" [ 6.8 2.8 4.8 1.4]\n",
" [ 6.7 3. 5. 1.7]\n",
" [ 6. 2.9 4.5 1.5]\n",
" [ 5.7 2.6 3.5 1. ]\n",
" [ 5.5 2.4 3.8 1.1]\n",
" [ 5.5 2.4 3.7 1. ]\n",
" [ 5.8 2.7 3.9 1.2]\n",
" [ 6. 2.7 5.1 1.6]\n",
" [ 5.4 3. 4.5 1.5]\n",
" [ 6. 3.4 4.5 1.6]\n",
" [ 6.7 3.1 4.7 1.5]\n",
" [ 6.3 2.3 4.4 1.3]\n",
" [ 5.6 3. 4.1 1.3]\n",
" [ 5.5 2.5 4. 1.3]\n",
" [ 5.5 2.6 4.4 1.2]\n",
" [ 6.1 3. 4.6 1.4]\n",
" [ 5.8 2.6 4. 1.2]\n",
" [ 5. 2.3 3.3 1. ]\n",
" [ 5.6 2.7 4.2 1.3]\n",
" [ 5.7 3. 4.2 1.2]\n",
" [ 5.7 2.9 4.2 1.3]\n",
" [ 6.2 2.9 4.3 1.3]\n",
" [ 5.1 2.5 3. 1.1]\n",
" [ 5.7 2.8 4.1 1.3]\n",
" [ 6.3 3.3 6. 2.5]\n",
" [ 5.8 2.7 5.1 1.9]\n",
" [ 7.1 3. 5.9 2.1]\n",
" [ 6.3 2.9 5.6 1.8]\n",
" [ 6.5 3. 5.8 2.2]\n",
" [ 7.6 3. 6.6 2.1]\n",
" [ 4.9 2.5 4.5 1.7]\n",
" [ 7.3 2.9 6.3 1.8]\n",
" [ 6.7 2.5 5.8 1.8]\n",
" [ 7.2 3.6 6.1 2.5]\n",
" [ 6.5 3.2 5.1 2. ]\n",
" [ 6.4 2.7 5.3 1.9]\n",
" [ 6.8 3. 5.5 2.1]\n",
" [ 5.7 2.5 5. 2. ]\n",
" [ 5.8 2.8 5.1 2.4]\n",
" [ 6.4 3.2 5.3 2.3]\n",
" [ 6.5 3. 5.5 1.8]\n",
" [ 7.7 3.8 6.7 2.2]\n",
" [ 7.7 2.6 6.9 2.3]\n",
" [ 6. 2.2 5. 1.5]\n",
" [ 6.9 3.2 5.7 2.3]\n",
" [ 5.6 2.8 4.9 2. ]\n",
" [ 7.7 2.8 6.7 2. ]\n",
" [ 6.3 2.7 4.9 1.8]\n",
" [ 6.7 3.3 5.7 2.1]\n",
" [ 7.2 3.2 6. 1.8]\n",
" [ 6.2 2.8 4.8 1.8]\n",
" [ 6.1 3. 4.9 1.8]\n",
" [ 6.4 2.8 5.6 2.1]\n",
" [ 7.2 3. 5.8 1.6]\n",
" [ 7.4 2.8 6.1 1.9]\n",
" [ 7.9 3.8 6.4 2. ]\n",
" [ 6.4 2.8 5.6 2.2]\n",
" [ 6.3 2.8 5.1 1.5]\n",
" [ 6.1 2.6 5.6 1.4]\n",
" [ 7.7 3. 6.1 2.3]\n",
" [ 6.3 3.4 5.6 2.4]\n",
" [ 6.4 3.1 5.5 1.8]\n",
" [ 6. 3. 4.8 1.8]\n",
" [ 6.9 3.1 5.4 2.1]\n",
" [ 6.7 3.1 5.6 2.4]\n",
" [ 6.9 3.1 5.1 2.3]\n",
" [ 5.8 2.7 5.1 1.9]\n",
" [ 6.8 3.2 5.9 2.3]\n",
" [ 6.7 3.3 5.7 2.5]\n",
" [ 6.7 3. 5.2 2.3]\n",
" [ 6.3 2.5 5. 1.9]\n",
" [ 6.5 3. 5.2 2. ]\n",
" [ 6.2 3.4 5.4 2.3]\n",
" [ 5.9 3. 5.1 1.8]]\n",
"\n",
"target\n",
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
" 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n",
" 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2\n",
" 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2\n",
" 2 2]\n",
"\n",
"DESCR\n",
"Iris Plants Database\n",
"\n",
"Notes\n",
"-----\n",
"Data Set Characteristics:\n",
" :Number of Instances: 150 (50 in each of three classes)\n",
" :Number of Attributes: 4 numeric, predictive attributes and the class\n",
" :Attribute Information:\n",
" - sepal length in cm\n",
" - sepal width in cm\n",
" - petal length in cm\n",
" - petal width in cm\n",
" - class:\n",
" - Iris-Setosa\n",
" - Iris-Versicolour\n",
" - Iris-Virginica\n",
" :Summary Statistics:\n",
" ============== ==== ==== ======= ===== ====================\n",
" Min Max Mean SD Class Correlation\n",
" ============== ==== ==== ======= ===== ====================\n",
" sepal length: 4.3 7.9 5.84 0.83 0.7826\n",
" sepal width: 2.0 4.4 3.05 0.43 -0.4194\n",
" petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n",
" petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n",
" ============== ==== ==== ======= ===== ====================\n",
" :Missing Attribute Values: None\n",
" :Class Distribution: 33.3% for each of 3 classes.\n",
" :Creator: R.A. Fisher\n",
" :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n",
" :Date: July, 1988\n",
"\n",
"This is a copy of UCI ML iris datasets.\n",
"http://archive.ics.uci.edu/ml/datasets/Iris\n",
"\n",
"The famous Iris database, first used by Sir R.A Fisher\n",
"\n",
"This is perhaps the best known database to be found in the\n",
"pattern recognition literature. Fisher's paper is a classic in the field and\n",
"is referenced frequently to this day. (See Duda & Hart, for example.) The\n",
"data set contains 3 classes of 50 instances each, where each class refers to a\n",
"type of iris plant. One class is linearly separable from the other 2; the\n",
"latter are NOT linearly separable from each other.\n",
"\n",
"References\n",
"----------\n",
" - Fisher,R.A. \"The use of multiple measurements in taxonomic problems\"\n",
" Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n",
" Mathematical Statistics\" (John Wiley, NY, 1950).\n",
" - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n",
" (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n",
" - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n",
" Structure and Classification Rule for Recognition in Partially Exposed\n",
" Environments\". IEEE Transactions on Pattern Analysis and Machine\n",
" Intelligence, Vol. PAMI-2, No. 1, 67-71.\n",
" - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n",
" on Information Theory, May 1972, 431-433.\n",
" - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n",
" conceptual clustering system finds 3 classes in the data.\n",
" - Many, many more ...\n",
"\n",
"\n",
"feature_names\n",
"['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n",
"\n"
]
}
],
"source": [
"iris = datasets.load_iris()\n",
"# the dataset is loaded as a dictionary of key-value pairs\n",
"# the relevant attributes:\n",
"\n",
"for i in range(len(iris.keys())):\n",
" print(iris.keys()[i])\n",
" print(iris.values()[i])\n",
" print"
]
},
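{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the loaded arrays: the shapes below should match the DESCR printed above (150 samples, 4 features, 3 classes)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# confirm the shapes promised by DESCR: 150 samples, 4 features, 3 classes\n",
"print(iris.data.shape)    # expected: (150, 4)\n",
"print(iris.target.shape)  # expected: (150,)\n",
"print(set(iris.target))   # expected: {0, 1, 2}"
]
},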
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.cross_validation import train_test_split\n",
"X,y = iris.data, iris.target\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)"
]
},
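{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the split in hand, a minimal classification sketch: fit a `KNeighborsClassifier` on the training set and report its mean accuracy on the held-out test set. `n_neighbors=5` is simply the estimator's default, not a tuned value."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# fit a 5-nearest-neighbors classifier on the training split\n",
"knn = neighbors.KNeighborsClassifier(n_neighbors=5)\n",
"knn.fit(X_train, y_train)\n",
"\n",
"# mean accuracy on the held-out test split\n",
"print(knn.score(X_test, y_test))"
]
},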
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on package sklearn.neighbors in sklearn:\n",
"\n",
"NAME\n",
" sklearn.neighbors\n",
"\n",
"FILE\n",
" /home/anirudh/anaconda/lib/python2.7/site-packages/sklearn/neighbors/__init__.py\n",
"\n",
"DESCRIPTION\n",
" The :mod:`sklearn.neighbors` module implements the k-nearest neighbors\n",
" algorithm.\n",
"\n",
"PACKAGE CONTENTS\n",
" approximate\n",
" ball_tree\n",
" base\n",
" classification\n",
" dist_metrics\n",
" graph\n",
" kd_tree\n",
" kde\n",
" nearest_centroid\n",
" regression\n",
" setup\n",
" typedefs\n",
" unsupervised\n",
"\n",
"CLASSES\n",
" __builtin__.object\n",
" sklearn.neighbors.dist_metrics.DistanceMetric\n",
" sklearn.base.BaseEstimator(__builtin__.object)\n",
" sklearn.neighbors.approximate.LSHForest(sklearn.base.BaseEstimator, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin)\n",
" sklearn.neighbors.kde.KernelDensity\n",
" sklearn.neighbors.nearest_centroid.NearestCentroid(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin)\n",
" sklearn.base.ClassifierMixin(__builtin__.object)\n",
" sklearn.neighbors.classification.KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.classification.RadiusNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.nearest_centroid.NearestCentroid(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin)\n",
" sklearn.base.RegressorMixin(__builtin__.object)\n",
" sklearn.neighbors.regression.KNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.regression.RadiusNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.ball_tree.BinaryTree(__builtin__.object)\n",
" sklearn.neighbors.ball_tree.BallTree\n",
" sklearn.neighbors.base.KNeighborsMixin(__builtin__.object)\n",
" sklearn.neighbors.approximate.LSHForest(sklearn.base.BaseEstimator, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin)\n",
" sklearn.neighbors.classification.KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.regression.KNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.unsupervised.NearestNeighbors(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.UnsupervisedMixin)\n",
" sklearn.neighbors.base.NeighborsBase(abc.NewBase)\n",
" sklearn.neighbors.classification.KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.classification.RadiusNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.regression.KNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.regression.RadiusNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.unsupervised.NearestNeighbors(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.UnsupervisedMixin)\n",
" sklearn.neighbors.base.RadiusNeighborsMixin(__builtin__.object)\n",
" sklearn.neighbors.approximate.LSHForest(sklearn.base.BaseEstimator, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin)\n",
" sklearn.neighbors.classification.RadiusNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.regression.RadiusNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.unsupervised.NearestNeighbors(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.UnsupervisedMixin)\n",
" sklearn.neighbors.base.SupervisedFloatMixin(__builtin__.object)\n",
" sklearn.neighbors.regression.KNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.regression.RadiusNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" sklearn.neighbors.base.SupervisedIntegerMixin(__builtin__.object)\n",
" sklearn.neighbors.classification.KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.classification.RadiusNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" sklearn.neighbors.base.UnsupervisedMixin(__builtin__.object)\n",
" sklearn.neighbors.unsupervised.NearestNeighbors(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.UnsupervisedMixin)\n",
" sklearn.neighbors.kd_tree.BinaryTree(__builtin__.object)\n",
" sklearn.neighbors.kd_tree.KDTree\n",
" \n",
" class BallTree(BinaryTree)\n",
" | BallTree for fast generalized N-point problems\n",
" | \n",
" | BallTree(X, leaf_size=40, metric='minkowski', \\**kwargs)\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features]\n",
" | n_samples is the number of points in the data set, and\n",
" | n_features is the dimension of the parameter space.\n",
" | Note: if X is a C-contiguous array of doubles then data will\n",
" | not be copied. Otherwise, an internal copy will be made.\n",
" | \n",
" | leaf_size : positive integer (default = 20)\n",
" | Number of points at which to switch to brute-force. Changing\n",
" | leaf_size will not affect the results of a query, but can\n",
" | significantly impact the speed of a query and the memory required\n",
" | to store the constructed tree. The amount of memory needed to\n",
" | store the tree scales as approximately n_samples / leaf_size.\n",
" | For a specified ``leaf_size``, a leaf node is guaranteed to\n",
" | satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in\n",
" | the case that ``n_samples < leaf_size``.\n",
" | \n",
" | metric : string or DistanceMetric object\n",
" | the distance metric to use for the tree. Default='minkowski'\n",
" | with p=2 (that is, a euclidean metric). See the documentation\n",
" | of the DistanceMetric class for a list of available metrics.\n",
" | ball_tree.valid_metrics gives a list of the metrics which\n",
" | are valid for BallTree.\n",
" | \n",
" | Additional keywords are passed to the distance metric class.\n",
" | \n",
" | Attributes\n",
" | ----------\n",
" | data : np.ndarray\n",
" | The training data\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for k-nearest neighbors\n",
" | \n",
" | >>> import numpy as np\n",
" | \n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BallTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> dist, ind = tree.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | Pickle and Unpickle a tree. Note that the state of the tree is saved in the\n",
" | pickle operation: the tree needs not be rebuilt upon unpickling.\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> import pickle\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BallTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> s = pickle.dumps(tree) # doctest: +SKIP\n",
" | >>> tree_copy = pickle.loads(s) # doctest: +SKIP\n",
" | >>> dist, ind = tree_copy.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | Query for neighbors within a given radius\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BallTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> print tree.query_radius(X[0], r=0.3, count_only=True)\n",
" | 3\n",
" | >>> ind = tree.query_radius(X[0], r=0.3) # doctest: +SKIP\n",
" | >>> print ind # indices of neighbors within distance 0.3\n",
" | [3 0 1]\n",
" | \n",
" | \n",
" | Compute a gaussian kernel density estimate:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(1)\n",
" | >>> X = np.random.random((100, 3))\n",
" | >>> tree = BallTree(X) # doctest: +SKIP\n",
" | >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian')\n",
" | array([ 6.94114649, 7.83281226, 7.2071716 ])\n",
" | \n",
" | Compute a two-point auto-correlation function\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((30, 3))\n",
" | >>> r = np.linspace(0, 1, 5)\n",
" | >>> tree = BallTree(X) # doctest: +SKIP\n",
" | >>> tree.two_point_correlation(X, r)\n",
" | array([ 30, 62, 278, 580, 820])\n",
" | \n",
" | Method resolution order:\n",
" | BallTree\n",
" | BinaryTree\n",
" | __builtin__.object\n",
" | \n",
" | Data and other attributes defined here:\n",
" | \n",
" | __new__ = <built-in method __new__ of type object>\n",
" | T.__new__(S, ...) -> a new object with type S, a subtype of T\n",
" | \n",
" | __pyx_vtable__ = <capsule object NULL>\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from BinaryTree:\n",
" | \n",
" | __getstate__(...)\n",
" | get state for pickling\n",
" | \n",
" | __init__(...)\n",
" | x.__init__(...) initializes x; see help(type(x)) for signature\n",
" | \n",
" | __reduce__(...)\n",
" | reduce method used for pickling\n",
" | \n",
" | __setstate__(...)\n",
" | set state for pickling\n",
" | \n",
" | get_arrays(...)\n",
" | \n",
" | get_n_calls(...)\n",
" | \n",
" | get_tree_stats(...)\n",
" | \n",
" | kernel_density(...)\n",
" | kernel_density(self, X, h, kernel='gaussian', atol=0, rtol=1E-8,\n",
" | breadth_first=True, return_log=False)\n",
" | \n",
" | Compute the kernel density estimate at points X with the given kernel,\n",
" | using the distance metric specified at tree creation.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like\n",
" | An array of points to query. Last dimension should match dimension\n",
" | of training data.\n",
" | h : float\n",
" | the bandwidth of the kernel\n",
" | kernel : string\n",
" | specify the kernel to use. Options are\n",
" | - 'gaussian'\n",
" | - 'tophat'\n",
" | - 'epanechnikov'\n",
" | - 'exponential'\n",
" | - 'linear'\n",
" | - 'cosine'\n",
" | Default is kernel = 'gaussian'\n",
" | atol, rtol : float (default = 0)\n",
" | Specify the desired relative and absolute tolerance of the result.\n",
" | If the true result is K_true, then the returned result K_ret\n",
" | satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``\n",
" | The default is zero (i.e. machine precision) for both.\n",
" | breadth_first : boolean (default = False)\n",
" | if True, use a breadth-first search. If False (default) use a\n",
" | depth-first search. Breadth-first is generally faster for\n",
" | compact kernels and/or high tolerances.\n",
" | return_log : boolean (default = False)\n",
" | return the logarithm of the result. This can be more accurate\n",
" | than returning the result itself for narrow kernels.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | density : ndarray\n",
" | The array of (log)-density evaluations, shape = X.shape[:-1]\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Compute a gaussian kernel density estimate:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(1)\n",
" | >>> X = np.random.random((100, 3))\n",
" | >>> tree = BinaryTree(X) # doctest: +SKIP\n",
" | >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian')\n",
" | array([ 6.94114649, 7.83281226, 7.2071716 ])\n",
" | \n",
" | query(...)\n",
" | query(X, k=1, return_distance=True,\n",
" | dualtree=False, breadth_first=False)\n",
" | \n",
" | query the tree for the k nearest neighbors\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension self.dim\n",
" | An array of points to query\n",
" | k : integer (default = 1)\n",
" | The number of nearest neighbors to return\n",
" | return_distance : boolean (default = True)\n",
" | if True, return a tuple (d, i) of distances and indices\n",
" | if False, return array i\n",
" | dualtree : boolean (default = False)\n",
" | if True, use the dual tree formalism for the query: a tree is\n",
" | built for the query points, and the pair of trees is used to\n",
" | efficiently search this space. This can lead to better\n",
" | performance as the number of points grows large.\n",
" | breadth_first : boolean (default = False)\n",
" | if True, then query the nodes in a breadth-first manner.\n",
" | Otherwise, query the nodes in a depth-first manner.\n",
" | sort_results : boolean (default = True)\n",
" | if True, then distances and indices of each point are sorted\n",
" | on return, so that the first column contains the closest points.\n",
" | Otherwise, neighbors are returned in an arbitrary order.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | i : if return_distance == False\n",
" | (d,i) : if return_distance == True\n",
" | \n",
" | d : array of doubles - shape: x.shape[:-1] + (k,)\n",
" | each entry gives the list of distances to the\n",
" | neighbors of the corresponding point\n",
" | \n",
" | i : array of integers - shape: x.shape[:-1] + (k,)\n",
" | each entry gives the list of indices of\n",
" | neighbors of the corresponding point\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for k-nearest neighbors\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BinaryTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> dist, ind = tree.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | query_radius(...)\n",
" | query_radius(self, X, r, count_only = False):\n",
" | \n",
" | query the tree for neighbors within a radius r\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension self.dim\n",
" | An array of points to query\n",
" | r : distance within which neighbors are returned\n",
" | r can be a single value, or an array of values of shape\n",
" | x.shape[:-1] if different radii are desired for each point.\n",
" | return_distance : boolean (default = False)\n",
" | if True, return distances to neighbors of each point\n",
" | if False, return only neighbors\n",
" | Note that unlike the query() method, setting return_distance=True\n",
" | here adds to the computation time. Not all distances need to be\n",
" | calculated explicitly for return_distance=False. Results are\n",
" | not sorted by default: see ``sort_results`` keyword.\n",
" | count_only : boolean (default = False)\n",
" | if True, return only the count of points within distance r\n",
" | if False, return the indices of all points within distance r\n",
" | If return_distance==True, setting count_only=True will\n",
" | result in an error.\n",
" | sort_results : boolean (default = False)\n",
" | if True, the distances and indices will be sorted before being\n",
" | returned. If False, the results will not be sorted. If\n",
" | return_distance == False, setting sort_results = True will\n",
" | result in an error.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | count : if count_only == True\n",
" | ind : if count_only == False and return_distance == False\n",
" | (ind, dist) : if count_only == False and return_distance == True\n",
" | \n",
" | count : array of integers, shape = X.shape[:-1]\n",
" | each entry gives the number of neighbors within\n",
" | a distance r of the corresponding point.\n",
" | \n",
" | ind : array of objects, shape = X.shape[:-1]\n",
" | each element is a numpy integer array listing the indices of\n",
" | neighbors of the corresponding point. Note that unlike\n",
" | the results of a k-neighbors query, the returned neighbors\n",
" | are not sorted by distance by default.\n",
" | \n",
" | dist : array of objects, shape = X.shape[:-1]\n",
" | each element is a numpy double array\n",
" | listing the distances corresponding to indices in i.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for neighbors in a given radius\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BinaryTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> print tree.query_radius(X[0], r=0.3, count_only=True)\n",
" | 3\n",
" | >>> ind = tree.query_radius(X[0], r=0.3) # doctest: +SKIP\n",
" | >>> print ind # indices of neighbors within distance 0.3\n",
" | [3 0 1]\n",
" | \n",
" | reset_n_calls(...)\n",
" | \n",
" | two_point_correlation(...)\n",
" | Compute the two-point correlation function\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like\n",
" | An array of points to query. Last dimension should match dimension\n",
" | of training data.\n",
" | r : array_like\n",
" | A one-dimensional array of distances\n",
" | dualtree : boolean (default = False)\n",
" | If true, use a dualtree algorithm. Otherwise, use a single-tree\n",
" | algorithm. Dual tree algorithms can have better scaling for\n",
" | large N.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | counts : ndarray\n",
" | counts[i] contains the number of pairs of points with distance\n",
" | less than or equal to r[i]\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Compute the two-point autocorrelation function of X:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((30, 3))\n",
" | >>> r = np.linspace(0, 1, 5)\n",
" | >>> tree = BinaryTree(X) # doctest: +SKIP\n",
" | >>> tree.two_point_correlation(X, r)\n",
" | array([ 30, 62, 278, 580, 820])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from BinaryTree:\n",
" | \n",
" | data\n",
" | \n",
" | idx_array\n",
" | \n",
" | node_bounds\n",
" | \n",
" | node_data\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes inherited from BinaryTree:\n",
" | \n",
" | valid_metrics = ['chebyshev', 'sokalmichener', 'canberra', 'haversine'...\n",
" \n",
" class DistanceMetric(__builtin__.object)\n",
" | DistanceMetric class\n",
" | \n",
" | This class provides a uniform interface to fast distance metric\n",
" | functions. The various metrics can be accessed via the `get_metric`\n",
" | class method and the metric string identifier (see below).\n",
" | For example, to use the Euclidean distance:\n",
" | \n",
" | >>> dist = DistanceMetric.get_metric('euclidean')\n",
" | >>> X = [[0, 1, 2],\n",
" | [3, 4, 5]])\n",
" | >>> dist.pairwise(X)\n",
" | array([[ 0. , 5.19615242],\n",
" | [ 5.19615242, 0. ]])\n",
" | \n",
" | Available Metrics\n",
" | The following lists the string metric identifiers and the associated\n",
" | distance metric classes:\n",
" | \n",
" | **Metrics intended for real-valued vector spaces:**\n",
" | \n",
" | ============== ==================== ======== ===============================\n",
" | identifier class name args distance function\n",
" | -------------- -------------------- -------- -------------------------------\n",
" | \"euclidean\" EuclideanDistance - ``sqrt(sum((x - y)^2))``\n",
" | \"manhattan\" ManhattanDistance - ``sum(|x - y|)``\n",
" | \"chebyshev\" ChebyshevDistance - ``sum(max(|x - y|))``\n",
" | \"minkowski\" MinkowskiDistance p ``sum(|x - y|^p)^(1/p)``\n",
" | \"wminkowski\" WMinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)``\n",
" | \"seuclidean\" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))``\n",
" | \"mahalanobis\" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))``\n",
" | ============== ==================== ======== ===============================\n",
" | \n",
" | **Metrics intended for two-dimensional vector spaces:**\n",
" | \n",
" | ============ ================== ========================================\n",
" | identifier class name distance function\n",
" | ------------ ------------------ ----------------------------------------\n",
" | \"haversine\" HaversineDistance 2 arcsin(sqrt(sin^2(0.5*dx)\n",
" | + cos(x1)cos(x2)sin^2(0.5*dy)))\n",
" | ============ ================== ========================================\n",
" | \n",
" | \n",
" | **Metrics intended for integer-valued vector spaces:** Though intended\n",
" | for integer-valued vectors, these are also valid metrics in the case of\n",
" | real-valued vectors.\n",
" | \n",
" | ============= ==================== ========================================\n",
" | identifier class name distance function\n",
" | ------------- -------------------- ----------------------------------------\n",
" | \"hamming\" HammingDistance ``N_unequal(x, y) / N_tot``\n",
" | \"canberra\" CanberraDistance ``sum(|x - y| / (|x| + |y|))``\n",
" | \"braycurtis\" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))``\n",
" | ============= ==================== ========================================\n",
" | \n",
" | **Metrics intended for boolean-valued vector spaces:** Any nonzero entry\n",
" | is evaluated to \"True\". In the listings below, the following\n",
" | abbreviations are used:\n",
" | \n",
" | - N : number of dimensions\n",
" | - NTT : number of dims in which both values are True\n",
" | - NTF : number of dims in which the first value is True, second is False\n",
" | - NFT : number of dims in which the first value is False, second is True\n",
" | - NFF : number of dims in which both values are False\n",
" | - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT\n",
" | - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT\n",
" | \n",
" | ================= ======================= ===============================\n",
" | identifier class name distance function\n",
" | ----------------- ----------------------- -------------------------------\n",
" | \"jaccard\" JaccardDistance NNEQ / NNZ\n",
" | \"maching\" MatchingDistance NNEQ / N\n",
" | \"dice\" DiceDistance NNEQ / (NTT + NNZ)\n",
" | \"kulsinski\" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N)\n",
" | \"rogerstanimoto\" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ)\n",
" | \"russellrao\" RussellRaoDistance NNZ / N\n",
" | \"sokalmichener\" SokalMichenerDistance 2 * NNEQ / (N + NNEQ)\n",
" | \"sokalsneath\" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT)\n",
" | ================= ======================= ===============================\n",
" | \n",
" | **User-defined distance:**\n",
" | \n",
" | =========== =============== =======\n",
" | identifier class name args\n",
" | ----------- --------------- -------\n",
" | \"pyfunc\" PyFuncDistance func\n",
" | =========== =============== =======\n",
" | \n",
" | Here ``func`` is a function which takes two one-dimensional numpy\n",
" | arrays, and returns a distance. Note that in order to be used within\n",
" | the BallTree, the distance must be a true metric:\n",
" | i.e. it must satisfy the following properties\n",
" | \n",
" | 1) Non-negativity: d(x, y) >= 0\n",
" | 2) Identity: d(x, y) = 0 if and only if x == y\n",
" | 3) Symmetry: d(x, y) = d(y, x)\n",
" | 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)\n",
" | \n",
" | Because of the Python object overhead involved in calling the python\n",
" | function, this will be fairly slow, but it will have the same\n",
" | scaling as other distances.\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __getstate__(...)\n",
" | get state for pickling\n",
" | \n",
" | __init__(...)\n",
" | x.__init__(...) initializes x; see help(type(x)) for signature\n",
" | \n",
" | __reduce__(...)\n",
" | reduce method used for pickling\n",
" | \n",
" | __setstate__(...)\n",
" | set state for pickling\n",
" | \n",
" | dist_to_rdist(...)\n",
" | Convert the true distance to the reduced distance.\n",
" | \n",
" | The reduced distance, defined for some metrics, is a computationally\n",
" | more efficent measure which preserves the rank of the true distance.\n",
" | For example, in the Euclidean distance metric, the reduced distance\n",
" | is the squared-euclidean distance.\n",
" | \n",
" | get_metric(...)\n",
" | Get the given distance metric from the string identifier.\n",
" | \n",
" | See the docstring of DistanceMetric for a list of available metrics.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | metric : string or class name\n",
" | The distance metric to use\n",
" | **kwargs\n",
" | additional arguments will be passed to the requested metric\n",
" | \n",
" | pairwise(...)\n",
" | Compute the pairwise distances between X and Y\n",
" | \n",
" | This is a convenience routine for the sake of testing. For many\n",
" | metrics, the utilities in scipy.spatial.distance.cdist and\n",
" | scipy.spatial.distance.pdist will be faster.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like\n",
" | Array of shape (Nx, D), representing Nx points in D dimensions.\n",
" | Y : array_like (optional)\n",
" | Array of shape (Ny, D), representing Ny points in D dimensions.\n",
" | If not specified, then Y=X.\n",
" | Returns\n",
" | -------\n",
" | dist : ndarray\n",
" | The shape (Nx, Ny) array of pairwise distances between points in\n",
" | X and Y.\n",
" | \n",
" | rdist_to_dist(...)\n",
" | Convert the Reduced distance to the true distance.\n",
" | \n",
" | The reduced distance, defined for some metrics, is a computationally\n",
" | more efficent measure which preserves the rank of the true distance.\n",
" | For example, in the Euclidean distance metric, the reduced distance\n",
" | is the squared-euclidean distance.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __new__ = <built-in method __new__ of type object>\n",
" | T.__new__(S, ...) -> a new object with type S, a subtype of T\n",
" | \n",
" | __pyx_vtable__ = <capsule object NULL>\n",
" \n",
" class KDTree(BinaryTree)\n",
" | KDTree for fast generalized N-point problems\n",
" | \n",
" | KDTree(X, leaf_size=40, metric='minkowski', \\**kwargs)\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features]\n",
" | n_samples is the number of points in the data set, and\n",
" | n_features is the dimension of the parameter space.\n",
" | Note: if X is a C-contiguous array of doubles then data will\n",
" | not be copied. Otherwise, an internal copy will be made.\n",
" | \n",
" | leaf_size : positive integer (default = 20)\n",
" | Number of points at which to switch to brute-force. Changing\n",
" | leaf_size will not affect the results of a query, but can\n",
" | significantly impact the speed of a query and the memory required\n",
" | to store the constructed tree. The amount of memory needed to\n",
" | store the tree scales as approximately n_samples / leaf_size.\n",
" | For a specified ``leaf_size``, a leaf node is guaranteed to\n",
" | satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in\n",
" | the case that ``n_samples < leaf_size``.\n",
" | \n",
" | metric : string or DistanceMetric object\n",
" | the distance metric to use for the tree. Default='minkowski'\n",
" | with p=2 (that is, a euclidean metric). See the documentation\n",
" | of the DistanceMetric class for a list of available metrics.\n",
" | kd_tree.valid_metrics gives a list of the metrics which\n",
" | are valid for KDTree.\n",
" | \n",
" | Additional keywords are passed to the distance metric class.\n",
" | \n",
" | Attributes\n",
" | ----------\n",
" | data : np.ndarray\n",
" | The training data\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for k-nearest neighbors\n",
" | \n",
" | >>> import numpy as np\n",
" | \n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = KDTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> dist, ind = tree.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | Pickle and Unpickle a tree. Note that the state of the tree is saved in the\n",
" | pickle operation: the tree needs not be rebuilt upon unpickling.\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> import pickle\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = KDTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> s = pickle.dumps(tree) # doctest: +SKIP\n",
" | >>> tree_copy = pickle.loads(s) # doctest: +SKIP\n",
" | >>> dist, ind = tree_copy.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | Query for neighbors within a given radius\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = KDTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> print tree.query_radius(X[0], r=0.3, count_only=True)\n",
" | 3\n",
" | >>> ind = tree.query_radius(X[0], r=0.3) # doctest: +SKIP\n",
" | >>> print ind # indices of neighbors within distance 0.3\n",
" | [3 0 1]\n",
" | \n",
" | \n",
" | Compute a gaussian kernel density estimate:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(1)\n",
" | >>> X = np.random.random((100, 3))\n",
" | >>> tree = KDTree(X) # doctest: +SKIP\n",
" | >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian')\n",
" | array([ 6.94114649, 7.83281226, 7.2071716 ])\n",
" | \n",
" | Compute a two-point auto-correlation function\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((30, 3))\n",
" | >>> r = np.linspace(0, 1, 5)\n",
" | >>> tree = KDTree(X) # doctest: +SKIP\n",
" | >>> tree.two_point_correlation(X, r)\n",
" | array([ 30, 62, 278, 580, 820])\n",
" | \n",
" | Method resolution order:\n",
" | KDTree\n",
" | BinaryTree\n",
" | __builtin__.object\n",
" | \n",
" | Data and other attributes defined here:\n",
" | \n",
" | __new__ = <built-in method __new__ of type object>\n",
" | T.__new__(S, ...) -> a new object with type S, a subtype of T\n",
" | \n",
" | __pyx_vtable__ = <capsule object NULL>\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from BinaryTree:\n",
" | \n",
" | __getstate__(...)\n",
" | get state for pickling\n",
" | \n",
" | __init__(...)\n",
" | x.__init__(...) initializes x; see help(type(x)) for signature\n",
" | \n",
" | __reduce__(...)\n",
" | reduce method used for pickling\n",
" | \n",
" | __setstate__(...)\n",
" | set state for pickling\n",
" | \n",
" | get_arrays(...)\n",
" | \n",
" | get_n_calls(...)\n",
" | \n",
" | get_tree_stats(...)\n",
" | \n",
" | kernel_density(...)\n",
" | kernel_density(self, X, h, kernel='gaussian', atol=0, rtol=1E-8,\n",
" | breadth_first=True, return_log=False)\n",
" | \n",
" | Compute the kernel density estimate at points X with the given kernel,\n",
" | using the distance metric specified at tree creation.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like\n",
" | An array of points to query. Last dimension should match dimension\n",
" | of training data.\n",
" | h : float\n",
" | the bandwidth of the kernel\n",
" | kernel : string\n",
" | specify the kernel to use. Options are\n",
" | - 'gaussian'\n",
" | - 'tophat'\n",
" | - 'epanechnikov'\n",
" | - 'exponential'\n",
" | - 'linear'\n",
" | - 'cosine'\n",
" | Default is kernel = 'gaussian'\n",
" | atol, rtol : float (default = 0)\n",
" | Specify the desired relative and absolute tolerance of the result.\n",
" | If the true result is K_true, then the returned result K_ret\n",
" | satisfies ``abs(K_true - K_ret) < atol + rtol * K_ret``\n",
" | The default is zero (i.e. machine precision) for both.\n",
" | breadth_first : boolean (default = False)\n",
" | if True, use a breadth-first search. If False (default) use a\n",
" | depth-first search. Breadth-first is generally faster for\n",
" | compact kernels and/or high tolerances.\n",
" | return_log : boolean (default = False)\n",
" | return the logarithm of the result. This can be more accurate\n",
" | than returning the result itself for narrow kernels.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | density : ndarray\n",
" | The array of (log)-density evaluations, shape = X.shape[:-1]\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Compute a gaussian kernel density estimate:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(1)\n",
" | >>> X = np.random.random((100, 3))\n",
" | >>> tree = BinaryTree(X) # doctest: +SKIP\n",
" | >>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian')\n",
" | array([ 6.94114649, 7.83281226, 7.2071716 ])\n",
" | \n",
" | query(...)\n",
" | query(X, k=1, return_distance=True,\n",
" | dualtree=False, breadth_first=False)\n",
" | \n",
" | query the tree for the k nearest neighbors\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension self.dim\n",
" | An array of points to query\n",
" | k : integer (default = 1)\n",
" | The number of nearest neighbors to return\n",
" | return_distance : boolean (default = True)\n",
" | if True, return a tuple (d, i) of distances and indices\n",
" | if False, return array i\n",
" | dualtree : boolean (default = False)\n",
" | if True, use the dual tree formalism for the query: a tree is\n",
" | built for the query points, and the pair of trees is used to\n",
" | efficiently search this space. This can lead to better\n",
" | performance as the number of points grows large.\n",
" | breadth_first : boolean (default = False)\n",
" | if True, then query the nodes in a breadth-first manner.\n",
" | Otherwise, query the nodes in a depth-first manner.\n",
" | sort_results : boolean (default = True)\n",
" | if True, then distances and indices of each point are sorted\n",
" | on return, so that the first column contains the closest points.\n",
" | Otherwise, neighbors are returned in an arbitrary order.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | i : if return_distance == False\n",
" | (d,i) : if return_distance == True\n",
" | \n",
" | d : array of doubles - shape: x.shape[:-1] + (k,)\n",
" | each entry gives the list of distances to the\n",
" | neighbors of the corresponding point\n",
" | \n",
" | i : array of integers - shape: x.shape[:-1] + (k,)\n",
" | each entry gives the list of indices of\n",
" | neighbors of the corresponding point\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for k-nearest neighbors\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BinaryTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> dist, ind = tree.query(X[0], k=3) # doctest: +SKIP\n",
" | >>> print ind # indices of 3 closest neighbors\n",
" | [0 3 1]\n",
" | >>> print dist # distances to 3 closest neighbors\n",
" | [ 0. 0.19662693 0.29473397]\n",
" | \n",
" | query_radius(...)\n",
" | query_radius(self, X, r, count_only = False):\n",
" | \n",
" | query the tree for neighbors within a radius r\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension self.dim\n",
" | An array of points to query\n",
" | r : distance within which neighbors are returned\n",
" | r can be a single value, or an array of values of shape\n",
" | x.shape[:-1] if different radii are desired for each point.\n",
" | return_distance : boolean (default = False)\n",
" | if True, return distances to neighbors of each point\n",
" | if False, return only neighbors\n",
" | Note that unlike the query() method, setting return_distance=True\n",
" | here adds to the computation time. Not all distances need to be\n",
" | calculated explicitly for return_distance=False. Results are\n",
" | not sorted by default: see ``sort_results`` keyword.\n",
" | count_only : boolean (default = False)\n",
" | if True, return only the count of points within distance r\n",
" | if False, return the indices of all points within distance r\n",
" | If return_distance==True, setting count_only=True will\n",
" | result in an error.\n",
" | sort_results : boolean (default = False)\n",
" | if True, the distances and indices will be sorted before being\n",
" | returned. If False, the results will not be sorted. If\n",
" | return_distance == False, setting sort_results = True will\n",
" | result in an error.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | count : if count_only == True\n",
" | ind : if count_only == False and return_distance == False\n",
" | (ind, dist) : if count_only == False and return_distance == True\n",
" | \n",
" | count : array of integers, shape = X.shape[:-1]\n",
" | each entry gives the number of neighbors within\n",
" | a distance r of the corresponding point.\n",
" | \n",
" | ind : array of objects, shape = X.shape[:-1]\n",
" | each element is a numpy integer array listing the indices of\n",
" | neighbors of the corresponding point. Note that unlike\n",
" | the results of a k-neighbors query, the returned neighbors\n",
" | are not sorted by distance by default.\n",
" | \n",
" | dist : array of objects, shape = X.shape[:-1]\n",
" | each element is a numpy double array\n",
" | listing the distances corresponding to indices in i.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Query for neighbors in a given radius\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((10, 3)) # 10 points in 3 dimensions\n",
" | >>> tree = BinaryTree(X, leaf_size=2) # doctest: +SKIP\n",
" | >>> print tree.query_radius(X[0], r=0.3, count_only=True)\n",
" | 3\n",
" | >>> ind = tree.query_radius(X[0], r=0.3) # doctest: +SKIP\n",
" | >>> print ind # indices of neighbors within distance 0.3\n",
" | [3 0 1]\n",
" | \n",
" | reset_n_calls(...)\n",
" | \n",
" | two_point_correlation(...)\n",
" | Compute the two-point correlation function\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like\n",
" | An array of points to query. Last dimension should match dimension\n",
" | of training data.\n",
" | r : array_like\n",
" | A one-dimensional array of distances\n",
" | dualtree : boolean (default = False)\n",
" | If true, use a dualtree algorithm. Otherwise, use a single-tree\n",
" | algorithm. Dual tree algorithms can have better scaling for\n",
" | large N.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | counts : ndarray\n",
" | counts[i] contains the number of pairs of points with distance\n",
" | less than or equal to r[i]\n",
" | \n",
" | Examples\n",
" | --------\n",
" | Compute the two-point autocorrelation function of X:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> np.random.seed(0)\n",
" | >>> X = np.random.random((30, 3))\n",
" | >>> r = np.linspace(0, 1, 5)\n",
" | >>> tree = BinaryTree(X) # doctest: +SKIP\n",
" | >>> tree.two_point_correlation(X, r)\n",
" | array([ 30, 62, 278, 580, 820])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from BinaryTree:\n",
" | \n",
" | data\n",
" | \n",
" | idx_array\n",
" | \n",
" | node_bounds\n",
" | \n",
" | node_data\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes inherited from BinaryTree:\n",
" | \n",
" | valid_metrics = ['chebyshev', 'euclidean', 'cityblock', 'manhattan', '...\n",
" \n",
" class KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" | Classifier implementing the k-nearest neighbors vote.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | n_neighbors : int, optional (default = 5)\n",
" | Number of neighbors to use by default for :meth:`k_neighbors` queries.\n",
" | \n",
" | weights : str or callable\n",
" | weight function used in prediction. Possible values:\n",
" | \n",
" | - 'uniform' : uniform weights. All points in each neighborhood\n",
" | are weighted equally.\n",
" | - 'distance' : weight points by the inverse of their distance.\n",
" | in this case, closer neighbors of a query point will have a\n",
" | greater influence than neighbors which are further away.\n",
" | - [callable] : a user-defined function which accepts an\n",
" | array of distances, and returns an array of the same shape\n",
" | containing the weights.\n",
" | \n",
" | Uniform weights are used by default.\n",
" | \n",
" | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional\n",
" | Algorithm used to compute the nearest neighbors:\n",
" | \n",
" | - 'ball_tree' will use :class:`BallTree`\n",
" | - 'kd_tree' will use :class:`KDTree`\n",
" | - 'brute' will use a brute-force search.\n",
" | - 'auto' will attempt to decide the most appropriate algorithm\n",
" | based on the values passed to :meth:`fit` method.\n",
" | \n",
" | Note: fitting on sparse input will override the setting of\n",
" | this parameter, using brute force.\n",
" | \n",
" | leaf_size : int, optional (default = 30)\n",
" | Leaf size passed to BallTree or KDTree. This can affect the\n",
" | speed of the construction and query, as well as the memory\n",
" | required to store the tree. The optimal value depends on the\n",
" | nature of the problem.\n",
" | \n",
" | metric : string or DistanceMetric object (default = 'minkowski')\n",
" | the distance metric to use for the tree. The default metric is\n",
" | minkowski, and with p=2 is equivalent to the standard Euclidean\n",
" | metric. See the documentation of the DistanceMetric class for a\n",
" | list of available metrics.\n",
" | \n",
" | p : integer, optional (default = 2)\n",
" | Power parameter for the Minkowski metric. When p = 1, this is\n",
" | equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" | \n",
" | metric_params: dict, optional (default = None)\n",
" | additional keyword arguments for the metric function.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [1], [2], [3]]\n",
" | >>> y = [0, 0, 1, 1]\n",
" | >>> from sklearn.neighbors import KNeighborsClassifier\n",
" | >>> neigh = KNeighborsClassifier(n_neighbors=3)\n",
" | >>> neigh.fit(X, y) # doctest: +ELLIPSIS\n",
" | KNeighborsClassifier(...)\n",
" | >>> print(neigh.predict([[1.1]]))\n",
" | [0]\n",
" | >>> print(neigh.predict_proba([[0.9]]))\n",
" | [[ 0.66666667 0.33333333]]\n",
" | \n",
" | See also\n",
" | --------\n",
" | RadiusNeighborsClassifier\n",
" | KNeighborsRegressor\n",
" | RadiusNeighborsRegressor\n",
" | NearestNeighbors\n",
" | \n",
" | Notes\n",
" | -----\n",
" | See :ref:`Nearest Neighbors <neighbors>` in the online documentation\n",
" | for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n",
" | \n",
" | .. warning::\n",
" | \n",
" | Regarding the Nearest Neighbors algorithms, if it is found that two\n",
" | neighbors, neighbor `k+1` and `k`, have identical distances but\n",
" | but different labels, the results will depend on the ordering of the\n",
" | training data.\n",
" | \n",
" | http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n",
" | \n",
" | Method resolution order:\n",
" | KNeighborsClassifier\n",
" | sklearn.neighbors.base.NeighborsBase\n",
" | abc.NewBase\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.KNeighborsMixin\n",
" | sklearn.neighbors.base.SupervisedIntegerMixin\n",
" | sklearn.base.ClassifierMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, **kwargs)\n",
" | \n",
" | predict(self, X)\n",
" | Predict the class labels for the provided data\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array of shape [n_samples, n_features]\n",
" | A 2-D array representing the test points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | y : array of shape [n_samples] or [n_samples, n_outputs]\n",
" | Class labels for each data sample.\n",
" | \n",
" | predict_proba(self, X)\n",
" | Return probability estimates for the test data X.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array, shape = (n_samples, n_features)\n",
" | A 2-D array representing the test points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | p : array of shape = [n_samples, n_classes], or a list of n_outputs\n",
" | of such arrays if n_outputs > 1.\n",
" | The class probabilities of the input samples. Classes are ordered\n",
" | by lexicographic order.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __abstractmethods__ = frozenset([])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.KNeighborsMixin:\n",
" | \n",
" | kneighbors(self, X=None, n_neighbors=None, return_distance=True)\n",
" | Finds the K-neighbors of a point.\n",
" | \n",
" | Returns distance\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors to get (default is the value\n",
" | passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array\n",
" | Array representing the lengths to points, only present if\n",
" | return_distance=True\n",
" | \n",
" | ind : array\n",
" | Indices of the nearest points in the population matrix.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1,1,1]\n",
" | \n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=1)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> print(neigh.kneighbors([1., 1., 1.])) # doctest: +ELLIPSIS\n",
" | (array([[ 0.5]]), array([[2]]...))\n",
" | \n",
" | As you can see, it returns [[0.5]], and [[2]], which means that the\n",
" | element is at distance 0.5 and is the third element of samples\n",
" | (indexes start at 0). You can also query for multiple points:\n",
" | \n",
" | >>> X = [[0., 1., 0.], [1., 0., 1.]]\n",
" | >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS\n",
" | array([[1],\n",
" | [2]]...)\n",
" | \n",
" | kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity')\n",
" | Computes the (weighted) graph of k-Neighbors for points in X\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors for each sample.\n",
" | (default is value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]\n",
" | n_samples_fit is the number of samples in the fitted data\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=2)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.kneighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 1.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors.radius_neighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.SupervisedIntegerMixin:\n",
" | \n",
" | fit(self, X, y)\n",
" | Fit the model using X as training data and y as target values\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix, BallTree, KDTree}\n",
" | Training data. If array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | y : {array-like, sparse matrix}\n",
" | Target values of shape = [n_samples] or [n_samples, n_outputs]\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.ClassifierMixin:\n",
" | \n",
" | score(self, X, y, sample_weight=None)\n",
" | Returns the mean accuracy on the given test data and labels.\n",
" | \n",
" | In multi-label classification, this is the subset accuracy\n",
" | which is a harsh metric since you require for each sample that\n",
" | each label set be correctly predicted.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = (n_samples, n_features)\n",
" | Test samples.\n",
" | \n",
" | y : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
" | True labels for X.\n",
" | \n",
" | sample_weight : array-like, shape = [n_samples], optional\n",
" | Sample weights.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | score : float\n",
" | Mean accuracy of self.predict(X) wrt. y.\n",
" \n",
" class KNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" | Regression based on k-nearest neighbors.\n",
" | \n",
" | The target is predicted by local interpolation of the targets\n",
" | associated of the nearest neighbors in the training set.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | n_neighbors : int, optional (default = 5)\n",
" | Number of neighbors to use by default for :meth:`k_neighbors` queries.\n",
" | \n",
" | weights : str or callable\n",
" | weight function used in prediction. Possible values:\n",
" | \n",
" | - 'uniform' : uniform weights. All points in each neighborhood\n",
" | are weighted equally.\n",
" | - 'distance' : weight points by the inverse of their distance.\n",
" | in this case, closer neighbors of a query point will have a\n",
" | greater influence than neighbors which are further away.\n",
" | - [callable] : a user-defined function which accepts an\n",
" | array of distances, and returns an array of the same shape\n",
" | containing the weights.\n",
" | \n",
" | Uniform weights are used by default.\n",
" | \n",
" | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional\n",
" | Algorithm used to compute the nearest neighbors:\n",
" | \n",
" | - 'ball_tree' will use :class:`BallTree`\n",
" | - 'kd_tree' will use :class:`KDtree`\n",
" | - 'brute' will use a brute-force search.\n",
" | - 'auto' will attempt to decide the most appropriate algorithm\n",
" | based on the values passed to :meth:`fit` method.\n",
" | \n",
" | Note: fitting on sparse input will override the setting of\n",
" | this parameter, using brute force.\n",
" | \n",
" | leaf_size : int, optional (default = 30)\n",
" | Leaf size passed to BallTree or KDTree. This can affect the\n",
" | speed of the construction and query, as well as the memory\n",
" | required to store the tree. The optimal value depends on the\n",
" | nature of the problem.\n",
" | \n",
" | metric : string or DistanceMetric object (default='minkowski')\n",
" | the distance metric to use for the tree. The default metric is\n",
" | minkowski, and with p=2 is equivalent to the standard Euclidean\n",
" | metric. See the documentation of the DistanceMetric class for a\n",
" | list of available metrics.\n",
" | \n",
" | p : integer, optional (default = 2)\n",
" | Power parameter for the Minkowski metric. When p = 1, this is\n",
" | equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" | \n",
" | metric_params: dict, optional (default = None)\n",
" | additional keyword arguments for the metric function.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [1], [2], [3]]\n",
" | >>> y = [0, 0, 1, 1]\n",
" | >>> from sklearn.neighbors import KNeighborsRegressor\n",
" | >>> neigh = KNeighborsRegressor(n_neighbors=2)\n",
" | >>> neigh.fit(X, y) # doctest: +ELLIPSIS\n",
" | KNeighborsRegressor(...)\n",
" | >>> print(neigh.predict([[1.5]]))\n",
" | [ 0.5]\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors\n",
" | RadiusNeighborsRegressor\n",
" | KNeighborsClassifier\n",
" | RadiusNeighborsClassifier\n",
" | \n",
" | Notes\n",
" | -----\n",
" | See :ref:`Nearest Neighbors <neighbors>` in the online documentation\n",
" | for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n",
" | \n",
" | .. warning::\n",
" | \n",
" | Regarding the Nearest Neighbors algorithms, if it is found that two\n",
" | neighbors, neighbor `k+1` and `k`, have identical distances but\n",
" | but different labels, the results will depend on the ordering of the\n",
" | training data.\n",
" | \n",
" | http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n",
" | \n",
" | Method resolution order:\n",
" | KNeighborsRegressor\n",
" | sklearn.neighbors.base.NeighborsBase\n",
" | abc.NewBase\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.KNeighborsMixin\n",
" | sklearn.neighbors.base.SupervisedFloatMixin\n",
" | sklearn.base.RegressorMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, **kwargs)\n",
" | \n",
" | predict(self, X)\n",
" | Predict the target for the provided data\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | \n",
" | Returns\n",
" | -------\n",
" | y : array of int, shape = [n_samples] or [n_samples, n_outputs]\n",
" | Target values\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __abstractmethods__ = frozenset([])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.KNeighborsMixin:\n",
" | \n",
" | kneighbors(self, X=None, n_neighbors=None, return_distance=True)\n",
" | Finds the K-neighbors of a point.\n",
" | \n",
" | Returns distance\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors to get (default is the value\n",
" | passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array\n",
" | Array representing the lengths to points, only present if\n",
" | return_distance=True\n",
" | \n",
" | ind : array\n",
" | Indices of the nearest points in the population matrix.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1,1,1]\n",
" | \n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=1)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> print(neigh.kneighbors([1., 1., 1.])) # doctest: +ELLIPSIS\n",
" | (array([[ 0.5]]), array([[2]]...))\n",
" | \n",
" | As you can see, it returns [[0.5]], and [[2]], which means that the\n",
" | element is at distance 0.5 and is the third element of samples\n",
" | (indexes start at 0). You can also query for multiple points:\n",
" | \n",
" | >>> X = [[0., 1., 0.], [1., 0., 1.]]\n",
" | >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS\n",
" | array([[1],\n",
" | [2]]...)\n",
" | \n",
" | kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity')\n",
" | Computes the (weighted) graph of k-Neighbors for points in X\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors for each sample.\n",
" | (default is value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]\n",
" | n_samples_fit is the number of samples in the fitted data\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=2)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.kneighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 1.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors.radius_neighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.SupervisedFloatMixin:\n",
" | \n",
" | fit(self, X, y)\n",
" | Fit the model using X as training data and y as target values\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix, BallTree, KDTree}\n",
" | Training data. If array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | y : {array-like, sparse matrix}\n",
" | Target values, array of float values, shape = [n_samples]\n",
" | or [n_samples, n_outputs]\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.RegressorMixin:\n",
" | \n",
" | score(self, X, y, sample_weight=None)\n",
" | Returns the coefficient of determination R^2 of the prediction.\n",
" | \n",
" | The coefficient R^2 is defined as (1 - u/v), where u is the regression\n",
" | sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual\n",
" | sum of squares ((y_true - y_true.mean()) ** 2).sum().\n",
" | Best possible score is 1.0, lower values are worse.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = (n_samples, n_features)\n",
" | Test samples.\n",
" | \n",
" | y : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
" | True values for X.\n",
" | \n",
" | sample_weight : array-like, shape = [n_samples], optional\n",
" | Sample weights.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | score : float\n",
" | R^2 of self.predict(X) wrt. y.\n",
" \n",
" class KernelDensity(sklearn.base.BaseEstimator)\n",
" | Kernel Density Estimation\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | bandwidth : float\n",
" | The bandwidth of the kernel.\n",
" | \n",
" | algorithm : string\n",
" | The tree algorithm to use. Valid options are\n",
" | ['kd_tree'|'ball_tree'|'auto']. Default is 'auto'.\n",
" | \n",
" | kernel : string\n",
" | The kernel to use. Valid kernels are\n",
" | ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']\n",
" | Default is 'gaussian'.\n",
" | \n",
" | metric : string\n",
" | The distance metric to use. Note that not all metrics are\n",
" | valid with all algorithms. Refer to the documentation of\n",
" | :class:`BallTree` and :class:`KDTree` for a description of\n",
" | available algorithms. Note that the normalization of the density\n",
" | output is correct only for the Euclidean distance metric. Default\n",
" | is 'euclidean'.\n",
" | \n",
" | atol : float\n",
" | The desired absolute tolerance of the result. A larger tolerance will\n",
" | generally lead to faster execution. Default is 0.\n",
" | \n",
" | rtol : float\n",
" | The desired relative tolerance of the result. A larger tolerance will\n",
" | generally lead to faster execution. Default is 1E-8.\n",
" | \n",
" | breadth_first : boolean\n",
" | If true (default), use a breadth-first approach to the problem.\n",
" | Otherwise use a depth-first approach.\n",
" | \n",
" | leaf_size : int\n",
" | Specify the leaf size of the underlying tree. See :class:`BallTree`\n",
" | or :class:`KDTree` for details. Default is 40.\n",
" | \n",
" | metric_params : dict\n",
" | Additional parameters to be passed to the tree for use with the\n",
" | metric. For more information, see the documentation of\n",
" | :class:`BallTree` or :class:`KDTree`.\n",
" | \n",
" | Method resolution order:\n",
" | KernelDensity\n",
" | sklearn.base.BaseEstimator\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric='euclidean', atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None)\n",
" | \n",
" | fit(self, X, y=None)\n",
" | Fit the Kernel Density model on the data.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like, shape (n_samples, n_features)\n",
" | List of n_features-dimensional data points. Each row\n",
" | corresponds to a single data point.\n",
" | \n",
" | sample(self, n_samples=1, random_state=None)\n",
" | Generate random samples from the model.\n",
" | \n",
" | Currently, this is implemented only for gaussian and tophat kernels.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | n_samples : int, optional\n",
" | Number of samples to generate. Defaults to 1.\n",
" | \n",
" | random_state : RandomState or an int seed (0 by default)\n",
" | A random number generator instance.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | X : array_like, shape (n_samples, n_features)\n",
" | List of samples.\n",
" | \n",
" | score(self, X, y=None)\n",
" | Compute the total log probability under the model.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like, shape (n_samples, n_features)\n",
" | List of n_features-dimensional data points. Each row\n",
" | corresponds to a single data point.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | logprob : float\n",
" | Total log-likelihood of the data in X.\n",
" | \n",
" | score_samples(self, X)\n",
" | Evaluate the density model on the data.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like, shape (n_samples, n_features)\n",
" | An array of points to query. Last dimension should match dimension\n",
" | of training data (n_features).\n",
" | \n",
" | Returns\n",
" | -------\n",
" | density : ndarray, shape (n_samples,)\n",
" | The array of log(density) evaluations.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" \n",
" class LSHForest(sklearn.base.BaseEstimator, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin)\n",
" | Performs approximate nearest neighbor search using LSH forest.\n",
" | \n",
" | LSH Forest: Locality Sensitive Hashing forest [1] is an alternative\n",
" | method for vanilla approximate nearest neighbor search methods.\n",
" | LSH forest data structure has been implemented using sorted\n",
" | arrays and binary search and 32 bit fixed-length hashes.\n",
" | Random projection is used as the hash family which approximates\n",
" | cosine distance.\n",
" | \n",
" | The cosine distance is defined as ``1 - cosine_similarity``: the lowest\n",
" | value is 0 (identical point) but it is bounded above by 2 for the farthest\n",
" | points. Its value does not depend on the norm of the vector points but\n",
" | only on their relative angles.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | \n",
" | n_estimators : int (default = 10)\n",
" | Number of trees in the LSH Forest.\n",
" | \n",
" | min_hash_match : int (default = 4)\n",
" | lowest hash length to be searched when candidate selection is\n",
" | performed for nearest neighbors.\n",
" | \n",
" | n_candidates : int (default = 10)\n",
" | Minimum number of candidates evaluated per estimator, assuming enough\n",
" | items meet the `min_hash_match` constraint.\n",
" | \n",
" | n_neighbors : int (default = 5)\n",
" | Number of neighbors to be returned from query function when\n",
" | it is not provided to the :meth:`kneighbors` method.\n",
" | \n",
" | radius : float, optinal (default = 1.0)\n",
" | Radius from the data point to its neighbors. This is the parameter\n",
" | space to use by default for the :meth`radius_neighbors` queries.\n",
" | \n",
" | radius_cutoff_ratio : float, optional (default = 0.9)\n",
" | A value ranges from 0 to 1. Radius neighbors will be searched until\n",
" | the ratio between total neighbors within the radius and the total\n",
" | candidates becomes less than this value unless it is terminated by\n",
" | hash length reaching `min_hash_match`.\n",
" | \n",
" | random_state : int, RandomState instance or None, optional (default=None)\n",
" | If int, random_state is the seed used by the random number generator;\n",
" | If RandomState instance, random_state is the random number generator;\n",
" | If None, the random number generator is the RandomState instance used\n",
" | by `np.random`.\n",
" | \n",
" | Attributes\n",
" | ----------\n",
" | \n",
" | hash_functions_ : list of GaussianRandomProjectionHash objects\n",
" | Hash function g(p,x) for a tree is an array of 32 randomly generated\n",
" | float arrays with the same dimenstion as the data set. This array is\n",
" | stored in GaussianRandomProjectionHash object and can be obtained\n",
" | from ``components_`` attribute.\n",
" | \n",
" | trees_ : array, shape (n_estimators, n_samples)\n",
" | Each tree (corresponding to a hash function) contains an array of\n",
" | sorted hashed values. The array representation may change in future\n",
" | versions.\n",
" | \n",
" | original_indices_ : array, shape (n_estimators, n_samples)\n",
" | Original indices of sorted hashed values in the fitted index.\n",
" | \n",
" | References\n",
" | ----------\n",
" | \n",
" | .. [1] M. Bawa, T. Condie and P. Ganesan, \"LSH Forest: Self-Tuning\n",
" | Indexes for Similarity Search\", WWW '05 Proceedings of the\n",
" | 14th international conference on World Wide Web, 651-660,\n",
" | 2005.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> from sklearn.neighbors import LSHForest\n",
" | \n",
" | >>> X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]]\n",
" | >>> X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]\n",
" | >>> lshf = LSHForest()\n",
" | >>> lshf.fit(X_train) # doctest: +NORMALIZE_WHITESPACE\n",
" | LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10,\n",
" | n_neighbors=5, radius=1.0, radius_cutoff_ratio=0.9,\n",
" | random_state=None)\n",
" | >>> distances, indices = lshf.kneighbors(X_test, n_neighbors=2)\n",
" | >>> distances # doctest: +ELLIPSIS\n",
" | array([[ 0.069..., 0.149...],\n",
" | [ 0.229..., 0.481...],\n",
" | [ 0.004..., 0.014...]])\n",
" | >>> indices\n",
" | array([[1, 2],\n",
" | [2, 0],\n",
" | [4, 0]])\n",
" | \n",
" | Method resolution order:\n",
" | LSHForest\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.KNeighborsMixin\n",
" | sklearn.neighbors.base.RadiusNeighborsMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, n_estimators=10, radius=1.0, n_candidates=50, n_neighbors=5, min_hash_match=4, radius_cutoff_ratio=0.9, random_state=None)\n",
" | \n",
" | fit(self, X, y=None)\n",
" | Fit the LSH forest on the data.\n",
" | \n",
" | This creates binary hashes of input data points by getting the\n",
" | dot product of input points and hash_function then\n",
" | transforming the projection into a binary string array based\n",
" | on the sign (positive/negative) of the projection.\n",
" | A sorted array of binary hashes is created.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)\n",
" | List of n_features-dimensional data points. Each row\n",
" | corresponds to a single data point.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self : object\n",
" | Returns self.\n",
" | \n",
" | kneighbors(self, X, n_neighbors=None, return_distance=True)\n",
" | Returns n_neighbors of approximate nearest neighbors.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)\n",
" | List of n_features-dimensional data points. Each row\n",
" | corresponds to a single query.\n",
" | \n",
" | n_neighbors : int, opitonal (default = None)\n",
" | Number of neighbors required. If not provided, this will\n",
" | return the number specified at the initialization.\n",
" | \n",
" | return_distance : boolean, optional (default = False)\n",
" | Returns the distances of neighbors if set to True.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array, shape (n_samples, n_neighbors)\n",
" | Array representing the cosine distances to each point,\n",
" | only present if return_distance=True.\n",
" | \n",
" | ind : array, shape (n_samples, n_neighbors)\n",
" | Indices of the approximate nearest points in the population\n",
" | matrix.\n",
" | \n",
" | partial_fit(self, X, y=None)\n",
" | Inserts new data into the already fitted LSH Forest.\n",
" | Cost is proportional to new total size, so additions\n",
" | should be batched.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)\n",
" | New data point to be inserted into the LSH Forest.\n",
" | \n",
" | radius_neighbors(self, X, radius=None, return_distance=True)\n",
" | Finds the neighbors within a given radius of a point or points.\n",
" | \n",
" | Return the indices and distances of some points from the dataset\n",
" | lying in a ball with size ``radius`` around the points of the query\n",
" | array. Points lying on the boundary are included in the results.\n",
" | \n",
" | The result points are *not* necessarily sorted by distance to their\n",
" | query point.\n",
" | \n",
" | LSH Forest being an approximate method, some true neighbors from the\n",
" | indexed dataset might be missing from the results.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)\n",
" | List of n_features-dimensional data points. Each row\n",
" | corresponds to a single query.\n",
" | \n",
" | radius : float\n",
" | Limiting distance of neighbors to return.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional (default = False)\n",
" | Returns the distances of neighbors if set to True.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array, shape (n_samples,) of arrays\n",
" | Each element is an array representing the cosine distances\n",
" | to some points found within ``radius`` of the respective query.\n",
" | Only present if ``return_distance=True``.\n",
" | \n",
" | ind : array, shape (n_samples,) of arrays\n",
" | Each element is an array of indices for neighbors within ``radius``\n",
" | of the respective query.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.KNeighborsMixin:\n",
" | \n",
" | kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity')\n",
" | Computes the (weighted) graph of k-Neighbors for points in X\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors for each sample.\n",
" | (default is value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]\n",
" | n_samples_fit is the number of samples in the fitted data\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=2)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.kneighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 1.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors.radius_neighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.RadiusNeighborsMixin:\n",
" | \n",
" | radius_neighbors_graph(self, X=None, radius=None, mode='connectivity')\n",
" | Computes the (weighted) graph of Neighbors for points in X\n",
" | \n",
" | Neighborhoods are restricted the points at a distance lower than\n",
" | radius.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features], optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Radius of neighborhoods.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.5)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.radius_neighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 0.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | kneighbors_graph\n",
" \n",
" class NearestCentroid(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin)\n",
" | Nearest centroid classifier.\n",
" | \n",
" | Each class is represented by its centroid, with test samples classified to\n",
" | the class with the nearest centroid.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | metric: string, or callable\n",
" | The metric to use when calculating distance between instances in a\n",
" | feature array. If metric is a string or callable, it must be one of\n",
" | the options allowed by metrics.pairwise.pairwise_distances for its\n",
" | metric parameter.\n",
" | The centroids for the samples corresponding to each class is the point\n",
" | from which the sum of the distances (according to the metric) of all\n",
" | samples that belong to that particular class are minimized.\n",
" | If the \"manhattan\" metric is provided, this centroid is the median and\n",
" | for all other metrics, the centroid is now set to be the mean.\n",
" | \n",
" | shrink_threshold : float, optional (default = None)\n",
" | Threshold for shrinking centroids to remove features.\n",
" | \n",
" | Attributes\n",
" | ----------\n",
" | centroids_ : array-like, shape = [n_classes, n_features]\n",
" | Centroid of each class\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> from sklearn.neighbors.nearest_centroid import NearestCentroid\n",
" | >>> import numpy as np\n",
" | >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n",
" | >>> y = np.array([1, 1, 1, 2, 2, 2])\n",
" | >>> clf = NearestCentroid()\n",
" | >>> clf.fit(X, y)\n",
" | NearestCentroid(metric='euclidean', shrink_threshold=None)\n",
" | >>> print(clf.predict([[-0.8, -1]]))\n",
" | [1]\n",
" | \n",
" | See also\n",
" | --------\n",
" | sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier\n",
" | \n",
" | Notes\n",
" | -----\n",
" | When used for text classification with tf-idf vectors, this classifier is\n",
" | also known as the Rocchio classifier.\n",
" | \n",
" | References\n",
" | ----------\n",
" | Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of\n",
" | multiple cancer types by shrunken centroids of gene expression. Proceedings\n",
" | of the National Academy of Sciences of the United States of America,\n",
" | 99(10), 6567-6572. The National Academy of Sciences.\n",
" | \n",
" | Method resolution order:\n",
" | NearestCentroid\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.base.ClassifierMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, metric='euclidean', shrink_threshold=None)\n",
" | \n",
" | fit(self, X, y)\n",
" | Fit the NearestCentroid model according to the given training data.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n",
" | Training vector, where n_samples in the number of samples and\n",
" | n_features is the number of features.\n",
" | Note that centroid shrinking cannot be used with sparse matrices.\n",
" | y : array, shape = [n_samples]\n",
" | Target values (integers)\n",
" | \n",
" | predict(self, X)\n",
" | Perform classification on an array of test vectors X.\n",
" | \n",
" | The predicted class C for each sample in X is returned.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features]\n",
" | \n",
" | Returns\n",
" | -------\n",
" | C : array, shape = [n_samples]\n",
" | \n",
" | Notes\n",
" | -----\n",
" | If the metric constructor parameter is \"precomputed\", X is assumed to\n",
" | be the distance matrix between the data to be predicted and\n",
" | ``self.centroids_``.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.ClassifierMixin:\n",
" | \n",
" | score(self, X, y, sample_weight=None)\n",
" | Returns the mean accuracy on the given test data and labels.\n",
" | \n",
" | In multi-label classification, this is the subset accuracy\n",
" | which is a harsh metric since you require for each sample that\n",
" | each label set be correctly predicted.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = (n_samples, n_features)\n",
" | Test samples.\n",
" | \n",
" | y : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
" | True labels for X.\n",
" | \n",
" | sample_weight : array-like, shape = [n_samples], optional\n",
" | Sample weights.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | score : float\n",
" | Mean accuracy of self.predict(X) wrt. y.\n",
" \n",
" class NearestNeighbors(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.UnsupervisedMixin)\n",
" | Unsupervised learner for implementing neighbor searches.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | n_neighbors : int, optional (default = 5)\n",
" | Number of neighbors to use by default for :meth:`k_neighbors` queries.\n",
" | \n",
" | radius : float, optional (default = 1.0)\n",
" | Range of parameter space to use by default for :meth`radius_neighbors`\n",
" | queries.\n",
" | \n",
" | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional\n",
" | Algorithm used to compute the nearest neighbors:\n",
" | \n",
" | - 'ball_tree' will use :class:`BallTree`\n",
" | - 'kd_tree' will use :class:`KDtree`\n",
" | - 'brute' will use a brute-force search.\n",
" | - 'auto' will attempt to decide the most appropriate algorithm\n",
" | based on the values passed to :meth:`fit` method.\n",
" | \n",
" | Note: fitting on sparse input will override the setting of\n",
" | this parameter, using brute force.\n",
" | \n",
" | leaf_size : int, optional (default = 30)\n",
" | Leaf size passed to BallTree or KDTree. This can affect the\n",
" | speed of the construction and query, as well as the memory\n",
" | required to store the tree. The optimal value depends on the\n",
" | nature of the problem.\n",
" | \n",
" | p: integer, optional (default = 2)\n",
" | Parameter for the Minkowski metric from\n",
" | sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n",
" | equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" | \n",
" | metric : string or callable, default 'minkowski'\n",
" | metric to use for distance computation. Any metric from scikit-learn\n",
" | or scipy.spatial.distance can be used.\n",
" | \n",
" | If metric is a callable function, it is called on each\n",
" | pair of instances (rows) and the resulting value recorded. The callable\n",
" | should take two arrays as input and return one value indicating the\n",
" | distance between them. This works for Scipy's metrics, but is less\n",
" | efficient than passing the metric name as a string.\n",
" | \n",
" | Distance matrices are not supported.\n",
" | \n",
" | Valid values for metric are:\n",
" | \n",
" | - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n",
" | 'manhattan']\n",
" | \n",
" | - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n",
" | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n",
" | 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',\n",
" | 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',\n",
" | 'sqeuclidean', 'yule']\n",
" | \n",
" | See the documentation for scipy.spatial.distance for details on these\n",
" | metrics.\n",
" | \n",
" | metric_params: dict, optional (default = None)\n",
" | additional keyword arguments for the metric function.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> import numpy as np\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]\n",
" | \n",
" | >>> neigh = NearestNeighbors(2, 0.4)\n",
" | >>> neigh.fit(samples) #doctest: +ELLIPSIS\n",
" | NearestNeighbors(...)\n",
" | \n",
" | >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)\n",
" | ... #doctest: +ELLIPSIS\n",
" | array([[2, 0]]...)\n",
" | \n",
" | >>> rng = neigh.radius_neighbors([0, 0, 1.3], 0.4, return_distance=False)\n",
" | >>> np.asarray(rng[0][0])\n",
" | array(2)\n",
" | \n",
" | See also\n",
" | --------\n",
" | KNeighborsClassifier\n",
" | RadiusNeighborsClassifier\n",
" | KNeighborsRegressor\n",
" | RadiusNeighborsRegressor\n",
" | BallTree\n",
" | \n",
" | Notes\n",
" | -----\n",
" | See :ref:`Nearest Neighbors <neighbors>` in the online documentation\n",
" | for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n",
" | \n",
" | http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n",
" | \n",
" | Method resolution order:\n",
" | NearestNeighbors\n",
" | sklearn.neighbors.base.NeighborsBase\n",
" | abc.NewBase\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.KNeighborsMixin\n",
" | sklearn.neighbors.base.RadiusNeighborsMixin\n",
" | sklearn.neighbors.base.UnsupervisedMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, **kwargs)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __abstractmethods__ = frozenset([])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.KNeighborsMixin:\n",
" | \n",
" | kneighbors(self, X=None, n_neighbors=None, return_distance=True)\n",
" | Finds the K-neighbors of a point.\n",
" | \n",
" | Returns distance\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors to get (default is the value\n",
" | passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array\n",
" | Array representing the lengths to points, only present if\n",
" | return_distance=True\n",
" | \n",
" | ind : array\n",
" | Indices of the nearest points in the population matrix.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1,1,1]\n",
" | \n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=1)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> print(neigh.kneighbors([1., 1., 1.])) # doctest: +ELLIPSIS\n",
" | (array([[ 0.5]]), array([[2]]...))\n",
" | \n",
" | As you can see, it returns [[0.5]], and [[2]], which means that the\n",
" | element is at distance 0.5 and is the third element of samples\n",
" | (indexes start at 0). You can also query for multiple points:\n",
" | \n",
" | >>> X = [[0., 1., 0.], [1., 0., 1.]]\n",
" | >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS\n",
" | array([[1],\n",
" | [2]]...)\n",
" | \n",
" | kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity')\n",
" | Computes the (weighted) graph of k-Neighbors for points in X\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, last dimension same as that of fit data, optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | n_neighbors : int\n",
" | Number of neighbors for each sample.\n",
" | (default is value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples_fit]\n",
" | n_samples_fit is the number of samples in the fitted data\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(n_neighbors=2)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.kneighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 1.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors.radius_neighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.RadiusNeighborsMixin:\n",
" | \n",
" | radius_neighbors(self, X=None, radius=None, return_distance=True)\n",
" | Finds the neighbors within a given radius of a point or points.\n",
" | \n",
" | Return the indices and distances of each point from the dataset\n",
" | lying in a ball with size ``radius`` around the points of the query\n",
" | array. Points lying on the boundary are included in the results.\n",
" | \n",
" | The result points are *not* necessarily sorted by distance to their\n",
" | query point.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, (n_samples, n_features), optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Limiting distance of neighbors to return.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array, shape (n_samples,) of arrays\n",
" | Array representing the distances to each point, only present if\n",
" | return_distance=True. The distance values are computed according\n",
" | to the ``metric`` constructor parameter.\n",
" | \n",
" | ind : array, shape (n_samples,) of arrays\n",
" | An array of arrays of indices of the approximate nearest points\n",
" | from the population matrix that lie within a ball of size\n",
" | ``radius`` around the query points.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1, 1, 1]:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.6)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> rng = neigh.radius_neighbors([1., 1., 1.])\n",
" | >>> print(np.asarray(rng[0][0])) # doctest: +ELLIPSIS\n",
" | [ 1.5 0.5]\n",
" | >>> print(np.asarray(rng[1][0])) # doctest: +ELLIPSIS\n",
" | [1 2]\n",
" | \n",
" | The first array returned contains the distances to all points which\n",
" | are closer than 1.6, while the second array returned contains their\n",
" | indices. In general, multiple points can be queried at the same time.\n",
" | \n",
" | Notes\n",
" | -----\n",
" | Because the number of neighbors of each point is not necessarily\n",
" | equal, the results for multiple query points cannot be fit in a\n",
" | standard data array.\n",
" | For efficiency, `radius_neighbors` returns arrays of objects, where\n",
" | each object is a 1D array of indices or distances.\n",
" | \n",
" | radius_neighbors_graph(self, X=None, radius=None, mode='connectivity')\n",
" | Computes the (weighted) graph of Neighbors for points in X\n",
" | \n",
" | Neighborhoods are restricted the points at a distance lower than\n",
" | radius.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features], optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Radius of neighborhoods.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.5)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.radius_neighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 0.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | kneighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.UnsupervisedMixin:\n",
" | \n",
" | fit(self, X, y=None)\n",
" | Fit the model using X as training data\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix, BallTree, KDTree}\n",
" | Training data. If array or matrix, shape = [n_samples, n_features]\n",
" \n",
" class RadiusNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)\n",
" | Classifier implementing a vote among neighbors within a given radius\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | radius : float, optional (default = 1.0)\n",
" | Range of parameter space to use by default for :meth`radius_neighbors`\n",
" | queries.\n",
" | \n",
" | weights : str or callable\n",
" | weight function used in prediction. Possible values:\n",
" | \n",
" | - 'uniform' : uniform weights. All points in each neighborhood\n",
" | are weighted equally.\n",
" | - 'distance' : weight points by the inverse of their distance.\n",
" | in this case, closer neighbors of a query point will have a\n",
" | greater influence than neighbors which are further away.\n",
" | - [callable] : a user-defined function which accepts an\n",
" | array of distances, and returns an array of the same shape\n",
" | containing the weights.\n",
" | \n",
" | Uniform weights are used by default.\n",
" | \n",
" | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional\n",
" | Algorithm used to compute the nearest neighbors:\n",
" | \n",
" | - 'ball_tree' will use :class:`BallTree`\n",
" | - 'kd_tree' will use :class:`KDtree`\n",
" | - 'brute' will use a brute-force search.\n",
" | - 'auto' will attempt to decide the most appropriate algorithm\n",
" | based on the values passed to :meth:`fit` method.\n",
" | \n",
" | Note: fitting on sparse input will override the setting of\n",
" | this parameter, using brute force.\n",
" | \n",
" | leaf_size : int, optional (default = 30)\n",
" | Leaf size passed to BallTree or KDTree. This can affect the\n",
" | speed of the construction and query, as well as the memory\n",
" | required to store the tree. The optimal value depends on the\n",
" | nature of the problem.\n",
" | \n",
" | metric : string or DistanceMetric object (default='minkowski')\n",
" | the distance metric to use for the tree. The default metric is\n",
" | minkowski, and with p=2 is equivalent to the standard Euclidean\n",
" | metric. See the documentation of the DistanceMetric class for a\n",
" | list of available metrics.\n",
" | \n",
" | p : integer, optional (default = 2)\n",
" | Power parameter for the Minkowski metric. When p = 1, this is\n",
" | equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" | \n",
" | outlier_label : int, optional (default = None)\n",
" | Label, which is given for outlier samples (samples with no\n",
" | neighbors on given radius).\n",
" | If set to None, ValueError is raised, when outlier is detected.\n",
" | \n",
" | metric_params: dict, optional (default = None)\n",
" | additional keyword arguments for the metric function.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [1], [2], [3]]\n",
" | >>> y = [0, 0, 1, 1]\n",
" | >>> from sklearn.neighbors import RadiusNeighborsClassifier\n",
" | >>> neigh = RadiusNeighborsClassifier(radius=1.0)\n",
" | >>> neigh.fit(X, y) # doctest: +ELLIPSIS\n",
" | RadiusNeighborsClassifier(...)\n",
" | >>> print(neigh.predict([[1.5]]))\n",
" | [0]\n",
" | \n",
" | See also\n",
" | --------\n",
" | KNeighborsClassifier\n",
" | RadiusNeighborsRegressor\n",
" | KNeighborsRegressor\n",
" | NearestNeighbors\n",
" | \n",
" | Notes\n",
" | -----\n",
" | See :ref:`Nearest Neighbors <neighbors>` in the online documentation\n",
" | for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n",
" | \n",
" | http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n",
" | \n",
" | Method resolution order:\n",
" | RadiusNeighborsClassifier\n",
" | sklearn.neighbors.base.NeighborsBase\n",
" | abc.NewBase\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.RadiusNeighborsMixin\n",
" | sklearn.neighbors.base.SupervisedIntegerMixin\n",
" | sklearn.base.ClassifierMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, **kwargs)\n",
" | \n",
" | predict(self, X)\n",
" | Predict the class labels for the provided data\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array of shape [n_samples, n_features]\n",
" | A 2-D array representing the test points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | y : array of shape [n_samples] or [n_samples, n_outputs]\n",
" | Class labels for each data sample.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __abstractmethods__ = frozenset([])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.RadiusNeighborsMixin:\n",
" | \n",
" | radius_neighbors(self, X=None, radius=None, return_distance=True)\n",
" | Finds the neighbors within a given radius of a point or points.\n",
" | \n",
" | Return the indices and distances of each point from the dataset\n",
" | lying in a ball with size ``radius`` around the points of the query\n",
" | array. Points lying on the boundary are included in the results.\n",
" | \n",
" | The result points are *not* necessarily sorted by distance to their\n",
" | query point.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, (n_samples, n_features), optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Limiting distance of neighbors to return.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array, shape (n_samples,) of arrays\n",
" | Array representing the distances to each point, only present if\n",
" | return_distance=True. The distance values are computed according\n",
" | to the ``metric`` constructor parameter.\n",
" | \n",
" | ind : array, shape (n_samples,) of arrays\n",
" | An array of arrays of indices of the approximate nearest points\n",
" | from the population matrix that lie within a ball of size\n",
" | ``radius`` around the query points.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1, 1, 1]:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.6)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> rng = neigh.radius_neighbors([1., 1., 1.])\n",
" | >>> print(np.asarray(rng[0][0])) # doctest: +ELLIPSIS\n",
" | [ 1.5 0.5]\n",
" | >>> print(np.asarray(rng[1][0])) # doctest: +ELLIPSIS\n",
" | [1 2]\n",
" | \n",
" | The first array returned contains the distances to all points which\n",
" | are closer than 1.6, while the second array returned contains their\n",
" | indices. In general, multiple points can be queried at the same time.\n",
" | \n",
" | Notes\n",
" | -----\n",
" | Because the number of neighbors of each point is not necessarily\n",
" | equal, the results for multiple query points cannot be fit in a\n",
" | standard data array.\n",
" | For efficiency, `radius_neighbors` returns arrays of objects, where\n",
" | each object is a 1D array of indices or distances.\n",
" | \n",
" | radius_neighbors_graph(self, X=None, radius=None, mode='connectivity')\n",
" | Computes the (weighted) graph of Neighbors for points in X\n",
" | \n",
" | Neighborhoods are restricted the points at a distance lower than\n",
" | radius.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features], optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Radius of neighborhoods.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.5)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.radius_neighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 0.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | kneighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.SupervisedIntegerMixin:\n",
" | \n",
" | fit(self, X, y)\n",
" | Fit the model using X as training data and y as target values\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix, BallTree, KDTree}\n",
" | Training data. If array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | y : {array-like, sparse matrix}\n",
" | Target values of shape = [n_samples] or [n_samples, n_outputs]\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.ClassifierMixin:\n",
" | \n",
" | score(self, X, y, sample_weight=None)\n",
" | Returns the mean accuracy on the given test data and labels.\n",
" | \n",
" | In multi-label classification, this is the subset accuracy\n",
" | which is a harsh metric since you require for each sample that\n",
" | each label set be correctly predicted.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = (n_samples, n_features)\n",
" | Test samples.\n",
" | \n",
" | y : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
" | True labels for X.\n",
" | \n",
" | sample_weight : array-like, shape = [n_samples], optional\n",
" | Sample weights.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | score : float\n",
" | Mean accuracy of self.predict(X) wrt. y.\n",
" \n",
" class RadiusNeighborsRegressor(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.RadiusNeighborsMixin, sklearn.neighbors.base.SupervisedFloatMixin, sklearn.base.RegressorMixin)\n",
" | Regression based on neighbors within a fixed radius.\n",
" | \n",
" | The target is predicted by local interpolation of the targets\n",
" | associated of the nearest neighbors in the training set.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | radius : float, optional (default = 1.0)\n",
" | Range of parameter space to use by default for :meth`radius_neighbors`\n",
" | queries.\n",
" | \n",
" | weights : str or callable\n",
" | weight function used in prediction. Possible values:\n",
" | \n",
" | - 'uniform' : uniform weights. All points in each neighborhood\n",
" | are weighted equally.\n",
" | - 'distance' : weight points by the inverse of their distance.\n",
" | in this case, closer neighbors of a query point will have a\n",
" | greater influence than neighbors which are further away.\n",
" | - [callable] : a user-defined function which accepts an\n",
" | array of distances, and returns an array of the same shape\n",
" | containing the weights.\n",
" | \n",
" | Uniform weights are used by default.\n",
" | \n",
" | algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional\n",
" | Algorithm used to compute the nearest neighbors:\n",
" | \n",
" | - 'ball_tree' will use :class:`BallTree`\n",
" | - 'kd_tree' will use :class:`KDtree`\n",
" | - 'brute' will use a brute-force search.\n",
" | - 'auto' will attempt to decide the most appropriate algorithm\n",
" | based on the values passed to :meth:`fit` method.\n",
" | \n",
" | Note: fitting on sparse input will override the setting of\n",
" | this parameter, using brute force.\n",
" | \n",
" | leaf_size : int, optional (default = 30)\n",
" | Leaf size passed to BallTree or KDTree. This can affect the\n",
" | speed of the construction and query, as well as the memory\n",
" | required to store the tree. The optimal value depends on the\n",
" | nature of the problem.\n",
" | \n",
" | metric : string or DistanceMetric object (default='minkowski')\n",
" | the distance metric to use for the tree. The default metric is\n",
" | minkowski, and with p=2 is equivalent to the standard Euclidean\n",
" | metric. See the documentation of the DistanceMetric class for a\n",
" | list of available metrics.\n",
" | \n",
" | p : integer, optional (default = 2)\n",
" | Power parameter for the Minkowski metric. When p = 1, this is\n",
" | equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" | (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" | \n",
" | metric_params: dict, optional (default = None)\n",
" | additional keyword arguments for the metric function.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [1], [2], [3]]\n",
" | >>> y = [0, 0, 1, 1]\n",
" | >>> from sklearn.neighbors import RadiusNeighborsRegressor\n",
" | >>> neigh = RadiusNeighborsRegressor(radius=1.0)\n",
" | >>> neigh.fit(X, y) # doctest: +ELLIPSIS\n",
" | RadiusNeighborsRegressor(...)\n",
" | >>> print(neigh.predict([[1.5]]))\n",
" | [ 0.5]\n",
" | \n",
" | See also\n",
" | --------\n",
" | NearestNeighbors\n",
" | KNeighborsRegressor\n",
" | KNeighborsClassifier\n",
" | RadiusNeighborsClassifier\n",
" | \n",
" | Notes\n",
" | -----\n",
" | See :ref:`Nearest Neighbors <neighbors>` in the online documentation\n",
" | for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n",
" | \n",
" | http://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n",
" | \n",
" | Method resolution order:\n",
" | RadiusNeighborsRegressor\n",
" | sklearn.neighbors.base.NeighborsBase\n",
" | abc.NewBase\n",
" | sklearn.base.BaseEstimator\n",
" | sklearn.neighbors.base.RadiusNeighborsMixin\n",
" | sklearn.neighbors.base.SupervisedFloatMixin\n",
" | sklearn.base.RegressorMixin\n",
" | __builtin__.object\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __init__(self, radius=1.0, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, **kwargs)\n",
" | \n",
" | predict(self, X)\n",
" | Predict the target for the provided data\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | Returns\n",
" | -------\n",
" | y : array of int, shape = [n_samples] or [n_samples, n_outputs]\n",
" | Target values\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __abstractmethods__ = frozenset([])\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __repr__(self)\n",
" | \n",
" | get_params(self, deep=True)\n",
" | Get parameters for this estimator.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | deep: boolean, optional\n",
" | If True, will return the parameters for this estimator and\n",
" | contained subobjects that are estimators.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | params : mapping of string to any\n",
" | Parameter names mapped to their values.\n",
" | \n",
" | set_params(self, **params)\n",
" | Set the parameters of this estimator.\n",
" | \n",
" | The method works on simple estimators as well as on nested objects\n",
" | (such as pipelines). The former have parameters of the form\n",
" | ``<component>__<parameter>`` so that it's possible to update each\n",
" | component of a nested object.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | self\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors inherited from sklearn.base.BaseEstimator:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.RadiusNeighborsMixin:\n",
" | \n",
" | radius_neighbors(self, X=None, radius=None, return_distance=True)\n",
" | Finds the neighbors within a given radius of a point or points.\n",
" | \n",
" | Return the indices and distances of each point from the dataset\n",
" | lying in a ball with size ``radius`` around the points of the query\n",
" | array. Points lying on the boundary are included in the results.\n",
" | \n",
" | The result points are *not* necessarily sorted by distance to their\n",
" | query point.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, (n_samples, n_features), optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Limiting distance of neighbors to return.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | return_distance : boolean, optional. Defaults to True.\n",
" | If False, distances will not be returned\n",
" | \n",
" | Returns\n",
" | -------\n",
" | dist : array, shape (n_samples,) of arrays\n",
" | Array representing the distances to each point, only present if\n",
" | return_distance=True. The distance values are computed according\n",
" | to the ``metric`` constructor parameter.\n",
" | \n",
" | ind : array, shape (n_samples,) of arrays\n",
" | An array of arrays of indices of the approximate nearest points\n",
" | from the population matrix that lie within a ball of size\n",
" | ``radius`` around the query points.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | In the following example, we construct a NeighborsClassifier\n",
" | class from an array representing our data set and ask who's\n",
" | the closest point to [1, 1, 1]:\n",
" | \n",
" | >>> import numpy as np\n",
" | >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.6)\n",
" | >>> neigh.fit(samples) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> rng = neigh.radius_neighbors([1., 1., 1.])\n",
" | >>> print(np.asarray(rng[0][0])) # doctest: +ELLIPSIS\n",
" | [ 1.5 0.5]\n",
" | >>> print(np.asarray(rng[1][0])) # doctest: +ELLIPSIS\n",
" | [1 2]\n",
" | \n",
" | The first array returned contains the distances to all points which\n",
" | are closer than 1.6, while the second array returned contains their\n",
" | indices. In general, multiple points can be queried at the same time.\n",
" | \n",
" | Notes\n",
" | -----\n",
" | Because the number of neighbors of each point is not necessarily\n",
" | equal, the results for multiple query points cannot be fit in a\n",
" | standard data array.\n",
" | For efficiency, `radius_neighbors` returns arrays of objects, where\n",
" | each object is a 1D array of indices or distances.\n",
" | \n",
" | radius_neighbors_graph(self, X=None, radius=None, mode='connectivity')\n",
" | Computes the (weighted) graph of Neighbors for points in X\n",
" | \n",
" | Neighborhoods are restricted the points at a distance lower than\n",
" | radius.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = [n_samples, n_features], optional\n",
" | The query point or points.\n",
" | If not provided, neighbors of each indexed point are returned.\n",
" | In this case, the query point is not considered its own neighbor.\n",
" | \n",
" | radius : float\n",
" | Radius of neighborhoods.\n",
" | (default is the value passed to the constructor).\n",
" | \n",
" | mode : {'connectivity', 'distance'}, optional\n",
" | Type of returned matrix: 'connectivity' will return the\n",
" | connectivity matrix with ones and zeros, in 'distance' the\n",
" | edges are Euclidean distance between points.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" | A[i, j] is assigned the weight of edge that connects i to j.\n",
" | \n",
" | Examples\n",
" | --------\n",
" | >>> X = [[0], [3], [1]]\n",
" | >>> from sklearn.neighbors import NearestNeighbors\n",
" | >>> neigh = NearestNeighbors(radius=1.5)\n",
" | >>> neigh.fit(X) # doctest: +ELLIPSIS\n",
" | NearestNeighbors(algorithm='auto', leaf_size=30, ...)\n",
" | >>> A = neigh.radius_neighbors_graph(X)\n",
" | >>> A.toarray()\n",
" | array([[ 1., 0., 1.],\n",
" | [ 0., 1., 0.],\n",
" | [ 1., 0., 1.]])\n",
" | \n",
" | See also\n",
" | --------\n",
" | kneighbors_graph\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.neighbors.base.SupervisedFloatMixin:\n",
" | \n",
" | fit(self, X, y)\n",
" | Fit the model using X as training data and y as target values\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : {array-like, sparse matrix, BallTree, KDTree}\n",
" | Training data. If array or matrix, shape = [n_samples, n_features]\n",
" | \n",
" | y : {array-like, sparse matrix}\n",
" | Target values, array of float values, shape = [n_samples]\n",
" | or [n_samples, n_outputs]\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Methods inherited from sklearn.base.RegressorMixin:\n",
" | \n",
" | score(self, X, y, sample_weight=None)\n",
" | Returns the coefficient of determination R^2 of the prediction.\n",
" | \n",
" | The coefficient R^2 is defined as (1 - u/v), where u is the regression\n",
" | sum of squares ((y_true - y_pred) ** 2).sum() and v is the residual\n",
" | sum of squares ((y_true - y_true.mean()) ** 2).sum().\n",
" | Best possible score is 1.0, lower values are worse.\n",
" | \n",
" | Parameters\n",
" | ----------\n",
" | X : array-like, shape = (n_samples, n_features)\n",
" | Test samples.\n",
" | \n",
" | y : array-like, shape = (n_samples) or (n_samples, n_outputs)\n",
" | True values for X.\n",
" | \n",
" | sample_weight : array-like, shape = [n_samples], optional\n",
" | Sample weights.\n",
" | \n",
" | Returns\n",
" | -------\n",
" | score : float\n",
" | R^2 of self.predict(X) wrt. y.\n",
"\n",
"FUNCTIONS\n",
" kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=None)\n",
" Computes the (weighted) graph of k-Neighbors for points in X\n",
" \n",
" Parameters\n",
" ----------\n",
" X : array-like or BallTree, shape = [n_samples, n_features]\n",
" Sample data, in the form of a numpy array or a precomputed\n",
" :class:`BallTree`.\n",
" \n",
" n_neighbors : int\n",
" Number of neighbors for each sample.\n",
" \n",
" mode : {'connectivity', 'distance'}, optional\n",
" Type of returned matrix: 'connectivity' will return the\n",
" connectivity matrix with ones and zeros, in 'distance' the\n",
" edges are Euclidean distance between points.\n",
" \n",
" metric : string, default 'minkowski'\n",
" The distance metric used to calculate the k-Neighbors for each sample\n",
" point. The DistanceMetric class gives a list of available metrics.\n",
" The default distance is 'euclidean' ('minkowski' metric with the p\n",
" param equal to 2.)\n",
" \n",
" include_self: bool, default backward-compatible.\n",
" Whether or not to mark each sample as the first nearest neighbor to\n",
" itself. If `None`, then True is used for mode='connectivity' and False\n",
" for mode='distance' as this will preserve backwards compatibilty. From\n",
" version 0.18, the default value will be False, irrespective of the\n",
" value of `mode`.\n",
" \n",
" p : int, default 2\n",
" Power parameter for the Minkowski metric. When p = 1, this is\n",
" equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" \n",
" metric_params: dict, optional\n",
" additional keyword arguments for the metric function.\n",
" \n",
" Returns\n",
" -------\n",
" A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" A[i, j] is assigned the weight of edge that connects i to j.\n",
" \n",
" Examples\n",
" --------\n",
" >>> X = [[0], [3], [1]]\n",
" >>> from sklearn.neighbors import kneighbors_graph\n",
" >>> A = kneighbors_graph(X, 2)\n",
" >>> A.toarray()\n",
" array([[ 1., 0., 1.],\n",
" [ 0., 1., 1.],\n",
" [ 1., 0., 1.]])\n",
" \n",
" See also\n",
" --------\n",
" radius_neighbors_graph\n",
" \n",
" radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=None)\n",
" Computes the (weighted) graph of Neighbors for points in X\n",
" \n",
" Neighborhoods are restricted the points at a distance lower than\n",
" radius.\n",
" \n",
" Parameters\n",
" ----------\n",
" X : array-like or BallTree, shape = [n_samples, n_features]\n",
" Sample data, in the form of a numpy array or a precomputed\n",
" :class:`BallTree`.\n",
" \n",
" radius : float\n",
" Radius of neighborhoods.\n",
" \n",
" mode : {'connectivity', 'distance'}, optional\n",
" Type of returned matrix: 'connectivity' will return the\n",
" connectivity matrix with ones and zeros, in 'distance' the\n",
" edges are Euclidean distance between points.\n",
" \n",
" metric : string, default 'minkowski'\n",
" The distance metric used to calculate the neighbors within a\n",
" given radius for each sample point. The DistanceMetric class\n",
" gives a list of available metrics. The default distance is\n",
" 'euclidean' ('minkowski' metric with the param equal to 2.)\n",
" \n",
" include_self: bool, default None\n",
" Whether or not to mark each sample as the first nearest neighbor to\n",
" itself. If `None`, then True is used for mode='connectivity' and False\n",
" for mode='distance' as this will preserve backwards compatibilty. From\n",
" version 0.18, the default value will be False, irrespective of the\n",
" value of `mode`.\n",
" \n",
" p : int, default 2\n",
" Power parameter for the Minkowski metric. When p = 1, this is\n",
" equivalent to using manhattan_distance (l1), and euclidean_distance\n",
" (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n",
" \n",
" metric_params: dict, optional\n",
" additional keyword arguments for the metric function.\n",
" \n",
" Returns\n",
" -------\n",
" A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n",
" A[i, j] is assigned the weight of edge that connects i to j.\n",
" \n",
" Examples\n",
" --------\n",
" >>> X = [[0], [3], [1]]\n",
" >>> from sklearn.neighbors import radius_neighbors_graph\n",
" >>> A = radius_neighbors_graph(X, 1.5)\n",
" >>> A.toarray()\n",
" array([[ 1., 0., 1.],\n",
" [ 0., 1., 0.],\n",
" [ 1., 0., 1.]])\n",
" \n",
" See also\n",
" --------\n",
" kneighbors_graph\n",
"\n",
"DATA\n",
" __all__ = ['BallTree', 'DistanceMetric', 'KDTree', 'KNeighborsClassifi...\n",
" __warningregistry__ = {('numpy.dtype size changed, may indicate binary...\n",
"\n",
"\n"
]
}
],
"source": [
"# About KNN model in Scikit Learn\n",
"help(neighbors)"
]
},
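{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cells below use `X_train`, `X_test`, `y_train` and `y_test`, which were created in cells omitted from this excerpt (note the jump in execution counts). A minimal sketch of how such a split could be produced follows; the `test_size` and `random_state` values are illustrative assumptions, not the ones used originally."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# hypothetical reconstruction of the elided train/test split cells;\n",
"# test_size and random_state are illustrative, not the original values\n",
"from sklearn import datasets\n",
"from sklearn.cross_validation import train_test_split\n",
"\n",
"iris = datasets.load_iris()\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    iris.data, iris.target, test_size=0.3, random_state=42)"
]
},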
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_neighbors=5, p=2, weights='uniform')"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create the model\n",
"knn_model = neighbors.KNeighborsClassifier(n_neighbors=5)\n",
"\n",
"# fit the model\n",
"knn_model.fit(X_train,y_train)"
]
},
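{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `help(neighbors)` output above also documents radius-based estimators, which vote among all training points within a fixed distance rather than a fixed count k. A minimal sketch on the same split follows; `radius=2.0` is an arbitrary illustrative value, chosen large enough that every test point is likely to have at least one neighbor (a point with none raises a ValueError unless `outlier_label` is set)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# sketch: classify by voting among all training points within a radius\n",
"# radius=2.0 is an illustrative choice, not tuned for this data\n",
"rnc_model = neighbors.RadiusNeighborsClassifier(radius=2.0)\n",
"rnc_model.fit(X_train, y_train)\n",
"print rnc_model.predict(X_test[:5])"
]
},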
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted Probabilities \n",
"------------------------\n",
"['setosa' 'versicolor' 'virginica']\n",
"[[ 0. 1. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 1. 0. ]\n",
" [ 0. 1. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 1. 0. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 0.6 0.4]\n",
" [ 0. 1. 0. ]\n",
" [ 0. 0.2 0.8]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0.8 0.2]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 1. 0. ]\n",
" [ 0. 1. 0. ]\n",
" [ 0. 0. 1. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0.2 0.8]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 0. 1. ]\n",
" [ 0. 0. 1. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 1. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0.4 0.6]\n",
" [ 0. 1. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 0. 0.2 0.8]\n",
" [ 0. 0.8 0.2]\n",
" [ 0. 1. 0. ]\n",
" [ 1. 0. 0. ]\n",
" [ 1. 0. 0. ]]\n",
"\n",
"Prediction / Actual\n",
"-------------------\n",
"versicolor / versicolor\n",
"setosa / setosa\n",
"virginica / virginica\n",
"versicolor / versicolor\n",
"versicolor / versicolor\n",
"setosa / setosa\n",
"versicolor / versicolor\n",
"virginica / virginica\n",
"versicolor / versicolor\n",
"versicolor / versicolor\n",
"virginica / virginica\n",
"setosa / setosa\n",
"setosa / setosa\n",
"setosa / setosa\n",
"setosa / setosa\n",
"versicolor / versicolor\n",
"virginica / virginica\n",
"versicolor / versicolor\n",
"versicolor / versicolor\n",
"virginica / virginica\n",
"setosa / setosa\n",
"virginica / virginica\n",
"setosa / setosa\n",
"virginica / virginica\n",
"virginica / virginica\n",
"virginica / virginica\n",
"virginica / virginica\n",
"virginica / virginica\n",
"setosa / setosa\n",
"setosa / setosa\n",
"setosa / setosa\n",
"setosa / setosa\n",
"versicolor / versicolor\n",
"setosa / setosa\n",
"setosa / setosa\n",
"virginica / virginica\n",
"versicolor / versicolor\n",
"setosa / setosa\n",
"setosa / setosa\n",
"setosa / setosa\n",
"virginica / virginica\n",
"versicolor / versicolor\n",
"versicolor / versicolor\n",
"setosa / setosa\n",
"setosa / setosa\n"
]
}
],
"source": [
"# model predictions\n",
"predictions = knn_model.predict(X_test)\n",
"prob_predictions = knn_model.predict_proba(X_test)\n",
"predictions = [iris.target_names[i] for i in predictions]\n",
"print \"Predicted Probabilities \"\n",
"print \"------------------------\"\n",
"print iris.target_names\n",
"print prob_predictions\n",
"print \n",
"# as opposed to actuals\n",
"y_test\n",
"actuals = [iris.target_names[i] for i in y_test]\n",
"print \"Prediction / Actual\"\n",
"print \"-------------------\"\n",
"for i in range(len(y_test)):\n",
" print predictions[i] + \" / \" + actuals[i]"
]
},
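{
"cell_type": "markdown",
"metadata": {},
"source": [
"Every prediction above matches its actual label, so on this particular split the classifier should reach a mean accuracy of 1.0. A quick check with the `score` method inherited from `ClassifierMixin` (documented in the help output above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# mean accuracy of knn_model's predictions on the held-out test set\n",
"print knn_model.score(X_test, y_test)"
]
}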
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}