Skip to content

Instantly share code, notes, and snippets.

(ns lection02.core)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Namespaces
;; Refer
;;---------------
(comment
@spolakh
spolakh / geogres-knn-benchmark
Last active April 29, 2019 07:13
This script imports geonames into postgres for the purposes of knn-spgist vs knn-gist benchmarking. Based on the script from geonames forums: http://forum.geonames.org/gforum/posts/list/15/926.page
#!/bin/bash
#===============================================================================
#
# FILE: getgeo.sh
#
# USAGE: ./getgeo.sh
#
# DESCRIPTION: run the script so that the geodata will be downloaded and inserted into your
# database
#
{
"metadata": {
"name": "",
"signature": "sha256:4b29dfce07d9442ae9ee4692f1bf001b798f38159c280e68561abe8e2aeebc27"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{"nbformat_minor": 0, "cells": [{"execution_count": 11, "cell_type": "code", "source": "%pylab inline\nfrom sklearn.datasets import load_iris\nimport pandas as pd\nimport numpy as np\nimport pylab as pl\nfrom rep.utils import train_test_split\nfrom sklearn.metrics import roc_auc_score", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Populating the interactive namespace from numpy and matplotlib\n"}], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 12, "cell_type": "code", "source": "iris = load_iris()\nprint(iris.DESCR)", "outputs": [{"output_type": "stream", "name": "stdout", "text": "Iris Plants Database\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pylab as plt\n%matplotlib inline", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"source": "\u041f\u043e\u0434\u0433\u0440\u0443\u0437\u0438\u043c \u0440\u0435\u043f\u043e\u0432\u044b\u0439 \u0434\u0430\u0442\u0430\u0441\u0435\u0442 \u043c\u0430\u0441\u0441, \u043a\u0430\u043a \u0438 \u0432 \u0442\u0443\u0442\u043e\u0440\u0438\u0430\u043b\u0435", "cell_type": "markdown", "metadata": {}}, {"execution_count": 2, "cell_type": "code", "source": "from rep.utils import train_test_split\nfrom sklearn.metrics import roc_auc_score\n\nsig_data = pd.read_csv('../toy_datasets/toyMC_sig_mass.csv', sep='\\t')\nbck_data = pd.read_csv('../toy_datasets/toyMC_bck_mass.csv', sep='\\t')\n\nlabels = np.array([1] * len(sig_data) + [0] * len(bck_data))\ndata = pd.concat([sig_data, bck_data])\nvariables = [\"FlightDistance\", \"FlightDistanceError\", \"IP\", \"VertexC
{"metadata": {"kernelspec": {"display_name": "Python 3", "name": "python3", "language": "python"}, "language_info": {"file_extension": ".py", "nbconvert_exporter": "python", "codemirror_mode": {"name": "ipython", "version": 3}, "name": "python", "pygments_lexer": "ipython3", "mimetype": "text/x-python", "version": "3.4.3"}}, "cells": [{"metadata": {"trusted": true, "collapsed": false}, "cell_type": "code", "source": "%pylab inline\nimport numpy as np\nfrom functools import reduce\nimport scipy.stats as st\nSAMPLE_SIZE = 100\nBS_SIZE = 5000\nMU = 5", "outputs": [{"text": "Populating the interactive namespace from numpy and matplotlib\n", "name": "stdout", "output_type": "stream"}], "execution_count": 151}, {"metadata": {}, "cell_type": "markdown", "source": "#\u21163\n\u041f\u0443\u043d\u043a\u0442 \u0430): \u0421\u0433\u0435\u043d\u0435\u0440\u0438\u043c \u0432\u044b\u0431\u043e\u0440\u043a\u0443 \u0438 \u043f\u043e\u0441\u0442\u0440\u043e\u0438\u043c \u0434\u043e\u0432\u0435\u0440\u0438\u0442\u0435\u043b\u04
{"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"pygments_lexer": "ipython3", "name": "python", "mimetype": "text/x-python", "file_extension": ".py", "nbconvert_exporter": "python", "version": "3.4.3", "codemirror_mode": {"name": "ipython", "version": 3}}}, "cells": [{"metadata": {"trusted": true, "collapsed": false}, "outputs": [{"text": "Populating the interactive namespace from numpy and matplotlib\n", "output_type": "stream", "name": "stdout"}], "execution_count": 151, "cell_type": "code", "source": "%pylab inline\nimport numpy as np\nfrom functools import reduce\nimport scipy.stats as st\nSAMPLE_SIZE = 100\nBS_SIZE = 5000\nMU = 5"}, {"metadata": {}, "cell_type": "markdown", "source": "#\u0417\u0430\u0434\u0430\u043d\u0438\u0435 3\n\u041f\u0443\u043d\u043a\u0442 \u0430): \u0421\u0433\u0435\u043d\u0435\u0440\u0438\u043c \u0432\u044b\u0431\u043e\u0440\u043a\u0443 \u0438 \u043f\u043e\u0441\u0442\u0440\u043e\u0438\u043c \u0434\u043e\u0432\u0
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "import numpy as np\nimport pandas as pd\nimport matplotlib.pylab as plt\n%matplotlib inline", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 2, "cell_type": "code", "source": "from rep.utils import train_test_split\nfrom sklearn.metrics import roc_auc_score\n\nsig_data = pd.read_csv('../toy_datasets/toyMC_sig_mass.csv', sep='\\t')\nbck_data = pd.read_csv('../toy_datasets/toyMC_bck_mass.csv', sep='\\t')\n\nlabels = np.array([1] * len(sig_data) + [0] * len(bck_data))\ndata = pd.concat([sig_data, bck_data])\nvariables = [\"FlightDistance\", \"FlightDistanceError\", \"IP\", \"VertexChi2\", \"pt\", \"p0_pt\", \"p1_pt\", \"p2_pt\", 'LifeTime','dira']", "outputs": [], "metadata": {"collapsed": false, "trusted": true}}, {"execution_count": 3, "cell_type": "code", "source": "X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.5)", "outputs": [], "metadata": {"collaps
{"metadata": {"kernelspec": {"display_name": "Python 3", "name": "python3", "language": "python"}, "language_info": {"file_extension": ".py", "version": "3.4.3", "nbconvert_exporter": "python", "codemirror_mode": {"name": "ipython", "version": 3}, "name": "python", "pygments_lexer": "ipython3", "mimetype": "text/x-python"}}, "cells": [{"metadata": {}, "source": "#\u0417\u0430\u0434\u0430\u0447\u0430 1", "cell_type": "markdown"}, {"execution_count": 1, "source": "%pylab inline\nimport numpy as np\ndf = np.loadtxt(\"A2P(1).txt\")", "outputs": [{"name": "stdout", "output_type": "stream", "text": "Populating the interactive namespace from numpy and matplotlib\n"}, {"name": "stderr", "output_type": "stream", "text": "WARNING: pylab import has clobbered these variables: ['sqrt']\n`%matplotlib` prevents importing * from pylab and numpy\n"}], "metadata": {"collapsed": false, "trusted": true}, "cell_type": "code"}, {"execution_count": 2, "source": "from sklearn.cross_validation import train_test_split\nprint(df.shape)
{"nbformat_minor": 0, "cells": [{"execution_count": 1, "cell_type": "code", "source": "import numpy\nimport pandas\n\ndef generate_classification_sample(n_samples, n_features, distance=2.0, n_classes=2):\n \"\"\"Generates some test distribution,\n distributions are gaussian with centers at (x, x, x, ... x), where x = class_id * distance\n \"\"\"\n from sklearn.datasets import make_blobs\n\n centers = numpy.zeros((n_classes, n_features))\n centers += numpy.arange(n_classes)[:, numpy.newaxis] * distance\n\n X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers)\n columns = [\"column\" + str(x) for x in range(n_features)]\n X = pandas.DataFrame(X, columns=columns)\n return X, y\n\ndef generate_classification_data(n_classes=2):\n \"\"\" Generates random number of samples and features. \"\"\"\n n_samples = 1000 + numpy.random.poisson(1000)\n n_features = numpy.random.randint(10, 16)\n sample_weight = numpy.ones(n_samples, dtype=float)\n X, y = g