{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from matplotlib import pyplot as plt\n",
"\n",
"from sklearn.metrics.pairwise import pairwise_distances\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.decomposition import RandomizedPCA\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.cross_decomposition import CCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"from scipy.stats import pearsonr\n",
"from random import shuffle\n",
"import cPickle, gzip\n",
"import numpy as np\n",
"import time\n",
"\n",
"def time_f(f,n):\n",
" start = time.time()\n",
" for i in xrange(n):\n",
" f()\n",
" return (time.time()-start)/n\n",
"\n",
"# download http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz\n",
" \n",
"f = gzip.open('mnist.pkl.gz', 'rb')\n",
"(mnist,labels), _, _ = cPickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Monte Carlo integration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def estimate_pi(n = 1000):\n",
" return np.mean(np.sum(np.power(np.random.rand(int(n),2),2),1) <= 1)*4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"np.random.seed(0)\n",
"\n",
"n_arr = np.logspace(1,6,10)\n",
"p_arr = np.zeros(len(n_arr))\n",
"n_reps = 10\n",
"\n",
"for rep in xrange(n_reps):\n",
" for i in xrange(len(n_arr)):\n",
" p_arr[i] += np.abs(estimate_pi(n_arr[i])-np.pi)/n_reps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.plot(np.log(n_arr), np.log(p_arr));\n",
"plt.xlabel('number of samples');\n",
"plt.ylabel('absolute error');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Truncated SVD"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def my_svd(A, k):\n",
" Omega = np.random.randn(A.shape[1],2*k)\n",
" Y = np.dot(A,Omega)\n",
" Q, _ = np.linalg.qr(Y)\n",
" B = np.dot(Q.T, A)\n",
" S, Sigma, V = np.linalg.svd(B)\n",
" U = np.dot(Q,S)\n",
" U = U[:,0:k]\n",
" Sigma = np.diag(Sigma[0:k])\n",
" V = V[0:k,:]\n",
" return np.dot(np.dot(U,Sigma),V)\n",
"\n",
"t_n = [125,250,500,1000,2000]\n",
"t_my = np.zeros(len(t_n)) \n",
"t_random = np.zeros(len(t_n)) \n",
"t_arpack = np.zeros(len(t_n))\n",
"\n",
"t_random_e = np.zeros(len(t_n))\n",
"t_arpack_e = np.zeros(len(t_n))\n",
"t_my_e = np.zeros(len(t_n))\n",
"\n",
"for n in xrange(len(t_n)):\n",
" A = np.dot(np.random.randn(t_n[n],2*t_n[n]/10),np.random.randn(2*t_n[n]/10,t_n[n]))\n",
" \n",
" def f(): my_svd(A, t_n[n]/10)\n",
" t_my[n] = time_f(f,10)\n",
" t_my_e[n] = np.linalg.norm(my_svd(A, t_n[n]/10)-A)/t_n[n]\n",
" \n",
" svd = TruncatedSVD(t_n[n]/10, 'randomized')\n",
" def f(): svd.fit(A)\n",
" t_random[n] = time_f(f,10)\n",
" t_random_e[n] = np.linalg.norm(svd.inverse_transform(svd.transform(A))-A)/t_n[n]\n",
"\n",
" svd = TruncatedSVD(t_n[n]/10, 'arpack')\n",
" def f(): svd.fit(A)\n",
" t_arpack[n] = time_f(f,10)\n",
" t_arpack_e[n] = np.linalg.norm(svd.inverse_transform(svd.transform(A))-A)/t_n[n]\n",
" \n",
"plt.figure(figsize=(12,4))\n",
"plt.subplot(1,2,1)\n",
"plt.plot(t_n,t_my, label='My beauty');\n",
"plt.plot(t_n,t_arpack, label='LAPACK');\n",
"plt.plot(t_n,t_random, label='Halko et al.');\n",
"plt.xlabel('m')\n",
"plt.ylabel('running time')\n",
"plt.legend();\n",
"\n",
"plt.subplot(1,2,2)\n",
"plt.plot(t_n,t_my_e, label='My beauty');\n",
"plt.plot(t_n,t_arpack_e, label='LAPACK');\n",
"plt.plot(t_n,t_random_e, label='Halko et al.');\n",
"plt.xlabel('m')\n",
"plt.ylabel('reconstruction error')\n",
"plt.legend();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dimensionality reduction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f = gzip.open('mnist.pkl.gz', 'rb')\n",
"(mnist,labels), _, _ = cPickle.load(f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"i = 1\n",
"plt.figure(figsize=(10,10))\n",
"for k in [10,50,100,200]:\n",
" plt.subplot(2,2,i)\n",
" x1 = mnist[np.random.permutation(mnist.shape[0])[0:1000]]\n",
" x2 = mnist[np.random.permutation(mnist.shape[0])[0:1000]]\n",
" dx = pairwise_distances(x2,x2).ravel()\n",
" \n",
" A = np.random.randn(x1.shape[1],k)*1./np.sqrt(k)\n",
" Ai = np.linalg.pinv(A, 1e-3)\n",
" z2 = np.dot(x2,A)\n",
" xz = np.dot(z2,Ai)\n",
" dz = pairwise_distances(z2,z2).ravel()\n",
"\n",
" pca = PCA(k).fit(x1)\n",
" p2 = pca.transform(x2)\n",
" xp = pca.inverse_transform(p2)\n",
" dp = pairwise_distances(p2,p2).ravel()\n",
"\n",
" plt.hist(dz/(dx+1e-6),bins=100, alpha=0.5, normed=True, label='random');\n",
" #plt.hist(dp/(dx+1e-6),bins=100, alpha=0.5, normed=True, label='pca');\n",
" plt.title(str(k) + ' projections')\n",
" plt.legend()\n",
" i = i+1\n",
" \n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Beware! There are many ways of being \"far away\" from something!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"k = 10 # ten latent components\n",
"\n",
"A = np.random.randn(x1.shape[1],k)*1./np.sqrt(k)\n",
"Ai = np.linalg.pinv(A, 1e-3)\n",
"z2 = np.dot(x2,A)\n",
"xz = np.dot(z2,Ai)\n",
"dz = pairwise_distances(z2,z2).ravel()\n",
"\n",
"pca = PCA(k).fit(x1)\n",
"p2 = pca.transform(x2)\n",
"xp = pca.inverse_transform(p2)\n",
"dp = pairwise_distances(p2,p2).ravel()\n",
" \n",
"plt.figure(figsize=(10,10))\n",
"for i in xrange(100):\n",
" plt.subplot(10,10,i+1)\n",
" plt.imshow(xp[i].reshape(28,28), cmap='Greys_r')\n",
" plt.axis('off')\n",
"plt.show()\n",
"\n",
"plt.figure(figsize=(10,10))\n",
"for i in xrange(100):\n",
" plt.subplot(10,10,i+1)\n",
" plt.imshow(xz[i].reshape(28,28), cmap='Greys_r')\n",
" plt.axis('off')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Kernel methods and random features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"np.random.seed(1)\n",
"\n",
"def random_features(d,k,gamma):\n",
" w = np.random.randn(d,k)*np.sqrt(2.*gamma)\n",
" b = 2.*np.pi*np.random.rand(k)\n",
" def foo(x):\n",
" return np.sqrt(2./k)*np.cos(np.dot(x,w)+b)\n",
" return foo\n",
"\n",
"def estimate_gamma(x,n=100):\n",
" x_sub = x[np.random.permutation(x.shape[0])[0:n]]\n",
" return 1./np.median(np.power(pairwise_distances(x_sub,x_sub),2).flatten())\n",
"\n",
"x = mnist[np.random.permutation(mnist.shape[0])[0:100]]\n",
"g = estimate_gamma(x)\n",
"\n",
"plt.figure(figsize=(6,6))\n",
"\n",
"plt.subplot(2,2,1)\n",
"K = rbf_kernel(x,x,g)\n",
"plt.imshow(K)\n",
"plt.title('exact kernel')\n",
"plt.axis('off');\n",
"\n",
"p = 1\n",
"for k in [100,1000,10000]:\n",
" plt.subplot(2,2,p+1)\n",
" rf = random_features(x.shape[1], k,g)\n",
" plt.imshow(np.dot(rf(x),rf(x).T))\n",
" plt.title(k)\n",
" plt.axis('off');\n",
" p += 1\n",
"\n",
"plt.tight_layout()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Does it blend on MNIST?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f = gzip.open('mnist.pkl.gz', 'rb')\n",
"(x_tr, y_tr), (x_va, y_va), (x_te, y_te) = cPickle.load(f)\n",
"f.close()\n",
"\n",
"x_tr = np.vstack((x_tr,x_va))\n",
"y_tr = np.hstack((y_tr,y_va))\n",
"\n",
"rf = random_features(x_tr.shape[1], 2048, estimate_gamma(x_tr))\n",
"\n",
"z_tr = rf(x_tr)\n",
"z_va = rf(x_va)\n",
"z_te = rf(x_te)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"svm = LinearSVC().fit(z_tr,y_tr)\n",
"print np.mean(svm.predict(z_te)==y_te)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How do we deal with uncertainty?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The predictive distribution of Bayesian linear regression is\n",
"\\begin{equation*}\n",
" p(f | x, X, y) = \\mathcal{N} \\left( \\sigma^{-2}_n x^\\top A^{-1} Xy, x^\\top A^{-1}x\\right),\n",
"\\end{equation*}\n",
"where\n",
"\\begin{equation*}\n",
" A = \\sigma^{-2}_n XX^\\top + \\Sigma_p^{-1}.\n",
"\\end{equation*}\n",
"where\n",
"\\begin{equation*}\n",
" w \\sim \\mathcal{N}(0,\\Sigma_p)\n",
"\\end{equation*}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def bayesian_linear_regression(X,y,Sigma_p=None,sigma_n=None):\n",
" if Sigma_p is None:\n",
" Sigma_p = np.diag(np.repeat(1,X.shape[1]))\n",
" if sigma_n is None:\n",
" sigma_n = 1\n",
" \n",
" A = np.power(sigma_n,-2)*np.dot(X.T,X)+np.linalg.inv(Sigma_p)\n",
" Ai = np.linalg.inv(A)\n",
" C = np.dot(np.dot(Ai,X.T),y)\n",
" \n",
" def mean(x):\n",
" return np.power(sigma_n,-2)*np.dot(x,C)\n",
" def variance(x):\n",
" return 2*np.sqrt(np.diag(np.dot(np.dot(x,Ai),x.T))[:,np.newaxis])\n",
" \n",
" return (mean, variance)\n",
"\n",
"def f(x):\n",
" return np.sin(5*x)+np.random.randn(x.shape[0],1)*0.1\n",
"\n",
"def estimate_gamma(x,n=100):\n",
" x_sub = x[np.random.permutation(x.shape[0])[0:n]]\n",
" return 1./np.median(np.power(pairwise_distances(x_sub,x_sub),2).flatten())\n",
"\n",
"np.random.seed(0)\n",
"\n",
"N = 1000\n",
"K = 1000\n",
"\n",
"x_tr = np.random.randn(N,1)\n",
"y_tr = f(x_tr)\n",
"\n",
"s_x = StandardScaler().fit(x_tr)\n",
"s_y = StandardScaler().fit(y_tr)\n",
"\n",
"x_tr = s_x.transform(x_tr)\n",
"y_tr = s_y.transform(y_tr)\n",
"\n",
"rf = random_features(1,K,estimate_gamma(x_tr))\n",
"z_tr = rf(x_tr)\n",
"s_z = StandardScaler().fit(z_tr)\n",
"z_tr = s_z.transform(z_tr)\n",
"(mean, variance) = bayesian_linear_regression(z_tr,y_tr,None,1)\n",
"\n",
"x_te = 0.75*np.sort(np.random.randn(N,1),0)\n",
"#x_te = 1.5*np.sort(np.random.randn(N,1),0)\n",
"y_te = f(x_te)\n",
"\n",
"x_te = s_x.transform(x_te)\n",
"y_te = s_y.transform(y_te)\n",
"z_te = rf(x_te)\n",
"z_te = s_z.transform(z_te)\n",
"\n",
"plt.plot(x_te,y_te,'.', label='test data')\n",
"\n",
"plt.plot(x_tr,y_tr,'.',color='red', label='training data')\n",
"plt.plot(x_te,mean(z_te),lw=3, label='prediction')\n",
"plt.fill(np.concatenate([x_te, x_te[::-1]]),\n",
" np.concatenate([mean(z_te) - 1.9600 * np.sqrt(variance(z_te)),\n",
" (mean(z_te) + 1.9600 * np.sqrt(variance(z_te)))[::-1]]),\n",
" alpha=.25, fc='g', ec='None', label='95% confidence interval')\n",
"plt.xlim(np.min(x_te),np.max(x_te));\n",
"plt.legend(loc='best')\n",
"plt.title(np.power(y_te-mean(z_te),2).mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Nonlinear Component analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f = gzip.open('mnist.pkl.gz', 'rb')\n",
"(x_tr, y_tr), (x_va, y_va), (x_te, y_te) = cPickle.load(f)\n",
"f.close()\n",
"\n",
"x_tr = np.vstack((x_tr,x_va))\n",
"y_tr = np.hstack((y_tr,y_va))\n",
"\n",
"rf = random_features(x_tr.shape[1], 2048, estimate_gamma(x_tr))\n",
"\n",
"z_tr = rf(x_tr)\n",
"z_va = rf(x_va)\n",
"z_te = rf(x_te)\n",
"\n",
"xz_tr = z_tr\n",
"xz_te = z_te\n",
"pca = RandomizedPCA(10).fit(z_tr)\n",
"p_tr = pca.transform(z_tr)\n",
"p_te = pca.transform(z_te)\n",
"\n",
"rf = random_features(p_tr.shape[1], 2048, estimate_gamma(p_tr))\n",
"\n",
"pz_tr = rf(p_tr)\n",
"pz_te = rf(p_te)\n",
"\n",
"ridge = Ridge().fit(pz_tr, x_tr)\n",
"\n",
"rec_tr = ridge.predict(pz_tr)\n",
"rec_te = ridge.predict(pz_te)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(10,10))\n",
"for i in xrange(100):\n",
" plt.subplot(10,10,i+1)\n",
" plt.imshow(rec_tr[i].reshape(28,28), cmap='Greys_r')\n",
" plt.axis('off')\n",
"plt.show()\n",
"\n",
"plt.figure(figsize=(10,10))\n",
"for i in xrange(100):\n",
" plt.subplot(10,10,i+1)\n",
" plt.imshow(rec_te[i].reshape(28,28), cmap='Greys_r')\n",
" plt.axis('off')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dependence measurament"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def random_features(d,k,gamma):\n",
" w = np.random.randn(d,k)*np.sqrt(2.*gamma)\n",
" b = 2.*np.pi*np.random.rand(k)\n",
" def foo(x):\n",
" return np.sqrt(2./k)*np.cos(np.dot(x,w)+b)\n",
" return foo\n",
"\n",
"def experiment_sin(n=1000,k=100,freq=3,noise=.1):\n",
" x = np.random.randn(n,1)\n",
" y = np.sin(freq*x)+noise*np.random.randn(n,1)\n",
" \n",
" #t = 2*np.pi*np.random.rand(n,1)\n",
" #x = np.cos(freq*t)+noise*np.random.randn(n,1)\n",
" #y = np.sin(freq*t)+noise*np.random.randn(n,1)\n",
"\n",
" k = k\n",
" rf_x = random_features(x.shape[1],k,estimate_gamma(x))\n",
" z_x = rf_x(x)\n",
" rf_y = random_features(y.shape[1],k,estimate_gamma(y))\n",
" z_y = rf_x(y)\n",
"\n",
" f, g = CCA(n_components=1).fit_transform(z_x,z_y)\n",
" return x, y, f, g"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"x, y, f, g = experiment_sin()\n",
"\n",
"plt.figure(figsize=(8,8))\n",
"\n",
"plt.subplot(2,2,1)\n",
"plt.plot(x,y,'.')\n",
"plt.title('Correlation: ' + str(pearsonr(x,y)[0][0]))\n",
"\n",
"plt.subplot(2,2,2)\n",
"plt.plot(f,g,'.')\n",
"plt.title('Correlation: ' + str(pearsonr(f,g)[0][0]))\n",
"\n",
"plt.subplot(2,2,3)\n",
"plt.plot(x,f,'.')\n",
"plt.title('x-transformation')\n",
"plt.xlabel('x')\n",
"plt.ylabel('f')\n",
"\n",
"plt.subplot(2,2,4)\n",
"plt.plot(y,g,'.')\n",
"plt.title('y-transformation')\n",
"plt.xlabel('y')\n",
"plt.ylabel('g')\n",
"\n",
"plt.tight_layout()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"freqs = [1,2,3,4,5,6,7,8,9,10]\n",
"corr_freqs = np.zeros(len(freqs))\n",
"\n",
"for i in xrange(10):\n",
" for fi in xrange(len(freqs)):\n",
" x, y, f, g = experiment_sin(freq=freqs[fi])\n",
" corr_freqs[fi] += pearsonr(f,g)[0][0]\n",
"\n",
"plt.plot(freqs, corr_freqs)\n",
"plt.title('Degradation with respect to frequency')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Low-dimensional kernel mean embeddings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Two-sample testing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f = gzip.open('mnist.pkl.gz', 'rb')\n",
"(x_tr,y_tr), (x_va,y_va), (x_te,y_te) = cPickle.load(f)\n",
"f.close()\n",
"\n",
"x5_tr = x_tr[np.where(y_tr==5)]\n",
"x8_tr = x_tr[np.where(y_tr==8)]\n",
"x5_va = x_va[np.where(y_va==5)]\n",
"x8_va = x_va[np.where(y_va==8)]\n",
"x5_te = x_te[np.where(y_te==5)]\n",
"x8_te = x_te[np.where(y_te==8)]\n",
"\n",
"gamma = estimate_gamma(x5_tr, 100)\n",
"rf = random_features(x5_tr.shape[1],100,gamma)\n",
"\n",
"def mmd(x,y,gamma):\n",
" return rbf_kernel(x,gamma=gamma).mean()-\\\n",
" 2*rbf_kernel(x,y,gamma=gamma).mean()+\\\n",
" rbf_kernel(y,gamma=gamma).mean()\n",
"\n",
"def rmmd(x,y,rf):\n",
" return np.power(np.linalg.norm(rf(x).mean(0)-rf(y).mean(0)),2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print mmd(x5_tr,x8_tr,gamma)\n",
"print rmmd(x5_tr,x8_tr,rf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print mmd(x5_tr,x5_va,gamma)\n",
"print rmmd(x5_tr,x5_va,rf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def f(): mmd(x5_tr,x5_va,gamma)\n",
"print time_f(f,10)\n",
"\n",
"def f(): rmmd(x5_tr,x5_va,rf)\n",
"print time_f(f,10)"
]
},
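{
"cell_type": "markdown",
"metadata": {},
"source": [
"How would we calibrate these statistics? Below is a rough sketch (assuming the variables x5_tr, x8_tr, x5_va and rf from the cells above): a permutation test that pools the two samples, reshuffles them, and recomputes the random-feature statistic to approximate its null distribution."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def permutation_test(x, y, stat, n_perm=100):\n",
"    # Under the null hypothesis x and y come from the same distribution, so pooling\n",
"    # and resplitting them at random should give statistics comparable to the observed one.\n",
"    observed = stat(x, y)\n",
"    pooled = np.vstack((x, y))\n",
"    count = 0\n",
"    for _ in xrange(n_perm):\n",
"        p = np.random.permutation(pooled.shape[0])\n",
"        xp, yp = pooled[p[:x.shape[0]]], pooled[p[x.shape[0]:]]\n",
"        if stat(xp, yp) >= observed:\n",
"            count += 1\n",
"    # p-value: fraction of permuted statistics at least as large as the observed one\n",
"    return (count + 1.) / (n_perm + 1.)\n",
"\n",
"print permutation_test(x5_tr, x8_tr, lambda a, b: rmmd(a, b, rf))\n",
"print permutation_test(x5_tr, x5_va, lambda a, b: rmmd(a, b, rf))"
]
},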
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Distribution classification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier as RFC\n",
"from scipy.interpolate import UnivariateSpline as sp\n",
"from sklearn.preprocessing import scale\n",
"from sklearn.mixture import GMM\n",
"\n",
"def continuous_cause(n,k,p1,p2):\n",
" g = GMM(k)\n",
" g.means_ = p1*np.random.randn(k,1)\n",
" g.covars_ = np.abs(p2*np.random.randn(k,1)+1)\n",
" g.weights_ = abs(np.random.rand(k,1))\n",
" g.weights_ = g.weights_/sum(g.weights_)\n",
" return scale(g.sample(n))\n",
"\n",
"def noise(n,v):\n",
" return v*np.random.randn(n,1)\n",
"\n",
"def mechanism(x,d):\n",
" g = np.linspace(min(x)-np.std(x),max(x)+np.std(x),d)\n",
" return sp(g,np.random.randn(d))(x.flatten())[:,np.newaxis]\n",
"\n",
"def pair(n,k,p1,p2,v,d):\n",
" x = continuous_cause(n,k,p1,p2)\n",
" y = scale(scale(mechanism(x,d))+noise(n,v))\n",
" return (x,y)\n",
"\n",
"def synset(N):\n",
" res_x_fw = []; res_x_bw = []\n",
" for i in range(N):\n",
" n = np.int(np.power(10,np.random.uniform(2,4)))\n",
" k = np.random.randint(1,5)\n",
" p1 = np.random.uniform(0,5)\n",
" p2 = np.random.uniform(0,5)\n",
" v = np.random.uniform(0,5)\n",
" d = np.random.randint(4,5)\n",
" (x,y) = pair(n,k,p1,p2,v,d)\n",
" res_x_fw += [np.hstack((x,y))]\n",
" res_x_bw += [np.hstack((y,x))]\n",
" return (res_x_fw + res_x_bw,np.hstack((np.zeros(N),np.ones(N))))\n",
"\n",
"def kdist(args):\n",
" x,w = args\n",
" fx = np.dot(x[:,0][:,np.newaxis],w[0,:][np.newaxis,:])\n",
" fy = np.dot(x[:,1][:,np.newaxis],w[1,:][np.newaxis,:])\n",
" return np.hstack((np.cos(fx).mean(0),np.cos(fy).mean(0),np.cos(fx+fy).mean(0)))\n",
"\n",
"def fdist(x,w):\n",
" return np.array([kdist((scale(xi),w)) for xi in x])\n",
"\n",
"def gen_random_parameter(M=100, G=[0.1,1,10]):\n",
" return np.vstack([np.sqrt(g)*np.random.randn(M,2) for g in G]).T\n",
"\n",
"def gen_dataset(w,N=100):\n",
" x,y = synset(N)\n",
" z = fdist(x,w)\n",
" p = np.random.permutation(2*N)\n",
" x2 = []\n",
" for i in xrange(len(p)):\n",
" x2.append(x[p[i]])\n",
" return x2,y[p],z[p]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"w = gen_random_parameter()\n",
"x,y,z = gen_dataset(w)\n",
"\n",
"plt.figure(figsize=(8,8))\n",
"p = np.random.permutation(len(x))\n",
"for i in xrange(25):\n",
" plt.subplot(5,5,i+1)\n",
" plt.plot(x[p[i]][:,0],x[p[i]][:,1],'.')\n",
" plt.title(y[p[i]])\n",
" plt.axis('off')\n",
"plt.tight_layout()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"_,y_tr,z_tr = gen_dataset(w, N=1000)\n",
"_,y_te,z_te = gen_dataset(w, N=1000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"print np.mean(RandomForestClassifier(n_estimators=500).fit(z_tr,y_tr).predict(z_te)==y_te)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}