Created
September 23, 2014 12:32
-
-
Save cdetrio/c966078203a9297b0d30 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from __future__ import division\n", | |
"\n", | |
"import warnings\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import scipy.linalg\n", | |
"import scipy.stats\n", | |
"from sklearn import datasets\n", | |
"from sklearn.decomposition import FactorAnalysis\n", | |
"from sklearn.decomposition import KernelPCA\n", | |
"from sklearn.decomposition import PCA\n", | |
"%pylab inline\n", | |
"\n", | |
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning) # annoying pandas bug" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# TruthCoin\n", | |
"**https://github.com/psztorc/Truthcoin**\n", | |
"\n", | |
"## A simplified example of multi-event resolution\n", | |
" * no reputation / stake. uniform vote weights\n", | |
" * only binary events / discrete outcomes." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Each column of the voter matrix is one binary event (1 = yes, 0 = no):\n", | |
"# col[0] = Obama is the u.s. president (2014),\n", | |
"# col[1] = Brazil won the 2014 fifa world cup,\n", | |
"# col[2] = Djokovic won the 2014 wimbledon tennis championship,\n", | |
"# col[3] = MtGox exchange goes insolvent (1Q 2014)\n", | |
"# col[4] = Professor bitcorn won his bet (\"I predict that Bitcoin will trade for under $10 a share by the first half of 2014\")\n", | |
"#\n", | |
"# Each row is one voter's ballot across all of the events.\n", | |
"\n", | |
"VoterMatrix = np.matrix([\n", | |
"    [1, 0, 1, 1, 0],   # first voter\n", | |
"    [1, 1, 1, 1, 0],   # ignorant about sports\n", | |
"    [1, 0, 1, 1, 0],\n", | |
"    [0, 0, 1, 1, 0],   # republican in denial\n", | |
"    [1, 0, 1, 1, 1]])  # prof bitcorn\n", | |
"\n", | |
"print VoterMatrix\n", | |
"\n", | |
"# one label per event column: outcome_1 .. outcome_N\n", | |
"features = ['outcome_%d' % (k + 1) for k in range(VoterMatrix.shape[1])]\n", | |
"voteMatrix_pd = pd.DataFrame(VoterMatrix, columns=features)\n", | |
"print ' voteMatrix_pd:'\n", | |
"print voteMatrix_pd" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"[[1 0 1 1 0]\n", | |
" [1 1 1 1 0]\n", | |
" [1 0 1 1 0]\n", | |
" [0 0 1 1 0]\n", | |
" [1 0 1 1 1]]\n", | |
" voteMatrix_pd:\n", | |
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n", | |
"0 1 0 1 1 0\n", | |
"1 1 1 1 1 0\n", | |
"2 1 0 1 1 0\n", | |
"3 0 0 1 1 0\n", | |
"4 1 0 1 1 1\n", | |
"\n", | |
"[5 rows x 5 columns]\n" | |
] | |
} | |
], | |
"prompt_number": 21 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Simple (non-SVD) event resolution\n", | |
" * using a uniform reputation / voter weighting" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def reWeight(Vec):\n", | |
"    \"\"\"Get the relative influence of numbers, treat NaN as influence-less.\n", | |
"\n", | |
"    Vec: array-like of numbers; NaN entries are given zero weight.\n", | |
"    Returns a new float ndarray with the same shape as Vec whose entries sum to 1.\n", | |
"    Vec itself is not modified. If the non-NaN entries sum to zero the division\n", | |
"    produces NaN/inf, exactly as a plain numpy division by zero would.\n", | |
"    \"\"\"\n", | |
"    weights = np.array(Vec, dtype=float)  # float copy, so the caller's data stays untouched\n", | |
"    weights[np.isnan(weights)] = 0  # elementwise mask: works for vectors and for 2-D input alike\n", | |
"    return weights / np.sum(weights)\n", | |
"\n", | |
"\n", | |
"# smoke test: four equal inputs should come back as four equal shares\n", | |
"rew = reWeight(np.ones(4, dtype=int))\n", | |
"print \"reweighted vector test. uniform vector\", rew\n", | |
"\n", | |
"def getWeight(Vec, AddMean=0):\n", | |
"    \"\"\"Takes an array (vector in practice), and returns proportional distance from zero.\n", | |
"\n", | |
"    Vec: array of numbers; only their absolute values are used.\n", | |
"    AddMean: when 1, the mean absolute value is added to every element before normalizing.\n", | |
"    Returns the absolute values divided by their total, so the result sums to 1.\n", | |
"    An all-zero input yields equal weights instead of a division by zero.\n", | |
"    \"\"\"\n", | |
"    New = np.abs(Vec)  # absolute value\n", | |
"    if AddMean == 1:  # add the mean to each element of the vector\n", | |
"        New = New + np.mean(New)\n", | |
"    if np.sum(New) == 0:  # nothing to normalize by: give every element the same weight\n", | |
"        New = New + 1\n", | |
"    # explicit np.* calls and a float divisor: the result no longer depends on\n", | |
"    # %pylab shadowing the builtins or on __future__ division being active\n", | |
"    return New / float(np.sum(New))  # normalize\n", | |
"\n", | |
"\n", | |
"# one row per voter, each with the same raw weight of 1\n", | |
"uniformWeight = np.ones((len(VoterMatrix), 1), dtype=int)\n", | |
"uniformReputation = getWeight(uniformWeight)\n", | |
"print \"\\nuniform weights:\\n\", uniformWeight\n", | |
"print \"\\nuniform reputation:\\n\", uniformReputation\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"reweighted vector test. uniform vector [ 0.25 0.25 0.25 0.25]\n", | |
"\n", | |
"uniform weights:\n", | |
"[[1]\n", | |
" [1]\n", | |
" [1]\n", | |
" [1]\n", | |
" [1]]\n", | |
"\n", | |
"uniform reputation:\n", | |
"[[ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]]\n" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Weigh votes and resolve decisions.\n", | |
"Measure each decision by taking the dot product of its vote column with the reputation vector. With uniform reputation this is simply the average vote among all voters.\n", | |
"\n", | |
"* an SVD result would change the reputation vector, but without SVD it's a simple uniform vector (all votes equal)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# port of GetDecisionOutcomes() https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L139\n", | |
"\n", | |
"# VoterMatrix\n", | |
"# we're using the regular matrix here. data is not even zero-centered.\n", | |
"\n", | |
"# NOTE(review): `rep` was never assigned anywhere in this notebook, so the cell failed on a fresh kernel.\n", | |
"# The uniform reputation reproduces the recorded output (0.2 per voter) -- confirm this was the intent.\n", | |
"rep = uniformReputation\n", | |
"\n", | |
"MaskedVoterMatrix = np.ma.masked_array(VoterMatrix, np.isnan(VoterMatrix))\n", | |
"# True for every voter who cast a (non-NaN) vote on decision 0.\n", | |
"# `~` is logical NOT; unary minus on a boolean array is rejected by newer numpy.\n", | |
"matrix_mask_thingie = ~MaskedVoterMatrix[...,0].mask\n", | |
"# the mask drops voters whose vote is missing (NaN); reWeight() then renormalizes the\n", | |
"# reputation of the voters that remain. No votes are missing in this example.\n", | |
"# corresponds to https://github.com/psztorc/Truthcoin/blob/master/pylib/consensus/consensus.py#L113-L114\n", | |
"row = reWeight( rep [ matrix_mask_thingie ] )\n", | |
"print \"row:\", row\n", | |
"col = MaskedVoterMatrix[matrix_mask_thingie, 0]\n", | |
"print \"col:\", col\n", | |
"\n", | |
"\n", | |
"# one outcome per decision: reputation-weighted sum of the votes cast on it\n", | |
"decisions = []\n", | |
"for i in range(VoterMatrix.shape[1]):\n", | |
"    voted = ~MaskedVoterMatrix[...,i].mask\n", | |
"    row = reWeight( rep [ voted ] )\n", | |
"    col = MaskedVoterMatrix[ voted, i]\n", | |
"    col = np.array(col, dtype=float)\n", | |
"    row = np.transpose(row)[0]\n", | |
"    decisions.append(np.dot(col, row))\n", | |
"\n", | |
"print \"\\ndecisions:\"\n", | |
"print decisions\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"row: [[ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]\n", | |
" [ 0.2]]\n", | |
"col: [[1 1 1 0 1]]\n", | |
"\n", | |
"decisions:\n", | |
"[0.80000000000000004, 0.20000000000000001, 1.0, 1.0, 0.20000000000000001]\n" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* **that's the output for a simplified multi-decision resolution from votes on binary outcomes.**\n", | |
" - map values between [0,1] to one of {0, 0.5, 1}\n", | |
"* **in this simplified version there is no reputation or vote stake amounts. every vote is equal ([0.2, 0.2, 0.2, 0.2, 0.2])**\n", | |
"\n", | |
"* an extended method would incorporate vote stake/deposit amounts to weigh votes. if we also add scaled/continuous outcomes then the resulting consensus method would be a form of multi-decision SchellingCoin (or equivalently, TruthCoin without reputation)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# SVD for Reputation based voting\n", | |
"* SVD operates on a covariance matrix. covariance calc needs data matrix of normalized continuous values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# zero-center each feature/column (mean = 0); the covariance and SVD below work on these deviations.\n", | |
"# Columns are deliberately NOT scaled to std = 1: outcomes everyone agrees on have std = 0,\n", | |
"# so dividing by the standard deviation would divide by zero.\n", | |
"# Subtracting the Series of column means is aligned on column labels and computes each mean once.\n", | |
"normed = voteMatrix_pd - voteMatrix_pd.mean()\n", | |
"\n", | |
"# normed data matrix is only zero-centered (not auto-scaled)\n", | |
"print '\\nNormalized dataset:'\n", | |
"print normed[:5]\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"Normalized dataset:\n", | |
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n", | |
"0 0.2 -0.2 0 0 -0.2\n", | |
"1 0.2 0.8 0 0 -0.2\n", | |
"2 0.2 -0.2 0 0 -0.2\n", | |
"3 -0.8 -0.2 0 0 -0.2\n", | |
"4 0.2 -0.2 0 0 0.8\n", | |
"\n", | |
"[5 rows x 5 columns]\n" | |
] | |
} | |
], | |
"prompt_number": 23 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"this covariance calc is from the [original pca example](http://nbviewer.ipython.org/github/tmsquasher/data-science-notebooks/blob/master/PCA%20Basics.ipynb)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"### pairwise sample covariance between the zero-centered vote columns\n", | |
"# The original example assumed cov(x1, x2) = sum(x1*x2) / num_observations. pandas divides by\n", | |
"# (num_observations - 1) instead, which is why outcome_1 shows 0.8 / 4 = 0.20 rather than 0.16.\n", | |
"# DataFrame.cov() builds the whole labelled features x features matrix in one call.\n", | |
"cov_df = normed.cov()\n", | |
"\n", | |
"print 'Covariance matrix:'\n", | |
"print cov_df\n", | |
"# everybody agrees on outcomes 3 & 4 (tennis winner, mtgox solvency), so those columns have zero variance" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Covariance matrix:\n", | |
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n", | |
"outcome_1 0.20 0.05 0 0 0.05\n", | |
"outcome_2 0.05 0.20 0 0 -0.05\n", | |
"outcome_3 0.00 0.00 0 0 0.00\n", | |
"outcome_4 0.00 0.00 0 0 0.00\n", | |
"outcome_5 0.05 -0.05 0 0 0.20\n", | |
"\n", | |
"[5 rows x 5 columns]\n" | |
] | |
} | |
], | |
"prompt_number": 24 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 3. Singular Value Decomposition (SVD)\n", | |
"\n", | |
"**[U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component.**\n", | |
"\n", | |
"**[S]: Represents how much variance is explained by each component.**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# SVD of the covariance matrix, using scipy's implementation.\n", | |
"# scipy returns (U, singular values, V transposed); only u and s are used below.\n", | |
"u, s, v = scipy.linalg.svd(cov_df)\n", | |
"print 'U: (feature loading for each component)'\n", | |
"print pd.DataFrame(u, index=features) # one column per component; column 0 is the first loading\n", | |
"print '\\nExplained variance:\\n', s\n", | |
"\n", | |
"# NOTE(review): the referenced Truthcoin code appears to project the *centered votes* onto U,\n", | |
"# giving one score per voter. This cell projects cov_df instead (one value per feature), which\n", | |
"# only lines up further down because the example has 5 voters and 5 events -- confirm.\n", | |
"firstScore = np.transpose(np.dot(cov_df, u))[0]\n", | |
"print \"\\nfirstScore:\"\n", | |
"print firstScore\n", | |
"\n", | |
"# two shifted copies of the scores: Set1 adds |min|, Set2 subtracts max\n", | |
"# (explicit np.* calls, so this does not rely on %pylab shadowing abs/min/max)\n", | |
"Set1 = firstScore + np.abs(np.min(firstScore))\n", | |
"print \"\\nSet1:\"\n", | |
"print Set1\n", | |
"Set2 = firstScore - np.max(firstScore)\n", | |
"print \"Set2:\"\n", | |
"print Set2" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"U: (feature loading for each component)\n", | |
" 0 1 2 3 4\n", | |
"outcome_1 -0.816497 -8.088567e-17 0.57735 0 0\n", | |
"outcome_2 -0.408248 -7.071068e-01 -0.57735 0 0\n", | |
"outcome_3 0.000000 0.000000e+00 0.00000 1 0\n", | |
"outcome_4 0.000000 0.000000e+00 0.00000 0 1\n", | |
"outcome_5 -0.408248 7.071068e-01 -0.57735 0 0\n", | |
"\n", | |
"[5 rows x 5 columns]\n", | |
"\n", | |
"Explained variance:\n", | |
"[ 0.25 0.25 0.1 0. 0. ]\n", | |
"\n", | |
"firstScore:\n", | |
"[-0.20412415 -0.10206207 0. 0. -0.10206207]\n", | |
"\n", | |
"Set1:\n", | |
"[ 0. 0.10206207 0.20412415 0.20412415 0.10206207]\n", | |
"Set2:\n", | |
"[-0.20412415 -0.10206207 0. 0. -0.10206207]\n" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# note on these two sets: https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L40-L51\n", | |
"# weight the vote matrix by each candidate score set, then normalize the result to sum to 1\n", | |
"New1 = getWeight(np.dot(Set1, voteMatrix_pd))\n", | |
"New2 = getWeight(np.dot(Set2, voteMatrix_pd))\n", | |
"\n", | |
"print \"\\nNew1:\"\n", | |
"print New1\n", | |
"print \"New2:\"\n", | |
"print New2\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n", | |
"New1:\n", | |
"[ 0.22222222 0.05555556 0.33333333 0.33333333 0.05555556]\n", | |
"New2:\n", | |
"[ 0.28571429 0.07142857 0.28571429 0.28571429 0.07142857]\n" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### incomplete. more calcs follow for adjusting voter reputations." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment