Skip to content

Instantly share code, notes, and snippets.

@cdetrio
Created September 23, 2014 12:32
Show Gist options
  • Save cdetrio/c966078203a9297b0d30 to your computer and use it in GitHub Desktop.
Save cdetrio/c966078203a9297b0d30 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import division\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import scipy.linalg\n",
"import scipy.stats\n",
"from sklearn import datasets\n",
"from sklearn.decomposition import FactorAnalysis\n",
"from sklearn.decomposition import KernelPCA\n",
"from sklearn.decomposition import PCA\n",
"%pylab inline\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning) # annoying pandas bug"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TruthCoin\n",
"**https://github.com/psztorc/Truthcoin**\n",
"\n",
"## A simplified example of multi-event resolution\n",
" * no reputation / stake. uniform vote weights\n",
" * only binary events / discrete outcomes."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Vote matrix: one row per voter, one column per event.\n",
"# (1 presumably means the voter reports the event as true, 0 as false.)\n",
"#   col[0] = Obama is the u.s. president (2014),\n",
"#   col[1] = Brazil won the 2014 fifa world cup,\n",
"#   col[2] = Djokovic won the 2014 wimbledon tennis championship,\n",
"#   col[3] = MtGox exchange goes insolvent (1Q 2014)\n",
"#   col[4] = Professor bitcorn won his bet (\"I predict that Bitcoin will trade for under $10 a share by the first half of 2014\")\n",
"\n",
"VoterMatrix = np.matrix([\n",
"    [1, 0, 1, 1, 0],  # first voter\n",
"    [1, 1, 1, 1, 0],  # ignorant about sports\n",
"    [1, 0, 1, 1, 0],\n",
"    [0, 0, 1, 1, 0],  # republican in denial\n",
"    [1, 0, 1, 1, 1],  # prof bitcorn\n",
"])\n",
"print VoterMatrix\n",
"\n",
"# Same ballots as a labelled DataFrame: outcome_1 .. outcome_5, one label per column.\n",
"features = list(map('outcome_{0}'.format, range(1, VoterMatrix.shape[1] + 1)))\n",
"voteMatrix_pd = pd.DataFrame(VoterMatrix, columns=features)\n",
"print ' voteMatrix_pd:'\n",
"print voteMatrix_pd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[[1 0 1 1 0]\n",
" [1 1 1 1 0]\n",
" [1 0 1 1 0]\n",
" [0 0 1 1 0]\n",
" [1 0 1 1 1]]\n",
" voteMatrix_pd:\n",
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n",
"0 1 0 1 1 0\n",
"1 1 1 1 1 0\n",
"2 1 0 1 1 0\n",
"3 0 0 1 1 0\n",
"4 1 0 1 1 1\n",
"\n",
"[5 rows x 5 columns]\n"
]
}
],
"prompt_number": 21
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Simple (non-SVD) event resolution\n",
" * using a uniform reputation / voter weighting"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def reWeight(Vec):\n",
"    \"\"\"Get the relative influence of numbers, treat NaN as influence-less.\n",
"\n",
"    Vec     -- array-like of numeric weights; NaN entries count as zero weight.\n",
"    returns -- float ndarray with the same shape as Vec whose entries sum to 1.\n",
"\n",
"    If every entry is zero or NaN the total is 0, so the result is all NaN\n",
"    (numpy warns about the 0/0 division); pass at least one non-zero weight.\n",
"    \"\"\"\n",
"    # np.array(...) copies, so the caller's data is never modified.\n",
"    vec2 = np.array(Vec, dtype=float)\n",
"    # Use np.isnan explicitly: a bare `isnan` only exists after `%pylab inline`.\n",
"    # The boolean-mask assignment also works for 2-D input, where the old\n",
"    # element-by-element `if isnan(Vec[i])` test raised on multi-column rows.\n",
"    vec2[np.isnan(vec2)] = 0\n",
"    return vec2 / np.sum(vec2)\n",
"\n",
"\n",
"# Smoke test: four equal weights should each come out as 0.25.\n",
"rew = reWeight(np.ones(4, dtype=int))\n",
"print \"reweighted vector test. uniform vector\", rew\n",
"\n",
"def getWeight(Vec, AddMean=0):\n",
"    \"\"\"Takes an array (vector in practice), and returns proportional distance from zero.\n",
"\n",
"    Vec     -- array-like of numbers.\n",
"    AddMean -- when 1, the mean of the absolute values is added to every\n",
"               element before normalising.\n",
"    returns -- array of |Vec| (optionally shifted) divided by its total, so\n",
"               the entries sum to 1.\n",
"    \"\"\"\n",
"    # Explicit np.* calls: the builtin sum() adds rows rather than all elements\n",
"    # of a 2-D array, and a bare mean() only exists after `%pylab inline`.\n",
"    New = np.abs(Vec)  # Absolute Value\n",
"    if AddMean == 1:  # Add the mean to each element of the vector\n",
"        New = New + np.mean(New)\n",
"    if np.sum(New) == 0:  # All zeros: fall back to equal weights instead of 0/0\n",
"        New = New + 1\n",
"    # float() forces true division for integer input even where\n",
"    # `from __future__ import division` is not in effect.\n",
"    New = New / float(np.sum(New))  # Normalize\n",
"    return New\n",
"\n",
"\n",
"# One weight of 1 per voter, as a column vector; getWeight turns it into equal shares.\n",
"uniformWeight = np.ones((len(VoterMatrix), 1), dtype=int)\n",
"uniformReputation = getWeight(uniformWeight)\n",
"print \"\\nuniform weights:\\n\", uniformWeight\n",
"print \"\\nuniform reputation:\\n\", uniformReputation\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"reweighted vector test. uniform vector [ 0.25 0.25 0.25 0.25]\n",
"\n",
"uniform weights:\n",
"[[1]\n",
" [1]\n",
" [1]\n",
" [1]\n",
" [1]]\n",
"\n",
"uniform reputation:\n",
"[[ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]]\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weigh votes and resolve decisions.\n",
"measure each decision by taking a dot product. essentially this just uses the average vote value among all voters. \n",
"\n",
"* an SVD result would change the reputation vector, but without SVD it's a simple uniform vector (all votes equal)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# port of GetDecisionOutcomes() https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L139\n",
"\n",
"# VoterMatrix\n",
"# we're using the regular matrix here. data is not even zero-centered.\n",
"\n",
"# `rep` was used here without ever being defined in this notebook. The only\n",
"# reputation vector built above is the uniform one, so bind it explicitly.\n",
"rep = uniformReputation\n",
"\n",
"MaskedVoterMatrix = np.ma.masked_array(VoterMatrix, np.isnan(VoterMatrix))\n",
"# has_vote[v, e] is True where voter v cast a (non-NaN) vote on event e.\n",
"# `~` is boolean NOT; unary minus on a boolean mask is rejected by current numpy.\n",
"has_vote = ~np.asarray(np.ma.getmaskarray(MaskedVoterMatrix))\n",
"votes = np.asarray(VoterMatrix, dtype=float)\n",
"\n",
"# What the mask is for: voters with a missing vote on an event are dropped for\n",
"# that event, and reWeight rescales the remaining reputation so it sums to 1.\n",
"# corresponds to https://github.com/psztorc/Truthcoin/blob/master/pylib/consensus/consensus.py#L113-L114\n",
"matrix_mask_thingie = has_vote[:, 0]\n",
"row = reWeight(rep[matrix_mask_thingie])\n",
"print \"row:\", row\n",
"col = MaskedVoterMatrix[matrix_mask_thingie, 0]\n",
"print \"col:\", col\n",
"\n",
"\n",
"# Each decision is the reputation-weighted average of the votes cast on that event.\n",
"decisions = []\n",
"for i in range(votes.shape[1]):\n",
"    voted = has_vote[:, i]\n",
"    row = reWeight(rep[voted])[:, 0]  # (k, 1) column -> flat vector of k weights\n",
"    col = votes[voted, i]\n",
"    decisions.append(np.dot(col, row))\n",
"\n",
"print \"\\ndecisions:\"\n",
"print decisions\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"row: [[ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]\n",
" [ 0.2]]\n",
"col: [[1 1 1 0 1]]\n",
"\n",
"decisions:\n",
"[0.80000000000000004, 0.20000000000000001, 1.0, 1.0, 0.20000000000000001]\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* **that's the output for a simplified multi-decision resolution from votes on binary outcomes.**\n",
" - map values between [0,1] to one of {0, 0.5, 1}\n",
"* **in this simplified version there is no reputation or vote stake amounts. every vote is equal ([0.2, 0.2, 0.2, 0.2, 0.2])**\n",
"\n",
"* an extended method would incorporate vote stake/deposit amounts to weigh votes. if we also add scaled/continuous outcomes then the resulting consensus method would be a form of multi-decision SchellingCoin (or equivalently, TruthCoin without reputation)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SVD for Reputation based voting\n",
"* SVD operates on a covariance matrix. covariance calc needs data matrix of normalized continuous values"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Zero-centre each feature/column (mean = 0); the SVD below works on the\n",
"# covariance of this centred matrix.\n",
"# The columns are NOT scaled to std = 1: outcome_3 and outcome_4 are unanimous,\n",
"# so their std is 0 and dividing by it would produce NaN.\n",
"# Subtracting the Series of column means broadcasts over the rows in one step,\n",
"# instead of recomputing the mean for every element.\n",
"normed = voteMatrix_pd - voteMatrix_pd.mean()\n",
"\n",
"# normed data matrix is only zero-centered (not auto-scaled)\n",
"print '\\nNormalized dataset:'\n",
"print normed[:5]\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Normalized dataset:\n",
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n",
"0 0.2 -0.2 0 0 -0.2\n",
"1 0.2 0.8 0 0 -0.2\n",
"2 0.2 -0.2 0 0 -0.2\n",
"3 -0.8 -0.2 0 0 -0.2\n",
"4 0.2 -0.2 0 0 0.8\n",
"\n",
"[5 rows x 5 columns]\n"
]
}
],
"prompt_number": 23
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"this covariance calc is from the [original pca example](http://nbviewer.ipython.org/github/tmsquasher/data-science-notebooks/blob/master/PCA%20Basics.ipynb)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Sample covariance between every pair of outcome columns.\n",
"# DataFrame.cov() centres each column itself and divides by (n - 1), so the\n",
"# sum(x1*x2) / num_observations shortcut from the original PCA example is NOT\n",
"# what is computed here: with 5 voters the divisor is 4, not 5.\n",
"cov_df = normed.cov()\n",
"\n",
"print 'Covariance matrix:'\n",
"print cov_df\n",
"# everybody agrees on outcomes 3 & 4 (tennis winner, mtgox solvency), so those columns have zero variance"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Covariance matrix:\n",
" outcome_1 outcome_2 outcome_3 outcome_4 outcome_5\n",
"outcome_1 0.20 0.05 0 0 0.05\n",
"outcome_2 0.05 0.20 0 0 -0.05\n",
"outcome_3 0.00 0.00 0 0 0.00\n",
"outcome_4 0.00 0.00 0 0 0.00\n",
"outcome_5 0.05 -0.05 0 0 0.20\n",
"\n",
"[5 rows x 5 columns]\n"
]
}
],
"prompt_number": 24
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Singular Value Decomposition (SVD)\n",
"\n",
"**[U]: Rows are the original features and columns are the PCA 'components'. Each cell gives the 'loading' of the feature on the corresponding component.**\n",
"\n",
"**[S]: Represents how much variance is explained by each component.**"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# SciPy's SVD of the covariance matrix: the columns of u are the component\n",
"# loadings and s holds the variance explained by each component.\n",
"u, s, v = scipy.linalg.svd(cov_df)\n",
"print 'U: (feature loading for each component)'\n",
"print pd.DataFrame(u, index=features) # first loading\n",
"print '\\nExplained variance:\\n', s\n",
"\n",
"# First score = every voter's centred ballot projected onto the first loading,\n",
"# giving one value per voter. The previous np.dot(cov_df, u) only returned the\n",
"# first loading scaled by its singular value (one value per event), which is\n",
"# not a score. Stored outputs from before this fix are stale until re-run.\n",
"firstScore = np.transpose(np.dot(normed.values, u))[0]\n",
"print \"\\nfirstScore:\"\n",
"print firstScore\n",
"\n",
"# The sign of an SVD component is arbitrary, so build both shifted candidates:\n",
"# Set1 moves the scores so the minimum is 0, Set2 so the maximum is 0.\n",
"Set1 = firstScore + np.abs(np.min(firstScore))\n",
"print \"\\nSet1:\"\n",
"print Set1\n",
"Set2 = firstScore - np.max(firstScore)\n",
"print \"Set2:\"\n",
"print Set2"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"U: (feature loading for each component)\n",
" 0 1 2 3 4\n",
"outcome_1 -0.816497 -8.088567e-17 0.57735 0 0\n",
"outcome_2 -0.408248 -7.071068e-01 -0.57735 0 0\n",
"outcome_3 0.000000 0.000000e+00 0.00000 1 0\n",
"outcome_4 0.000000 0.000000e+00 0.00000 0 1\n",
"outcome_5 -0.408248 7.071068e-01 -0.57735 0 0\n",
"\n",
"[5 rows x 5 columns]\n",
"\n",
"Explained variance:\n",
"[ 0.25 0.25 0.1 0. 0. ]\n",
"\n",
"firstScore:\n",
"[-0.20412415 -0.10206207 0. 0. -0.10206207]\n",
"\n",
"Set1:\n",
"[ 0. 0.10206207 0.20412415 0.20412415 0.10206207]\n",
"Set2:\n",
"[-0.20412415 -0.10206207 0. 0. -0.10206207]\n"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Dot each candidate set with the vote matrix, then normalise with getWeight.\n",
"# note on these two sets: https://github.com/psztorc/Truthcoin/blob/master/lib/consensus/ConsensusMechanism.r#L40-L51\n",
"New1, New2 = (getWeight(np.dot(candidate, voteMatrix_pd)) for candidate in (Set1, Set2))\n",
"\n",
"print \"\\nNew1:\"\n",
"print New1\n",
"print \"New2:\"\n",
"print New2\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"New1:\n",
"[ 0.22222222 0.05555556 0.33333333 0.33333333 0.05555556]\n",
"New2:\n",
"[ 0.28571429 0.07142857 0.28571429 0.28571429 0.07142857]\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### incomplete. more calcs follow for adjusting voter reputations."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment