Skip to content

Instantly share code, notes, and snippets.

@xccds
Created April 21, 2015 12:44
Show Gist options
  • Save xccds/6dfd67737f53aa40f50a to your computer and use it in GitHub Desktop.
Save xccds/6dfd67737f53aa40f50a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 用spark进行数据挖掘\n",
"\n",
"- 本例使用spark的python接口,对titanic数据做了一个完整的尝试\n",
"- 首先用算质数的例子显示,即使在单机中,spark利用了多核处理能提高计算效率\n",
"- 之后读入数据集,并对数据进行预处理\n",
" - 步骤1:对名字进行了处理,用正则取出四种常见title\n",
" - 步骤2:基于title,对年龄进行了缺失值处理\n",
" - 步骤3:将类别变量均转为0-1变量\n",
"- 数据合并整理成spark.mllib需要的格式\n",
"- 使用线性模型建模,并得出错误率\n",
"- 本例代码参考了《machine learning with spark》一书"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from pyspark import SparkContext\n",
"sc = SparkContext( 'local[4]')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 算质数的例子"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def isprime(n):\n",
"    \"\"\"Return True when abs(int(n)) is prime, False otherwise.\"\"\"\n",
"    # Coerce to a non-negative integer before testing.\n",
"    n = abs(int(n))\n",
"    if n == 2:\n",
"        return True\n",
"    # 0, 1 and every even number other than 2 are not prime.\n",
"    if n < 2 or n % 2 == 0:\n",
"        return False\n",
"    # Trial-divide by the odd numbers up to the square root of n.\n",
"    return all(n % x for x in range(3, int(n**0.5) + 1, 2))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"78498\n",
"78498\n",
"78498\n",
"78498\n",
"1 loops, best of 3: 4.81 s per loop\n"
]
}
],
"source": [
"%%timeit\n",
"import numpy as np\n",
"nums = xrange(1000000)\n",
"print np.sum([1 for x in nums if isprime(x)])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"78498\n",
"78498\n",
"78498\n",
"78498\n",
"1 loops, best of 3: 2.71 s per loop\n"
]
}
],
"source": [
"%%timeit\n",
"nums = sc.parallelize(xrange(1000000))\n",
"print nums.filter(isprime).count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- titanic例子,先读入变量名"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vname = !head -1 titanic.csv\n",
"vname = vname[0].split(',')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write a header-less copy of the CSV so Spark only reads data rows.\n",
"# Re-running this simply regenerates the same file.\n",
"!sed 1d titanic.csv > titanic_noheader.csv\n",
"raw = sc.textFile('titanic_noheader.csv')\n",
"raw.first() # raw data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Handle the title: start by pulling the quoted name field out of a raw line\n",
"def extract_name(x):\n",
"    \"\"\"Return the text between the first and the last double quote in x.\"\"\"\n",
"    import re\n",
"    quoted = re.search(\"\\\"(.*)\\\"\", x)\n",
"    return quoted.group(1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'Braund, Mr. Owen Harris',\n",
" u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n",
" u'Heikkinen, Miss. Laina',\n",
" u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names = raw.map(extract_name)\n",
"names.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'Mr', 517),\n",
" (u'Miss', 182),\n",
" (u'Mrs', 125),\n",
" (u'Master', 40),\n",
" (u'Dr', 7),\n",
" (u'Rev', 6),\n",
" (u'Major', 2),\n",
" (u'Mlle', 2),\n",
" (u'Col', 2),\n",
" (u'Sir', 1),\n",
" (u'the Countess', 1),\n",
" (u'Don', 1),\n",
" (u'Capt', 1),\n",
" (u'Lady', 1),\n",
" (u'Jonkheer', 1),\n",
" (u'Ms', 1),\n",
" (u'Mme', 1)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'Mr', u'Miss', u'Mrs', u'Master']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n",
"top_title"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def assign_title(x):\n",
"    \"\"\"Keep x when it is listed in top_title; otherwise collapse it to u'other'.\"\"\"\n",
"    return x if x in top_title else u'other'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'Mr', u'Mrs', u'Miss', u'Mrs']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"title_less = title.map(assign_title)\n",
"title_less.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Handle the remaining fields\n",
"def split_rest(x):\n",
"    \"\"\"Drop the quoted name field and its trailing comma from x, then split on commas.\"\"\"\n",
"    import re\n",
"    without_name = re.sub(\"\\\"(.*)\\\",\", '', x)\n",
"    fields = without_name.split(',')\n",
"    return fields"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = raw.map(split_rest)\n",
"df.first()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Inspect the data: drop the 'name' label so vname lines up with the fields of df.\n",
"# Filtering instead of list.remove keeps this cell safe to re-run\n",
"# (remove raises ValueError once 'name' is already gone).\n",
"vname = [v for v in vname if v != 'name']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0th variable:survived distinct value: 2\n",
"1th variable:pclass distinct value: 3\n",
"2th variable:sex distinct value: 2\n",
"3th variable:age distinct value: 89\n",
"4th variable:sibsp distinct value: 7\n",
"5th variable:parch distinct value: 7\n",
"6th variable:ticket distinct value: 681\n",
"7th variable:fare distinct value: 248\n",
"8th variable:cabin distinct value: 148\n",
"9th variable:embarked distinct value: 4\n"
]
}
],
"source": [
"# 取值个数\n",
"m = len(df.first())\n",
"for i in range(m):\n",
" print '%dth variable:%s distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0th variable:survived miss value: 0\n",
"1th variable:pclass miss value: 0\n",
"2th variable:sex miss value: 0\n",
"3th variable:age miss value: 177\n",
"4th variable:sibsp miss value: 0\n",
"5th variable:parch miss value: 0\n",
"6th variable:ticket miss value: 0\n",
"7th variable:fare miss value: 0\n",
"8th variable:cabin miss value: 687\n",
"9th variable:embarked miss value: 2\n"
]
}
],
"source": [
"# 缺失个数\n",
"for i in range(m):\n",
" print '%dth variable:%s miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 处理年龄缺失\n",
"age = df.map(lambda x: x[3])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"title_age = title.zip(age)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def miss_mean(data):\n",
"    \"\"\"Average the entries of data, ignoring the -1 missing-value sentinel.\"\"\"\n",
"    observed = []\n",
"    for value in data:\n",
"        if value != -1:\n",
"            observed.append(value)\n",
"    return np.mean(observed)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Mean observed age per title. The grouped values are iterable themselves, so\n",
"# miss_mean can consume them directly instead of reaching into the undocumented\n",
"# .data attribute of the groupByKey result.\n",
"age_dict = dict(title_age.groupByKey().mapValues(miss_mean).collect())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{u'Capt': 70.0,\n",
" u'Col': 58.0,\n",
" u'Don': 40.0,\n",
" u'Dr': 42.0,\n",
" u'Jonkheer': 38.0,\n",
" u'Lady': 48.0,\n",
" u'Major': 48.5,\n",
" u'Master': 4.5741666666666667,\n",
" u'Miss': 21.773972602739725,\n",
" u'Mlle': 24.0,\n",
" u'Mme': 24.0,\n",
" u'Mr': 32.368090452261306,\n",
" u'Mrs': 35.898148148148145,\n",
" u'Ms': 28.0,\n",
" u'Rev': 43.166666666666664,\n",
" u'Sir': 49.0,\n",
" u'the Countess': 33.0}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"age_dict"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def age_func((title,age)):\n",
"    \"\"\"Replace the -1 missing-age sentinel with age_dict[title]; pass other pairs through.\"\"\"\n",
"    if age != -1:\n",
"        return (title, age)\n",
"    return (title, age_dict[title])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[22.0, 38.0, 26.0, 35.0]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"title_age = title_age.map(age_func)\n",
"age_imputed = title_age.values()\n",
"age_imputed.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 处理 embarked缺失\n",
"df.map(lambda record: record[9]).countByValue()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def embarked_func(record):\n",
"    \"\"\"Return field 9 of record, substituting u'S' when that field is empty.\"\"\"\n",
"    port = record[9]\n",
"    return port if port != '' else u'S'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"embarked= df.map(embarked_func)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# 将四个类别变量转为0-1二元变量"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n",
"title_dict"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def create_vector(term, term_dict):\n",
"    \"\"\"One-hot encode term as a list of len(term_dict) ints, with a 1 at index term_dict[term].\"\"\"\n",
"    encoded = [0 for _ in term_dict]\n",
"    encoded[term_dict[term]] = 1\n",
"    return encoded"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1, 0, 0, 0]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"create_vector(u'Master',title_dict)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n",
"title_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{u'1': 0, u'2': 2, u'3': 1}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n",
"pclass_dict"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n",
"pclass_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{u'C': 2, u'Q': 0, u'S': 1}"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n",
"embarked_dict"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n",
"embarked_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [0, 1, 0, 7.25]),\n",
" (1, [1, 1, 0, 71.2833]),\n",
" (2, [1, 0, 0, 7.925]),\n",
" (3, [1, 1, 0, 53.1])]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 合并数据\n",
"restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n",
"restdf.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [0, 0, 0, 1, 0]),\n",
" (1, [0, 0, 0, 0, 1]),\n",
" (2, [1, 0, 0, 0, 0]),\n",
" (3, [0, 0, 0, 0, 1])]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
"title_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
"pclass_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
"embarked_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [1]), (1, [0]), (2, [0]), (3, [0])]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
"gender_ind.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
"age_imputed.take(4)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Stitch the per-row feature lists together by row index.\n",
"# join() always yields (left, right), so the column order is fixed: rest (label first),\n",
"# embarked, age, gender, title, pclass. List concatenation is not commutative, so it\n",
"# cannot be used safely as a reduceByKey function - the label could leave position 0.\n",
"concat_pair = lambda pair: pair[0] + pair[1]\n",
"finaldf = restdf\n",
"for feature_rdd in [embarked_ind, age_imputed, gender_ind, title_ind, pclass_ind]:\n",
"    finaldf = finaldf.join(feature_rdd).mapValues(concat_pair)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
" (384,\n",
" [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
" (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n",
" (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"finaldf.take(4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Prepare the data in the format required for modelling\n",
"from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
"from pyspark.mllib.regression import LabeledPoint\n",
"def parsePoint(line):\n",
"    \"\"\"Build a LabeledPoint from a (key, values) pair: values[0] is the label, values[1:] the features.\"\"\"\n",
"    values = line[1]\n",
"    return LabeledPoint(values[0], values[1:])"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"modeldata = finaldf.map(parsePoint)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"modeldata.first()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Split the data; a fixed seed makes the train/test partition reproducible across runs\n",
"train, test = modeldata.randomSplit([0.75,0.25], seed=42)\n",
"# Fit the model\n",
"model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test Error = 0.308056872038\n"
]
}
],
"source": [
"# Evaluate on the held-out split: the error is computed on `test`, so it is reported as test error\n",
"labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n",
"testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())\n",
"print(\"Test Error = \" + str(testErr))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment