tklein23/gist:7188804

## gistfile1.txt
{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Learning from streamed sparse features"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "*Author (GitHub ID: [tklein23](https://github.com/tklein23))*"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "A simple demonstration of how to train an online learner on streamed sparse features."
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Preparing"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import modshogun\n",
      "# modshogun.get_global_io().set_loglevel(modshogun.MSG_DEBUG & modshogun.MSG_LINE_AND_FILE)\n",
      "\n",
      "from modshogun import StreamingAsciiFile\n",
      "from modshogun import StreamingSparseRealFeatures\n",
      "from modshogun import BinaryLabels"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "StreamingSparseRealFeatures is an alias for StreamingSparseFeatures< float64_t >."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "shogun_data = os.environ['SHOGUN_DATA'] if 'SHOGUN_DATA' in os.environ else '../../../data'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "shogun_data = '.'\n",
      "train_file = shogun_data + \"/train_sparsereal.light\"\n",
      "eval_file = shogun_data + \"/train_sparsereal.light\""
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Loading binary labels from streamed input file:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "filestream = StreamingAsciiFile(eval_file)\n",
      "eval_feats = StreamingSparseRealFeatures(filestream, True, 1024)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "labels_temp = []\n",
      "\n",
      "eval_feats.start_parser()\n",
      "\n",
      "while eval_feats.get_next_example():\n",
      "    labels_temp.append(eval_feats.get_label())\n",
      "    eval_feats.release_example()\n",
      "\n",
      "eval_feats.end_parser()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy\n",
      "labels_true = BinaryLabels(numpy.array(labels_temp))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "labels_true.get_labels()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,\n",
        "        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,\n",
        "        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,\n",
        "        1.,  1.,  1.,  1.,  1.,  1.,  1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Training"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "filestream = StreamingAsciiFile(train_file)\n",
      "train_feats = StreamingSparseRealFeatures(filestream, True, 1024)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from modshogun import OnlineSVMSGD\n",
      "C=1.0\n",
      "svm = OnlineSVMSGD(C, train_feats)\n",
      "%time svm.train()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
        "Wall time: 1.2 ms\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy.linalg\n",
      "numpy.linalg.norm(svm.get_w(), 2)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "325.52298"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Evaluation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "filestream = StreamingAsciiFile(eval_file)\n",
      "eval_feats = StreamingSparseRealFeatures(filestream, True, 1024)\n",
      "%time labels_pred = svm.apply_binary(eval_feats)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
        "Wall time: 1.08 ms\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "labels_pred.get_labels()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
        "       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "labels_true.get_labels() == labels_pred.get_labels()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 13,
       "text": [
        "array([False, False, False, False, False, False, False, False, False,\n",
        "       False, False, False, False, False, False, False, False, False,\n",
        "       False, False, False, False, False,  True, False, False, False,\n",
        "       False, False, False, False, False, False, False, False, False,\n",
        "       False, False, False, False, False, False, False, False, False,\n",
        "       False,  True,  True,  True,  True,  True,  True,  True,  True,\n",
        "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
        "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
        "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
        "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
        "        True,  True], dtype=bool)"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "num_labels = labels_true.get_num_labels()\n",
      "1.0*numpy.count_nonzero(labels_true.get_labels() == labels_pred.get_labels())/num_labels"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 14,
       "text": [
        "0.5108695652173914"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "References"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "- [StreamingAsciiFile](http://www.shogun-toolbox.org/doc/en/current/StreamingAsciiFile_8cpp.html)\n",
      "- [StreamingSparseRealFeatures](http://www.shogun-toolbox.org/doc/en/current/StreamingSparseFeatures_8cpp.html)"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 1,
	"metadata": {},
	"source": [
	"Learning from streamed sparse features"
	]
	},
	{
	"cell_type": "heading",
	"level": 4,
	"metadata": {},
	"source": [
	"Author (GitHub ID: [tklein23](https://github.com/tklein23))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"A simple demonstration of how to train an online learner on streamed sparse features."
	]
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Preparing"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import modshogun\n",
	"# modshogun.get_global_io().set_loglevel(modshogun.MSG_DEBUG & modshogun.MSG_LINE_AND_FILE)\n",
	"\n",
	"from modshogun import StreamingAsciiFile\n",
	"from modshogun import StreamingSparseRealFeatures\n",
	"from modshogun import BinaryLabels"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"StreamingSparseRealFeatures is an alias for StreamingSparseFeatures< float64_t >."
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import os\n",
	"shogun_data = os.environ['SHOGUN_DATA'] if 'SHOGUN_DATA' in os.environ else '../../../data'"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"shogun_data = '.'\n",
	"train_file = shogun_data + \"/train_sparsereal.light\"\n",
	"eval_file = shogun_data + \"/train_sparsereal.light\""
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 3
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Loading binary labels from streamed input file:"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"filestream = StreamingAsciiFile(eval_file)\n",
	"eval_feats = StreamingSparseRealFeatures(filestream, True, 1024)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"labels_temp = []\n",
	"\n",
	"eval_feats.start_parser()\n",
	"\n",
	"while eval_feats.get_next_example():\n",
	" labels_temp.append(eval_feats.get_label())\n",
	" eval_feats.release_example()\n",
	"\n",
	"eval_feats.end_parser()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import numpy\n",
	"labels_true = BinaryLabels(numpy.array(labels_temp))"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"labels_true.get_labels()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 7,
	"text": [
	"array([ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
	" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
	" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
	" 1., 1., 1., 1., 1., 1., 1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])"
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Training"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"filestream = StreamingAsciiFile(train_file)\n",
	"train_feats = StreamingSparseRealFeatures(filestream, True, 1024)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 8
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from modshogun import OnlineSVMSGD\n",
	"C=1.0\n",
	"svm = OnlineSVMSGD(C, train_feats)\n",
	"%time svm.train()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
	"Wall time: 1.2 ms\n"
	]
	},
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 9,
	"text": [
	"True"
	]
	}
	],
	"prompt_number": 9
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import numpy.linalg\n",
	"numpy.linalg.norm(svm.get_w(), 2)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 10,
	"text": [
	"325.52298"
	]
	}
	],
	"prompt_number": 10
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"Evaluation"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"filestream = StreamingAsciiFile(eval_file)\n",
	"eval_feats = StreamingSparseRealFeatures(filestream, True, 1024)\n",
	"%time labels_pred = svm.apply_binary(eval_feats)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n",
	"Wall time: 1.08 ms\n"
	]
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"labels_pred.get_labels()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 12,
	"text": [
	"array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., 1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n",
	" -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])"
	]
	}
	],
	"prompt_number": 12
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"labels_true.get_labels() == labels_pred.get_labels()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 13,
	"text": [
	"array([False, False, False, False, False, False, False, False, False,\n",
	" False, False, False, False, False, False, False, False, False,\n",
	" False, False, False, False, False, True, False, False, False,\n",
	" False, False, False, False, False, False, False, False, False,\n",
	" False, False, False, False, False, False, False, False, False,\n",
	" False, True, True, True, True, True, True, True, True,\n",
	" True, True, True, True, True, True, True, True, True,\n",
	" True, True, True, True, True, True, True, True, True,\n",
	" True, True, True, True, True, True, True, True, True,\n",
	" True, True, True, True, True, True, True, True, True,\n",
	" True, True], dtype=bool)"
	]
	}
	],
	"prompt_number": 13
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"num_labels = labels_true.get_num_labels()\n",
	"1.0*numpy.count_nonzero(labels_true.get_labels() == labels_pred.get_labels())/num_labels"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"metadata": {},
	"output_type": "pyout",
	"prompt_number": 14,
	"text": [
	"0.5108695652173914"
	]
	}
	],
	"prompt_number": 14
	},
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"References"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- [StreamingAsciiFile](http://www.shogun-toolbox.org/doc/en/current/StreamingAsciiFile_8cpp.html)\n",
	"- [StreamingSparseRealFeatures](http://www.shogun-toolbox.org/doc/en/current/StreamingSparseFeatures_8cpp.html)"
	]
	}
	],
	"metadata": {}
	}
	]
	}