john9631/AssignmentWithListComprehensionCorrs

## AssignmentWithListComprehensionCorrs
{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "CfDA R Course Programming Assignment 2"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import numpy as np\n",
      "import scipy\n",
      "\n",
      "directory = \"/home/john/Moocs/Computing for Data Analysis JH Coursera/L2/specdata\"\n",
      "ID = range(1,333)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def getmonitor(id, directory = \"\", summary = False):\n",
      "    \n",
      "    df = pd.read_csv('{0}/{1:03.0f}.csv'.format(directory, int(id)))\n",
      "    if summary: \n",
      "        print df.describe()\n",
      "    return df\n",
      "\n",
      "# getmonitor(\"1\", directory, True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def complete(directory, id=range(1,332)):\n",
      "    \n",
      "    data = pd.DataFrame(columns = [\"nobs\"], index = [id])\n",
      "    for i in id: \n",
      "        df = getmonitor(id = i, directory = directory)\n",
      "        data.ix[i] = len(df.dropna())\n",
      "    return data\n",
      "\n",
      "print complete(directory, [1,20,31,50]) \n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "   nobs\n",
        "1   117\n",
        "20  124\n",
        "31  483\n",
        "50  459\n"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def corr(directory, threshold = 0): \n",
      "    # build df then extract indexes of rows > threshold na values\n",
      "    df = complete(directory)\n",
      "    filelist = df[df['nobs'] > threshold].index\n",
      "\n",
      "    corrs = [] \n",
      "    for i in filelist:\n",
      "        corrs.append(getmonitor(i,directory).corr(method='pearson').ix[0,1])\n",
      "    return corrs\n",
      "\n",
      "\n",
      "cr = corr(directory,400)\n",
      "print cr[0:6], '\\n'\n",
      "print pd.DataFrame(cr).describe()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
        "\n",
        "                0\n",
        "count  127.000000\n",
        "mean     0.139686\n",
        "std      0.210523\n",
        "min     -0.176233\n",
        "25%     -0.031093\n",
        "50%      0.100212\n",
        "75%      0.268492\n",
        "max      0.763129\n"
       ]
      }
     ],
     "prompt_number": 116
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "    import os.path\n",
      "    \n",
      "    # def corr(directory, threshold = 0): \n",
      "    #    corrs = []\n",
      "     #   for f in files:\n",
      "      #      df = pd.read_csv('{0}/{1}'.format(directory, f))\n",
      "       #     if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold:\n",
      "        #        corrs.append(df.corr(method='pearson').ix[0,1])    \n",
      "        #  return corrs\n",
      "\n",
      "    def corr(directory, threshold = 0):\n",
      "        return [df.corr(method='pearson').ix[0,1] for df in \n",
      "                   [pd.read_csv('{0}/{1}'.format(directory, f)) for f in files]  \n",
      "                       if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold]\n",
      "\n",
      "cr = corr(directory, 400)\n",
      "print cr[0:6], '\\n'\n",
      "print pd.DataFrame(cr).describe()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
        "\n",
        "                0\n",
        "count  127.000000\n",
        "mean     0.139686\n",
        "std      0.210523\n",
        "min     -0.176233\n",
        "25%     -0.031093\n",
        "50%      0.100212\n",
        "75%      0.268492\n",
        "max      0.763129\n"
       ]
      }
     ],
     "prompt_number": 115
    }
   ],
   "metadata": {}
  }
 ]
}
	{
	"metadata": {
	"name": ""
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "heading",
	"level": 2,
	"metadata": {},
	"source": [
	"CfDA R Course Programming Assignment 2"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import scipy\n",
	"\n",
	"directory = \"/home/john/Moocs/Computing for Data Analysis JH Coursera/L2/specdata\"\n",
	"ID = range(1,333)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 11
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def getmonitor(id, directory = \"\", summary = False):\n",
	" \n",
	" df = pd.read_csv('{0}/{1:03.0f}.csv'.format(directory, int(id)))\n",
	" if summary: \n",
	" print df.describe()\n",
	" return df\n",
	"\n",
	"# getmonitor(\"1\", directory, True)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 20
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def complete(directory, id=range(1,332)):\n",
	" \n",
	" data = pd.DataFrame(columns = [\"nobs\"], index = [id])\n",
	" for i in id: \n",
	" df = getmonitor(id = i, directory = directory)\n",
	" data.ix[i] = len(df.dropna())\n",
	" return data\n",
	"\n",
	"print complete(directory, [1,20,31,50]) \n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" nobs\n",
	"1 117\n",
	"20 124\n",
	"31 483\n",
	"50 459\n"
	]
	}
	],
	"prompt_number": 21
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def corr(directory, threshold = 0): \n",
	" # build df then extract indexes of rows > threshold na values\n",
	" df = complete(directory)\n",
	" filelist = df[df['nobs'] > threshold].index\n",
	"\n",
	" corrs = [] \n",
	" for i in filelist:\n",
	" corrs.append(getmonitor(i,directory).corr(method='pearson').ix[0,1])\n",
	" return corrs\n",
	"\n",
	"\n",
	"cr = corr(directory,400)\n",
	"print cr[0:6], '\\n'\n",
	"print pd.DataFrame(cr).describe()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
	"\n",
	" 0\n",
	"count 127.000000\n",
	"mean 0.139686\n",
	"std 0.210523\n",
	"min -0.176233\n",
	"25% -0.031093\n",
	"50% 0.100212\n",
	"75% 0.268492\n",
	"max 0.763129\n"
	]
	}
	],
	"prompt_number": 116
	},
	{
	"cell_type": "code",
	"collapsed": true,
	"input": [
	" import os.path\n",
	" \n",
	" # def corr(directory, threshold = 0): \n",
	" # corrs = []\n",
	" # for f in files:\n",
	" # df = pd.read_csv('{0}/{1}'.format(directory, f))\n",
	" # if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold:\n",
	" # corrs.append(df.corr(method='pearson').ix[0,1]) \n",
	" # return corrs\n",
	"\n",
	" def corr(directory, threshold = 0):\n",
	" return [df.corr(method='pearson').ix[0,1] for df in \n",
	" [pd.read_csv('{0}/{1}'.format(directory, f)) for f in files] \n",
	" if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold]\n",
	"\n",
	"cr = corr(directory, 400)\n",
	"print cr[0:6], '\\n'\n",
	"print pd.DataFrame(cr).describe()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
	"\n",
	" 0\n",
	"count 127.000000\n",
	"mean 0.139686\n",
	"std 0.210523\n",
	"min -0.176233\n",
	"25% -0.031093\n",
	"50% 0.100212\n",
	"75% 0.268492\n",
	"max 0.763129\n"
	]
	}
	],
	"prompt_number": 115
	}
	],
	"metadata": {}
	}
	]
	}