Skip to content

Instantly share code, notes, and snippets.

@john9631
Last active December 24, 2015 20:49
Show Gist options
  • Save john9631/6861096 to your computer and use it in GitHub Desktop.
Save john9631/6861096 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"CfDA R Course Programming Assignment 2"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"import numpy as np\n",
"import scipy\n",
"\n",
"directory = \"/home/john/Moocs/Computing for Data Analysis JH Coursera/L2/specdata\"\n",
"ID = range(1,333)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def getmonitor(id, directory = \"\", summary = False):\n",
" \n",
" df = pd.read_csv('{0}/{1:03.0f}.csv'.format(directory, int(id)))\n",
" if summary: \n",
" print df.describe()\n",
" return df\n",
"\n",
"# getmonitor(\"1\", directory, True)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 20
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def complete(directory, id=range(1,332)):\n",
" \n",
" data = pd.DataFrame(columns = [\"nobs\"], index = [id])\n",
" for i in id: \n",
" df = getmonitor(id = i, directory = directory)\n",
" data.ix[i] = len(df.dropna())\n",
" return data\n",
"\n",
"print complete(directory, [1,20,31,50]) \n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" nobs\n",
"1 117\n",
"20 124\n",
"31 483\n",
"50 459\n"
]
}
],
"prompt_number": 21
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def corr(directory, threshold = 0): \n",
" # build df then extract indexes of rows > threshold na values\n",
" df = complete(directory)\n",
" filelist = df[df['nobs'] > threshold].index\n",
"\n",
" corrs = [] \n",
" for i in filelist:\n",
" corrs.append(getmonitor(i,directory).corr(method='pearson').ix[0,1])\n",
" return corrs\n",
"\n",
"\n",
"cr = corr(directory,400)\n",
"print cr[0:6], '\\n'\n",
"print pd.DataFrame(cr).describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
"\n",
" 0\n",
"count 127.000000\n",
"mean 0.139686\n",
"std 0.210523\n",
"min -0.176233\n",
"25% -0.031093\n",
"50% 0.100212\n",
"75% 0.268492\n",
"max 0.763129\n"
]
}
],
"prompt_number": 116
},
{
"cell_type": "code",
"collapsed": true,
"input": [
" import os.path\n",
" \n",
" # def corr(directory, threshold = 0): \n",
" # corrs = []\n",
" # for f in files:\n",
" # df = pd.read_csv('{0}/{1}'.format(directory, f))\n",
" # if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold:\n",
" # corrs.append(df.corr(method='pearson').ix[0,1]) \n",
" # return corrs\n",
"\n",
" def corr(directory, threshold = 0):\n",
" return [df.corr(method='pearson').ix[0,1] for df in \n",
" [pd.read_csv('{0}/{1}'.format(directory, f)) for f in files] \n",
" if len(df[np.isfinite(df['sulfate']) & np.isfinite(df['nitrate'])]) > threshold]\n",
"\n",
"cr = corr(directory, 400)\n",
"print cr[0:6], '\\n'\n",
"print pd.DataFrame(cr).describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"[-0.018957540970254896, -0.043897372138784689, -0.06815956229777316, -0.075888144218988859, 0.76312883703629375, -0.15782860340392174] \n",
"\n",
" 0\n",
"count 127.000000\n",
"mean 0.139686\n",
"std 0.210523\n",
"min -0.176233\n",
"25% -0.031093\n",
"50% 0.100212\n",
"75% 0.268492\n",
"max 0.763129\n"
]
}
],
"prompt_number": 115
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment