Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Created March 30, 2016 19:01
Show Gist options
  • Save TomAugspurger/37097cce6a3368a8dad7cf2c8a9e5e92 to your computer and use it in GitHub Desktop.
Save TomAugspurger/37097cce6a3368a8dad7cf2c8a9e5e92 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%load_ext rpy2.ipython"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%%R\n",
"library(ggplot2)\n",
"\n",
"write.csv(diamonds, 'diamonds.csv')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_csv('diamonds.csv', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53936</th>\n",
" <td>0.72</td>\n",
" <td>Ideal</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>60.8</td>\n",
" <td>57.0</td>\n",
" <td>2757</td>\n",
" <td>5.75</td>\n",
" <td>5.76</td>\n",
" <td>3.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53937</th>\n",
" <td>0.72</td>\n",
" <td>Good</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>63.1</td>\n",
" <td>55.0</td>\n",
" <td>2757</td>\n",
" <td>5.69</td>\n",
" <td>5.75</td>\n",
" <td>3.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53938</th>\n",
" <td>0.70</td>\n",
" <td>Very Good</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>62.8</td>\n",
" <td>60.0</td>\n",
" <td>2757</td>\n",
" <td>5.66</td>\n",
" <td>5.68</td>\n",
" <td>3.56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53939</th>\n",
" <td>0.86</td>\n",
" <td>Premium</td>\n",
" <td>H</td>\n",
" <td>SI2</td>\n",
" <td>61.0</td>\n",
" <td>58.0</td>\n",
" <td>2757</td>\n",
" <td>6.15</td>\n",
" <td>6.12</td>\n",
" <td>3.74</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53940</th>\n",
" <td>0.75</td>\n",
" <td>Ideal</td>\n",
" <td>D</td>\n",
" <td>SI2</td>\n",
" <td>62.2</td>\n",
" <td>55.0</td>\n",
" <td>2757</td>\n",
" <td>5.83</td>\n",
" <td>5.87</td>\n",
" <td>3.64</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>53940 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z\n",
"1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43\n",
"2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31\n",
"3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31\n",
"4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63\n",
"5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75\n",
"... ... ... ... ... ... ... ... ... ... ...\n",
"53936 0.72 Ideal D SI1 60.8 57.0 2757 5.75 5.76 3.50\n",
"53937 0.72 Good D SI1 63.1 55.0 2757 5.69 5.75 3.61\n",
"53938 0.70 Very Good D SI1 62.8 60.0 2757 5.66 5.68 3.56\n",
"53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 6.12 3.74\n",
"53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 5.87 3.64\n",
"\n",
"[53940 rows x 10 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Multiple Aggregations"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">depth</th>\n",
" <th colspan=\"2\" halign=\"left\">carat</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" <th>Average Weight</th>\n",
" <th>+/-</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cut</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Fair</th>\n",
" <td>43.0</td>\n",
" <td>79.0</td>\n",
" <td>1.046137</td>\n",
" <td>0.516404</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Good</th>\n",
" <td>54.3</td>\n",
" <td>67.0</td>\n",
" <td>0.849185</td>\n",
" <td>0.454054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ideal</th>\n",
" <td>43.0</td>\n",
" <td>66.7</td>\n",
" <td>0.702837</td>\n",
" <td>0.432876</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Premium</th>\n",
" <td>58.0</td>\n",
" <td>63.0</td>\n",
" <td>0.891955</td>\n",
" <td>0.515262</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Very Good</th>\n",
" <td>56.8</td>\n",
" <td>64.9</td>\n",
" <td>0.806381</td>\n",
" <td>0.459435</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" depth carat \n",
" min max Average Weight +/-\n",
"cut \n",
"Fair 43.0 79.0 1.046137 0.516404\n",
"Good 54.3 67.0 0.849185 0.454054\n",
"Ideal 43.0 66.7 0.702837 0.432876\n",
"Premium 58.0 63.0 0.891955 0.515262\n",
"Very Good 56.8 64.9 0.806381 0.459435"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(\n",
" df.groupby('cut')\n",
" .agg({'carat': {'Average Weight': 'mean', '+/-': 'std'},\n",
" 'depth': ['min', 'max']})\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Summaries Back in Original"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>carat</th>\n",
" <th>cut</th>\n",
" <th>color</th>\n",
" <th>clarity</th>\n",
" <th>depth</th>\n",
" <th>table</th>\n",
" <th>price</th>\n",
" <th>x</th>\n",
" <th>y</th>\n",
" <th>z</th>\n",
" <th>xy</th>\n",
" <th>xz</th>\n",
" <th>mean_xy</th>\n",
" <th>var_xy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.23</td>\n",
" <td>Ideal</td>\n",
" <td>E</td>\n",
" <td>SI2</td>\n",
" <td>61.5</td>\n",
" <td>55.0</td>\n",
" <td>326</td>\n",
" <td>3.95</td>\n",
" <td>3.98</td>\n",
" <td>2.43</td>\n",
" <td>7.93</td>\n",
" <td>6.38</td>\n",
" <td>5.507451</td>\n",
" <td>4.537332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.21</td>\n",
" <td>Premium</td>\n",
" <td>E</td>\n",
" <td>SI1</td>\n",
" <td>59.8</td>\n",
" <td>61.0</td>\n",
" <td>326</td>\n",
" <td>3.89</td>\n",
" <td>3.84</td>\n",
" <td>2.31</td>\n",
" <td>7.73</td>\n",
" <td>6.20</td>\n",
" <td>5.973887</td>\n",
" <td>5.807050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.23</td>\n",
" <td>Good</td>\n",
" <td>E</td>\n",
" <td>VS1</td>\n",
" <td>56.9</td>\n",
" <td>65.0</td>\n",
" <td>327</td>\n",
" <td>4.05</td>\n",
" <td>4.07</td>\n",
" <td>2.31</td>\n",
" <td>8.12</td>\n",
" <td>6.36</td>\n",
" <td>5.838785</td>\n",
" <td>4.443224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.29</td>\n",
" <td>Premium</td>\n",
" <td>I</td>\n",
" <td>VS2</td>\n",
" <td>62.4</td>\n",
" <td>58.0</td>\n",
" <td>334</td>\n",
" <td>4.20</td>\n",
" <td>4.23</td>\n",
" <td>2.63</td>\n",
" <td>8.43</td>\n",
" <td>6.83</td>\n",
" <td>5.973887</td>\n",
" <td>5.807050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.31</td>\n",
" <td>Good</td>\n",
" <td>J</td>\n",
" <td>SI2</td>\n",
" <td>63.3</td>\n",
" <td>58.0</td>\n",
" <td>335</td>\n",
" <td>4.34</td>\n",
" <td>4.35</td>\n",
" <td>2.75</td>\n",
" <td>8.69</td>\n",
" <td>7.09</td>\n",
" <td>5.838785</td>\n",
" <td>4.443224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53936</th>\n",
" <td>0.72</td>\n",
" <td>Ideal</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>60.8</td>\n",
" <td>57.0</td>\n",
" <td>2757</td>\n",
" <td>5.75</td>\n",
" <td>5.76</td>\n",
" <td>3.50</td>\n",
" <td>11.51</td>\n",
" <td>9.25</td>\n",
" <td>5.507451</td>\n",
" <td>4.537332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53937</th>\n",
" <td>0.72</td>\n",
" <td>Good</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>63.1</td>\n",
" <td>55.0</td>\n",
" <td>2757</td>\n",
" <td>5.69</td>\n",
" <td>5.75</td>\n",
" <td>3.61</td>\n",
" <td>11.44</td>\n",
" <td>9.30</td>\n",
" <td>5.838785</td>\n",
" <td>4.443224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53938</th>\n",
" <td>0.70</td>\n",
" <td>Very Good</td>\n",
" <td>D</td>\n",
" <td>SI1</td>\n",
" <td>62.8</td>\n",
" <td>60.0</td>\n",
" <td>2757</td>\n",
" <td>5.66</td>\n",
" <td>5.68</td>\n",
" <td>3.56</td>\n",
" <td>11.34</td>\n",
" <td>9.22</td>\n",
" <td>5.740696</td>\n",
" <td>4.854494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53939</th>\n",
" <td>0.86</td>\n",
" <td>Premium</td>\n",
" <td>H</td>\n",
" <td>SI2</td>\n",
" <td>61.0</td>\n",
" <td>58.0</td>\n",
" <td>2757</td>\n",
" <td>6.15</td>\n",
" <td>6.12</td>\n",
" <td>3.74</td>\n",
" <td>12.27</td>\n",
" <td>9.89</td>\n",
" <td>5.973887</td>\n",
" <td>5.807050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53940</th>\n",
" <td>0.75</td>\n",
" <td>Ideal</td>\n",
" <td>D</td>\n",
" <td>SI2</td>\n",
" <td>62.2</td>\n",
" <td>55.0</td>\n",
" <td>2757</td>\n",
" <td>5.83</td>\n",
" <td>5.87</td>\n",
" <td>3.64</td>\n",
" <td>11.70</td>\n",
" <td>9.47</td>\n",
" <td>5.507451</td>\n",
" <td>4.537332</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>53940 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" carat cut color clarity depth table price x y z \\\n",
"1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 \n",
"2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 \n",
"3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 \n",
"4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 \n",
"5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 \n",
"... ... ... ... ... ... ... ... ... ... ... \n",
"53936 0.72 Ideal D SI1 60.8 57.0 2757 5.75 5.76 3.50 \n",
"53937 0.72 Good D SI1 63.1 55.0 2757 5.69 5.75 3.61 \n",
"53938 0.70 Very Good D SI1 62.8 60.0 2757 5.66 5.68 3.56 \n",
"53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 6.12 3.74 \n",
"53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 5.87 3.64 \n",
"\n",
" xy xz mean_xy var_xy \n",
"1 7.93 6.38 5.507451 4.537332 \n",
"2 7.73 6.20 5.973887 5.807050 \n",
"3 8.12 6.36 5.838785 4.443224 \n",
"4 8.43 6.83 5.973887 5.807050 \n",
"5 8.69 7.09 5.838785 4.443224 \n",
"... ... ... ... ... \n",
"53936 11.51 9.25 5.507451 4.537332 \n",
"53937 11.44 9.30 5.838785 4.443224 \n",
"53938 11.34 9.22 5.740696 4.854494 \n",
"53939 12.27 9.89 5.973887 5.807050 \n",
"53940 11.70 9.47 5.507451 4.537332 \n",
"\n",
"[53940 rows x 14 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(\n",
" df.assign(xy = lambda d: d.x + d.y, xz = lambda d: d.x + d.z)\n",
" .assign(mean_xy=lambda x: x.groupby('cut').x.transform('mean'),\n",
" var_xy=lambda x: x.groupby('cut').xy.transform('var'))\n",
"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So that's possbile, but maybe not that legible."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment