Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save eblancoh/ca4e56604300889455c279f140f8faca to your computer and use it in GitHub Desktop.
Save eblancoh/ca4e56604300889455c279f140f8faca to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploración de los datos"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cargando las librerías y leyendo el dataset"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cargamos el dataset desde el fichero `.csv`."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"SDSS_data/skyserver.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"cell_style": "split",
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 18 columns):\n",
"objid 10000 non-null float64\n",
"ra 10000 non-null float64\n",
"dec 10000 non-null float64\n",
"u 10000 non-null float64\n",
"g 10000 non-null float64\n",
"r 10000 non-null float64\n",
"i 10000 non-null float64\n",
"z 10000 non-null float64\n",
"run 10000 non-null int64\n",
"rerun 10000 non-null int64\n",
"camcol 10000 non-null int64\n",
"field 10000 non-null int64\n",
"specobjid 10000 non-null float64\n",
"class 10000 non-null object\n",
"redshift 10000 non-null float64\n",
"plate 10000 non-null int64\n",
"mjd 10000 non-null int64\n",
"fiberid 10000 non-null int64\n",
"dtypes: float64(10), int64(7), object(1)\n",
"memory usage: 1.4+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {
"cell_style": "split"
},
"source": [
"Con el método `df.describe()` obtenemos los estadísticos descriptivos (recuento, media, desviación típica, mínimo, cuartiles y máximo) de las columnas numéricas del dataset."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"cell_style": "split",
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>objid</th>\n",
" <th>ra</th>\n",
" <th>dec</th>\n",
" <th>u</th>\n",
" <th>g</th>\n",
" <th>r</th>\n",
" <th>i</th>\n",
" <th>z</th>\n",
" <th>run</th>\n",
" <th>rerun</th>\n",
" <th>camcol</th>\n",
" <th>field</th>\n",
" <th>specobjid</th>\n",
" <th>redshift</th>\n",
" <th>plate</th>\n",
" <th>mjd</th>\n",
" <th>fiberid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.000000e+04</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.0</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>1.000000e+04</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" <td>10000.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.237650e+18</td>\n",
" <td>175.529987</td>\n",
" <td>14.836148</td>\n",
" <td>18.619355</td>\n",
" <td>17.371931</td>\n",
" <td>16.840963</td>\n",
" <td>16.583579</td>\n",
" <td>16.422833</td>\n",
" <td>981.034800</td>\n",
" <td>301.0</td>\n",
" <td>3.648700</td>\n",
" <td>302.380100</td>\n",
" <td>1.645022e+18</td>\n",
" <td>0.143726</td>\n",
" <td>1460.986400</td>\n",
" <td>52943.533300</td>\n",
" <td>353.069400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.577039e+05</td>\n",
" <td>47.783439</td>\n",
" <td>25.212207</td>\n",
" <td>0.828656</td>\n",
" <td>0.945457</td>\n",
" <td>1.067764</td>\n",
" <td>1.141805</td>\n",
" <td>1.203188</td>\n",
" <td>273.305024</td>\n",
" <td>0.0</td>\n",
" <td>1.666183</td>\n",
" <td>162.577763</td>\n",
" <td>2.013998e+18</td>\n",
" <td>0.388774</td>\n",
" <td>1788.778371</td>\n",
" <td>1511.150651</td>\n",
" <td>206.298149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.237650e+18</td>\n",
" <td>8.235100</td>\n",
" <td>-5.382632</td>\n",
" <td>12.988970</td>\n",
" <td>12.799550</td>\n",
" <td>12.431600</td>\n",
" <td>11.947210</td>\n",
" <td>11.610410</td>\n",
" <td>308.000000</td>\n",
" <td>301.0</td>\n",
" <td>1.000000</td>\n",
" <td>11.000000</td>\n",
" <td>2.995780e+17</td>\n",
" <td>-0.004136</td>\n",
" <td>266.000000</td>\n",
" <td>51578.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.237650e+18</td>\n",
" <td>157.370946</td>\n",
" <td>-0.539035</td>\n",
" <td>18.178035</td>\n",
" <td>16.815100</td>\n",
" <td>16.173333</td>\n",
" <td>15.853705</td>\n",
" <td>15.618285</td>\n",
" <td>752.000000</td>\n",
" <td>301.0</td>\n",
" <td>2.000000</td>\n",
" <td>184.000000</td>\n",
" <td>3.389248e+17</td>\n",
" <td>0.000081</td>\n",
" <td>301.000000</td>\n",
" <td>51900.000000</td>\n",
" <td>186.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.237650e+18</td>\n",
" <td>180.394514</td>\n",
" <td>0.404166</td>\n",
" <td>18.853095</td>\n",
" <td>17.495135</td>\n",
" <td>16.858770</td>\n",
" <td>16.554985</td>\n",
" <td>16.389945</td>\n",
" <td>756.000000</td>\n",
" <td>301.0</td>\n",
" <td>4.000000</td>\n",
" <td>299.000000</td>\n",
" <td>4.966580e+17</td>\n",
" <td>0.042591</td>\n",
" <td>441.000000</td>\n",
" <td>51997.000000</td>\n",
" <td>351.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.237650e+18</td>\n",
" <td>201.547279</td>\n",
" <td>35.649397</td>\n",
" <td>19.259232</td>\n",
" <td>18.010145</td>\n",
" <td>17.512675</td>\n",
" <td>17.258550</td>\n",
" <td>17.141447</td>\n",
" <td>1331.000000</td>\n",
" <td>301.0</td>\n",
" <td>5.000000</td>\n",
" <td>414.000000</td>\n",
" <td>2.881300e+18</td>\n",
" <td>0.092579</td>\n",
" <td>2559.000000</td>\n",
" <td>54468.000000</td>\n",
" <td>510.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.237650e+18</td>\n",
" <td>260.884382</td>\n",
" <td>68.542265</td>\n",
" <td>19.599900</td>\n",
" <td>19.918970</td>\n",
" <td>24.802040</td>\n",
" <td>28.179630</td>\n",
" <td>22.833060</td>\n",
" <td>1412.000000</td>\n",
" <td>301.0</td>\n",
" <td>6.000000</td>\n",
" <td>768.000000</td>\n",
" <td>9.468830e+18</td>\n",
" <td>5.353854</td>\n",
" <td>8410.000000</td>\n",
" <td>57481.000000</td>\n",
" <td>1000.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" objid ra dec u g \\\n",
"count 1.000000e+04 10000.000000 10000.000000 10000.000000 10000.000000 \n",
"mean 1.237650e+18 175.529987 14.836148 18.619355 17.371931 \n",
"std 1.577039e+05 47.783439 25.212207 0.828656 0.945457 \n",
"min 1.237650e+18 8.235100 -5.382632 12.988970 12.799550 \n",
"25% 1.237650e+18 157.370946 -0.539035 18.178035 16.815100 \n",
"50% 1.237650e+18 180.394514 0.404166 18.853095 17.495135 \n",
"75% 1.237650e+18 201.547279 35.649397 19.259232 18.010145 \n",
"max 1.237650e+18 260.884382 68.542265 19.599900 19.918970 \n",
"\n",
" r i z run rerun \\\n",
"count 10000.000000 10000.000000 10000.000000 10000.000000 10000.0 \n",
"mean 16.840963 16.583579 16.422833 981.034800 301.0 \n",
"std 1.067764 1.141805 1.203188 273.305024 0.0 \n",
"min 12.431600 11.947210 11.610410 308.000000 301.0 \n",
"25% 16.173333 15.853705 15.618285 752.000000 301.0 \n",
"50% 16.858770 16.554985 16.389945 756.000000 301.0 \n",
"75% 17.512675 17.258550 17.141447 1331.000000 301.0 \n",
"max 24.802040 28.179630 22.833060 1412.000000 301.0 \n",
"\n",
" camcol field specobjid redshift plate \\\n",
"count 10000.000000 10000.000000 1.000000e+04 10000.000000 10000.000000 \n",
"mean 3.648700 302.380100 1.645022e+18 0.143726 1460.986400 \n",
"std 1.666183 162.577763 2.013998e+18 0.388774 1788.778371 \n",
"min 1.000000 11.000000 2.995780e+17 -0.004136 266.000000 \n",
"25% 2.000000 184.000000 3.389248e+17 0.000081 301.000000 \n",
"50% 4.000000 299.000000 4.966580e+17 0.042591 441.000000 \n",
"75% 5.000000 414.000000 2.881300e+18 0.092579 2559.000000 \n",
"max 6.000000 768.000000 9.468830e+18 5.353854 8410.000000 \n",
"\n",
" mjd fiberid \n",
"count 10000.000000 10000.000000 \n",
"mean 52943.533300 353.069400 \n",
"std 1511.150651 206.298149 \n",
"min 51578.000000 1.000000 \n",
"25% 51900.000000 186.750000 \n",
"50% 51997.000000 351.000000 \n",
"75% 54468.000000 510.000000 \n",
"max 57481.000000 1000.000000 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment