Skip to content

Instantly share code, notes, and snippets.

Created February 19, 2018 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/477127695902119da620ea815117f96d to your computer and use it in GitHub Desktop.
Save anonymous/477127695902119da620ea815117f96d to your computer and use it in GitHub Desktop.
numpy and pandas example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import pandas as pd # Import, manipulate, export data (DataFrames)\nimport numpy as np # Mathematical and matrix operations\nimport os # Set working directory",
"execution_count": 27,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Working directory\nos.chdir('/Users/bernardo/Dropbox (Personal)/Documentos/Python/Data Mining/')",
"execution_count": 28,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Import CSV or Excel file"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "df = pd.read_csv(\"05. Classification Concepts/5052_05_Code/anes_dataset.csv\")\n#df = pd.read_excel(path)\ndf.head(3)",
"execution_count": 29,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 29,
"data": {
"text/plain": " popul TVnews selfLR ClinLR DoleLR PID age educ income vote\n0 0 7 7 1 6 6 36 3 1 1\n1 190 1 3 3 5 1 20 4 1 0\n2 31 7 2 2 6 1 24 6 1 0",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>popul</th>\n <th>TVnews</th>\n <th>selfLR</th>\n <th>ClinLR</th>\n <th>DoleLR</th>\n <th>PID</th>\n <th>age</th>\n <th>educ</th>\n <th>income</th>\n <th>vote</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>7</td>\n <td>7</td>\n <td>1</td>\n <td>6</td>\n <td>6</td>\n <td>36</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>190</td>\n <td>1</td>\n <td>3</td>\n <td>3</td>\n <td>5</td>\n <td>1</td>\n <td>20</td>\n <td>4</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31</td>\n <td>7</td>\n <td>2</td>\n <td>2</td>\n <td>6</td>\n <td>1</td>\n <td>24</td>\n <td>6</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Matrix and Dictionary manual creation"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Create a manual dictionary\nd1 = {'Nombe': pd.Series(['Tomas','Jaime','Ricardo','Victor','Esteban','Susana','Jorge','Lili','David','Liliana','Beto','JJ']),\n 'Edad': pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),\n 'Rating': pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}",
"execution_count": 30,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Create a manual matrix with lists\nnames = ['Tomas','Jaime','Ricardo','Victor','Esteban','Susana','Jorge','Lili','David','Liliana','Beto','JJ']\nage = [25,26,25,23,30,29,23,34,40,30,51,46]\nrating = [4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65]\nd2 = list(zip(names, age, rating))\nprint(d2)",
"execution_count": 31,
"outputs": [
{
"output_type": "stream",
"text": "[('Tomas', 25, 4.23), ('Jaime', 26, 3.24), ('Ricardo', 25, 3.98), ('Victor', 23, 2.56), ('Esteban', 30, 3.2), ('Susana', 29, 4.6), ('Jorge', 23, 3.8), ('Lili', 34, 3.78), ('David', 40, 2.98), ('Liliana', 30, 4.8), ('Beto', 51, 4.1), ('JJ', 46, 3.65)]\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Create a sequence with an array\nx = np.arange(1,15,1)\na = np.array([[1, 1, 2], [3, 5, 8], [13, 21, 34]])\nprint(\"Arange:\",x)\nprint(\"Array\",a)",
"execution_count": 46,
"outputs": [
{
"output_type": "stream",
"text": "Arange: [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14]\nArray [[ 1 1 2]\n [ 3 5 8]\n [13 21 34]]\n",
"name": "stdout"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# DataFrames for data manipulation"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Create a panda Data frame with Dictionary or Matrix (they are the same)\ndf = pd.DataFrame(data = d1)\ndf = pd.DataFrame(data = d2, columns = ['Nombre','Edad','Rating'])\ndf",
"execution_count": 33,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 33,
"data": {
"text/plain": " Edad Nombe Rating\n0 25 Tomas 4.23\n1 26 Jaime 3.24\n2 25 Ricardo 3.98\n3 23 Victor 2.56\n4 30 Esteban 3.20\n5 29 Susana 4.60\n6 23 Jorge 3.80\n7 34 Lili 3.78\n8 40 David 2.98\n9 30 Liliana 4.80\n10 51 Beto 4.10\n11 46 JJ 3.65",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Nombe</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>25</td>\n <td>Tomas</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>26</td>\n <td>Jaime</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>25</td>\n <td>Ricardo</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>23</td>\n <td>Victor</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>30</td>\n <td>Esteban</td>\n <td>3.20</td>\n </tr>\n <tr>\n <th>5</th>\n <td>29</td>\n <td>Susana</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>6</th>\n <td>23</td>\n <td>Jorge</td>\n <td>3.80</td>\n </tr>\n <tr>\n <th>7</th>\n <td>34</td>\n <td>Lili</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>8</th>\n <td>40</td>\n <td>David</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>9</th>\n <td>30</td>\n <td>Liliana</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>51</td>\n <td>Beto</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>46</td>\n <td>JJ</td>\n <td>3.65</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Data analysis and manipulation"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "df.info()",
"execution_count": 35,
"outputs": [
{
"output_type": "stream",
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 12 entries, 0 to 11\nData columns (total 3 columns):\nNombre 12 non-null object\nEdad 12 non-null int64\nRating 12 non-null float64\ndtypes: float64(1), int64(1), object(1)\nmemory usage: 368.0+ bytes\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# First rows\ndf.head(5)",
"execution_count": 36,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 36,
"data": {
"text/plain": " Nombre Edad Rating\n0 Tomas 25 4.23\n1 Jaime 26 3.24\n2 Ricardo 25 3.98\n3 Victor 23 2.56\n4 Esteban 30 3.20",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Jaime</td>\n <td>26</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Ricardo</td>\n <td>25</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Victor</td>\n <td>23</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# DataFrame dimensions\nprint(\"Dim:\",df.shape)\nprint(\"Rows:\",len(df.index))\nprint(\"Columns:\",len(df.columns))",
"execution_count": 37,
"outputs": [
{
"output_type": "stream",
"text": "Dim: (12, 3)\nRows: 12\nColumns: 3\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Statistical description on the data frame with 2 decimals\nround(df.describe(),2)",
"execution_count": 38,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 38,
"data": {
"text/plain": " Edad Rating\ncount 12.00 12.00\nmean 31.83 3.74\nstd 9.23 0.66\nmin 23.00 2.56\n25% 25.00 3.23\n50% 29.50 3.79\n75% 35.50 4.13\nmax 51.00 4.80",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>12.00</td>\n <td>12.00</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>31.83</td>\n <td>3.74</td>\n </tr>\n <tr>\n <th>std</th>\n <td>9.23</td>\n <td>0.66</td>\n </tr>\n <tr>\n <th>min</th>\n <td>23.00</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>25.00</td>\n <td>3.23</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>29.50</td>\n <td>3.79</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>35.50</td>\n <td>4.13</td>\n </tr>\n <tr>\n <th>max</th>\n <td>51.00</td>\n <td>4.80</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Mean on all numerical features\ndf.mean()",
"execution_count": 39,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 39,
"data": {
"text/plain": "Edad 31.833333\nRating 3.743333\ndtype: float64"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Max value on a specific column\ndf[\"Edad\"].max()",
"execution_count": 40,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 40,
"data": {
"text/plain": "51"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Arrange columns, ascending, by a specific column and only show top 5\ndf.sort_values(by=\"Edad\", ascending = False).head(5)",
"execution_count": 41,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 41,
"data": {
"text/plain": " Nombre Edad Rating\n10 Beto 51 4.10\n11 JJ 46 3.65\n8 David 40 2.98\n7 Lili 34 3.78\n4 Esteban 30 3.20",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>JJ</td>\n <td>46</td>\n <td>3.65</td>\n </tr>\n <tr>\n <th>8</th>\n <td>David</td>\n <td>40</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>7</th>\n <td>Lili</td>\n <td>34</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Filter rows given a condition\ndf[df.Rating >= 4]",
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 42,
"data": {
"text/plain": " Nombre Edad Rating\n0 Tomas 25 4.23\n5 Susana 29 4.60\n9 Liliana 30 4.80\n10 Beto 51 4.10",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Susana</td>\n <td>29</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>9</th>\n <td>Liliana</td>\n <td>30</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Group, count, and arrange given a column's name\ndf[['Edad']].groupby('Edad')['Edad'].count().sort_values(ascending=False)",
"execution_count": 43,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 43,
"data": {
"text/plain": "Edad\n30 2\n25 2\n23 2\n51 1\n46 1\n40 1\n34 1\n29 1\n26 1\nName: Edad, dtype: int64"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Show all column's names\nlist(df.columns.values)",
"execution_count": 44,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 44,
"data": {
"text/plain": "['Nombre', 'Edad', 'Rating']"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Export to CSV files"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Export to a CSV file on a specific path\ndf.to_csv('02. Python and Packages/example.csv')",
"execution_count": 45,
"outputs": []
},
{
"metadata": {
"trusted": false
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.3",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "b737bf065d2267636c25e888f8beca4b",
"data": {
"description": "numpy and pandas example",
"public": true
}
},
"_draft": {
"nbviewer_url": "https://gist.github.com/b737bf065d2267636c25e888f8beca4b"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment