Created February 19, 2018
numpy and pandas example
import pandas as pd # Import, manipulate, export data (DataFrames)
import numpy as np # Mathematical and matrix operations
import os # Set working directory
# Working directory
# The project root can be supplied through the DATA_MINING_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# not set, the original local path is used.
work_dir = os.environ.get(
    'DATA_MINING_DIR',
    '/Users/bernardo/Dropbox (Personal)/Documentos/Python/Data Mining/',
)
# Every relative path used later in the notebook is resolved from here.
os.chdir(work_dir)
# Load the dataset from a CSV file (path is relative to the working directory)
csv_path = "05. Classification Concepts/5052_05_Code/anes_dataset.csv"
df = pd.read_csv(csv_path)
# For an Excel workbook, pd.read_excel(<path>) can be used instead
# Preview the first three rows
df.head(3)
"data": {
popul TVnews selfLR ClinLR DoleLR PID age educ income vote
0 0 7 7 1 6 6 36 3 1 1
1 190 1 3 3 5 1 20 4 1 0
2 31 7 2 2 6 1 24 6 1 0
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>popul</th>\n <th>TVnews</th>\n <th>selfLR</th>\n <th>ClinLR</th>\n <th>DoleLR</th>\n <th>PID</th>\n <th>age</th>\n <th>educ</th>\n <th>income</th>\n <th>vote</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>7</td>\n <td>7</td>\n <td>1</td>\n <td>6</td>\n <td>6</td>\n <td>36</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>190</td>\n <td>1</td>\n <td>3</td>\n <td>3</td>\n <td>5</td>\n <td>1</td>\n <td>20</td>\n <td>4</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31</td>\n <td>7</td>\n <td>2</td>\n <td>2</td>\n <td>6</td>\n <td>1</td>\n <td>24</td>\n <td>6</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"
# Matrix and Dictionary manual creation
# Create a manual dictionary of pandas Series, one Series per column.
# The name column is keyed 'Nombre' so that it matches the column labels
# used for the list-based DataFrame built from the same values.
d1 = {
    'Nombre': pd.Series(['Tomas','Jaime','Ricardo','Victor','Esteban','Susana','Jorge','Lili','David','Liliana','Beto','JJ']),
    'Edad': pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
    'Rating': pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65]),
}
# Build the same records as plain Python lists, one list per field
names = [
    'Tomas', 'Jaime', 'Ricardo', 'Victor', 'Esteban', 'Susana',
    'Jorge', 'Lili', 'David', 'Liliana', 'Beto', 'JJ',
]
age = [25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]
rating = [4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65]
# Pair the i-th entry of every list into one (name, age, rating) row tuple
d2 = [row for row in zip(names, age, rating)]
print(d2)
[('Tomas', 25, 4.23), ('Jaime', 26, 3.24), ('Ricardo', 25, 3.98), ('Victor', 23, 2.56), ('Esteban', 30, 3.2), ('Susana', 29, 4.6), ('Jorge', 23, 3.8), ('Lili', 34, 3.78), ('David', 40, 2.98), ('Liliana', 30, 4.8), ('Beto', 51, 4.1), ('JJ', 46, 3.65)]
# Integer sequence starting at 1 and stopping before 15 (default step of 1)
x = np.arange(1, 15)
# 3x3 integer matrix built from a nested list, one inner list per row
matrix_rows = [[1, 1, 2], [3, 5, 8], [13, 21, 34]]
a = np.array(matrix_rows)
print("Arange:", x)
print("Array", a)
Arange: [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
Array [[ 1 1 2]
 [ 3 5 8]
 [13 21 34]]
# DataFrames for data manipulation
# Create a pandas DataFrame from either container; both hold the same values.
# The dictionary-based frame gets its own name so it is not discarded by the
# next assignment and can still be inspected.
df_from_dict = pd.DataFrame(data = d1)
# The list of row tuples carries no labels, so column names are passed explicitly.
df = pd.DataFrame(data = d2, columns = ['Nombre','Edad','Rating'])
# Display the frame that the rest of the notebook works with
df
"data": {
Edad Nombe Rating
0 25 Tomas 4.23
1 26 Jaime 3.24
2 25 Ricardo 3.98
3 23 Victor 2.56
4 30 Esteban 3.20
5 29 Susana 4.60
6 23 Jorge 3.80
7 34 Lili 3.78
8 40 David 2.98
9 30 Liliana 4.80
10 51 Beto 4.10
11 46 JJ 3.65
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Nombe</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>25</td>\n <td>Tomas</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>26</td>\n <td>Jaime</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>25</td>\n <td>Ricardo</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>23</td>\n <td>Victor</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>30</td>\n <td>Esteban</td>\n <td>3.20</td>\n </tr>\n <tr>\n <th>5</th>\n <td>29</td>\n <td>Susana</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>6</th>\n <td>23</td>\n <td>Jorge</td>\n <td>3.80</td>\n </tr>\n <tr>\n <th>7</th>\n <td>34</td>\n <td>Lili</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>8</th>\n <td>40</td>\n <td>David</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>9</th>\n <td>30</td>\n <td>Liliana</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>51</td>\n <td>Beto</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>46</td>\n <td>JJ</td>\n <td>3.65</td>\n </tr>\n </tbody>\n</table>\n</div>"
# Data analysis and manipulation
"source": "",
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
Nombre 12 non-null object
Edad 12 non-null int64
Rating 12 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 368.0+ bytes
# Preview the top of the frame (head() returns 5 rows when no count is given)
df.head()
"data": {
Nombre Edad Rating
0 Tomas 25 4.23
1 Jaime 26 3.24
2 Ricardo 25 3.98
3 Victor 23 2.56
4 Esteban 30 3.20
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Jaime</td>\n <td>26</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Ricardo</td>\n <td>25</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Victor</td>\n <td>23</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>"
# DataFrame dimensions: shape is a (row count, column count) tuple
n_rows, n_cols = df.shape
print("Dim:", df.shape)
print("Rows:", n_rows)
print("Columns:", n_cols)
Dim: (12, 3)
Rows: 12
Columns: 3
# Summary statistics of the data frame, rounded to 2 decimals
summary_stats = df.describe()
summary_stats.round(2)
"data": {
Edad Rating
count 12.00 12.00
mean 31.83 3.74
std 9.23 0.66
min 23.00 2.56
25% 25.00 3.23
50% 29.50 3.79
75% 35.50 4.13
max 51.00 4.80
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>12.00</td>\n <td>12.00</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>31.83</td>\n <td>3.74</td>\n </tr>\n <tr>\n <th>std</th>\n <td>9.23</td>\n <td>0.66</td>\n </tr>\n <tr>\n <th>min</th>\n <td>23.00</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>25.00</td>\n <td>3.23</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>29.50</td>\n <td>3.79</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>35.50</td>\n <td>4.13</td>\n </tr>\n <tr>\n <th>max</th>\n <td>51.00</td>\n <td>4.80</td>\n </tr>\n </tbody>\n</table>\n</div>"
# Mean on all numerical features.
# numeric_only=True skips the text column 'Nombre'; without it, pandas 2.0+
# raises a TypeError instead of silently dropping non-numeric columns.
df.mean(numeric_only=True)
"data": {
Edad 31.833333
Rating 3.743333
dtype: float64
# Largest value found in the 'Edad' column
age_column = df["Edad"]
age_column.max()
"data": {
51
# Sort the rows by 'Edad' in descending order and show only the first 5
sorted_by_age = df.sort_values(by="Edad", ascending=False)
sorted_by_age.head(5)
"data": {
Nombre Edad Rating
10 Beto 51 4.10
11 JJ 46 3.65
8 David 40 2.98
7 Lili 34 3.78
4 Esteban 30 3.20
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>JJ</td>\n <td>46</td>\n <td>3.65</td>\n </tr>\n <tr>\n <th>8</th>\n <td>David</td>\n <td>40</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>7</th>\n <td>Lili</td>\n <td>34</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>"
# Keep only the rows whose 'Rating' is at least 4 (boolean-mask selection)
high_rating_mask = df["Rating"] >= 4
df[high_rating_mask]
"data": {
Nombre Edad Rating
0 Tomas 25 4.23
5 Susana 29 4.60
9 Liliana 30 4.80
10 Beto 51 4.10
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Susana</td>\n <td>29</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>9</th>\n <td>Liliana</td>\n <td>30</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n </tbody>\n</table>\n</div>"
# Count how many rows share each 'Edad' value, then list the largest counts first
age_counts = df.groupby('Edad')['Edad'].count()
age_counts.sort_values(ascending=False)
"data": {
Edad
30 2
25 2
23 2
51 1
46 1
40 1
34 1
29 1
26 1
Name: Edad, dtype: int64
# List the names of all columns as a plain Python list
df.columns.tolist()
"data": {
['Nombre', 'Edad', 'Rating']
# Export to CSV files
# Write the DataFrame to a CSV file; the path is relative to the working directory
output_path = '02. Python and Packages/example.csv'
df.to_csv(output_path)
