# Quick overview of the dataset stored in data/adult.csv: size, dtypes,
# duplicated rows and missing values.
#
# NOTE: the bare expressions below only display something when evaluated
# interactively (REPL / notebook cell); run as a plain script they compute
# their result and discard it. Presumably this snippet is meant to be
# stepped through interactively -- wrap the expressions in print() otherwise.
import pandas as pd

# Load the data; cells containing '?' are parsed as missing values (NaN).
df = pd.read_csv('data/adult.csv', na_values='?')

# Dataset overview
df.head()  # preview a sample
df.shape  # number of observations and features
# sample output:
#   (32561, 15)
df.dtypes  # data type of each feature
# sample output:
#   age           int64
#   workclass     object
#   fnlwgt        int64
#   education     object
#   (...)
df[df.duplicated()]  # rows flagged as duplicates by DataFrame.duplicated()
df.isna().sum()  # missing values per feature
# sample output:
#   age              0
#   workclass     1836
#   fnlwgt           0
#   (...)
df.isna().sum().sum()  # total number of missing cells
# Missing cells as a percentage of all cells (df.size), to one decimal place.
round(df.isna().sum().sum() / df.size * 100, 1)