import pandas as pd
# Read the CSV; every '?' entry is parsed as a missing value (NaN)
df = pd.read_csv('data/adult.csv', na_values='?')

# --- Dataset overview ---
# NOTE: the bare expressions below only show a result in an interactive
# session (notebook / REPL); in a plain script they are evaluated and discarded.

df.head(n=5)  # first five rows as a sample

df.shape  # (number of observations, number of features)
# (32561, 15)

df.dtypes  # dtype of each column
# age          int64
# workclass    object
# fnlwgt       int64
# education    object
# (...)

# Rows that repeat an earlier row
duplicate_mask = df.duplicated(keep='first')
df.loc[duplicate_mask]

# Missing values
missing_mask = df.isna()
missing_mask.sum()  # count per feature
# age          0
# workclass    1836
# fnlwgt       0
# (...)

missing_cells = missing_mask.sum().sum()
missing_cells  # total number of missing cells
round(missing_cells / df.size * 100, 1)  # share of missing cells, in percent