# mzaradzki/pump_hidden_null_values.py

## pump_hidden_null_values.py
for col in numerical_columns:  # inspect the dominant values of every numerical column
    # Relative frequency of each distinct value, most frequent first.
    shares = dfX[col].value_counts(normalize=True, sort=True, ascending=False)
    # Walk the (value, share) pairs of the five leading entries.
    for value, share in shares.head(5).items():
        # Share expressed as a percentage, truncated (not rounded) to one decimal.
        pct = int(share * 1000) / 10.
        print(col, value, pct)

# Output :
# amount_tsh 0.0    70.0 # 70% of the records are 0
# amount_tsh 500.0  5.2
# amount_tsh 50.0   4.1
# amount_tsh 1000.0 2.5
# amount_tsh 20.0   2.4

# population 0      36.0 # 36% of the records are 0
# population 1      11.8 # 12% of the records are 1
# population 200    3.2
# population 150    3.1
# population 250    2.8
	for col in numerical_columns: # Check frequency of most common values
		# Relative frequency of every distinct value in the column, sorted so the
		# most frequent value comes first.
		cs = dfX[col].value_counts(normalize=True, sort=True, ascending=False)
		# Only the five most frequent values are reported.
		for k in cs.keys()[0:5]:
			# int(x*1000)/10. turns the fraction into a percentage truncated
			# (not rounded) to one decimal place.
			print( col, k, int(cs[k]*1000)/10. )

	# Output :
	# amount_tsh 0.0 70.0 # 70% of the records are 0
	# amount_tsh 500.0 5.2
	# amount_tsh 50.0 4.1
	# amount_tsh 1000.0 2.5
	# amount_tsh 20.0 2.4

	# population 0 36.0 # 36% of the records are 0
	# population 1 11.8 # 12% of the records are 1
	# population 200 3.2
	# population 150 3.1
	# population 250 2.8