Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
How to deal with non-trivial missing values when using pandas read_csv
import pandas as pd
import numpy as np
import time
# Location of the data file that every loader in this script passes to pd.read_csv.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
# Column labels handed to read_csv via ``names=``. manual_convert treats every
# entry except the last as a column to convert with pd.to_numeric.
# NOTE(review): presumably the file has no header row of its own -- confirm.
names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
def manual_convert():
    """Load the dataset, then convert "?" placeholders to NaN by hand.

    Reads the CSV at the module-level ``url`` with the module-level ``names``
    as column labels, replaces every "?" cell with NaN, and converts every
    column except the last entry of ``names`` with ``pd.to_numeric``.

    The resulting frame is discarded; this function exists to be timed.
    """
    df = pd.read_csv(url, names=names)
    # ``np.nan`` is the canonical spelling and works on NumPy 1.x and 2.x;
    # the upper-case alias used previously is not portable.
    df = df.replace('?', np.nan)
    feature_columns = names[:-1]
    # Assign whole columns instead of ``df.loc[:, cols] = ...``: recent pandas
    # writes ``.loc`` assignments into the existing object-dtype columns, so
    # the numeric dtypes produced by ``pd.to_numeric`` would be thrown away.
    df[feature_columns] = df[feature_columns].apply(pd.to_numeric)
def use_na_values():
    """Load the dataset, letting ``read_csv`` treat "?" as a missing value.

    Reads the CSV at the module-level ``url`` with the module-level ``names``
    as column labels and ``na_values=["?"]``. The frame is discarded; this
    function exists to be timed.
    """
    missing_markers = ["?"]
    pd.read_csv(url, names=names, na_values=missing_markers)
def use_converters():
    """Load the dataset, mapping "?" to NaN with a per-column converter.

    Reads the CSV at the module-level ``url`` with the module-level ``names``
    as column labels. Only the "BI-RADS" column gets a converter: a cell equal
    to "?" becomes NaN and any other cell is passed through unchanged.

    The frame is discarded; this function exists to be timed.
    """
    pd.read_csv(
        url,
        names=names,
        # ``np.nan`` is the canonical spelling and works on NumPy 1.x and 2.x;
        # the upper-case alias used previously is not portable.
        converters={"BI-RADS": lambda x: x if x != "?" else np.nan},
    )
def repeat(func, n=10):
    """Return the mean duration, in seconds, of ``n`` calls to ``func``.

    Args:
        func: Zero-argument callable to time; its return value is ignored.
        n: Number of timed calls. Must be at least 1.

    Returns:
        The arithmetic mean of the per-call durations, in seconds.

    Raises:
        ValueError: If ``n`` is less than 1 (there would be nothing to
            average).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got %r" % (n,))
    total = 0.0
    for _ in range(n):
        # perf_counter is the high-resolution clock intended for measuring
        # short durations and never runs backwards, unlike time.time().
        start = time.perf_counter()
        func()
        total += time.perf_counter() - start
    return total / n
# Number of timed runs per loading strategy.
n = 100
# Time each strategy in turn and print its label followed by the mean seconds
# per call reported by repeat().
for label, loader in (
    ("manual_convert", manual_convert),
    ("use_na_values", use_na_values),
    ("use_converters", use_converters),
):
    print(label, repeat(loader, n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment