Skip to content

Instantly share code, notes, and snippets.

View GuidoTournois's full-sized avatar

Guido Tournois GuidoTournois

  • Adyen
  • Amsterdam
View GitHub Profile
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import pandas
from sklearn.linear_model import LogisticRegression
datafile = "data.csv"
chunksize = 100000
models = []

# Stream the CSV in pieces of `chunksize` rows instead of loading it in one
# call, and fit one LogisticRegression per piece.
# The module is imported as `pandas` (no `pd` alias), so it is referenced by
# its full name here.
for chunk in pandas.read_csv(datafile, chunksize=chunksize):
    # A function to clean my data and create my features.
    # NOTE(review): `pre_process_and_feature_engineer` and `features` are not
    # defined in this snippet; they are expected to be provided elsewhere.
    chunk = pre_process_and_feature_engineer(chunk)
    model = LogisticRegression()
    model.fit(chunk[features], chunk['label'])
    # Keep each fitted model; otherwise only the model from the final chunk
    # would still be reachable after the loop.
    models.append(model)
import pandas
import random
filename = "data.csv"

# Calculate number of data rows in file (total line count minus the header).
# The file is opened in a `with` block so the handle is closed as soon as the
# count is done instead of being left open.
with open(filename) as csv_file:
    n = sum(1 for _ in csv_file) - 1

s = n // 10  # sample size of 10% (0 when the file has fewer than 10 data rows)

# Choose the n - s row numbers to drop. Candidates run from 1 to n inclusive
# (hence n + 1 as the range end) so that row 0, the header, is never skipped.
skip = sorted(random.sample(range(1, n + 1), n - s))
df = pandas.read_csv(filename, skiprows=skip)