Skip to content

Instantly share code, notes, and snippets.

@christinebuckler
Created August 10, 2018 03:41
Show Gist options
  • Save christinebuckler/2038c453eb76891a5b6465cf473f7c99 to your computer and use it in GitHub Desktop.
Save christinebuckler/2038c453eb76891a5b6465cf473f7c99 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
"""
cPickle works well and it is very flexible, but if array only data are available
numpy's save features can provide better read speed
"""
import pickle,os
import numpy as np
#######################################
## Basic pickle example
#######################################
results = {'a':1,'b':range(10)}
results_pickle = 'foo.pickle'
if os.path.exists(results_pickle):
## load it
print("...loading pickle")
tmp = open(results_pickle,'rb')
loadedResults = pickle.load(tmp)
tmp.close()
else:
## save it
print("...saving pickle")
tmp = open(results_pickle,'wb')
pickle.dump(results,tmp)
tmp.close()
## clean up and delete pickle file after loading it
## so can save new pickle with updates
print(loadedResults)
os.system("rm %s"%results_pickle)
print("done")
#######################################
## Saving multiple arrays NumPy
#######################################
a = np.arange(10)
b = np.arange(12)
file_name = 'foo.npz'
if os.path.exists(file_name):
npz = np.load(file_name)
a = npz['a']
else:
args = {'a':a,'b':b}
np.savez_compressed(file_name,**args)
a = npz['a']
b = npz['b']
print(a)
print(b)
## clean up and delete after loading file
os.system("rm %s"%file_name)
#######################################
## Pickling Objects
#######################################
class MyModel(object):
def fit(self):
pass
def predict(self):
return random.choice([True, False])
def get_data(datafile):
....
return X, y
# We don't necessarily want to retrain this model every time we want to predict because this could take a long time.
# Instead, we can pickle our model after training like so:
import random
import cPickle as pickle
# Let's look at training a model and then saving the resulting model
X, y = get_data('data/train.json')
model = MyModel()
model.fit(X, y)
with open('model.pkl', 'w') as f:
pickle.dump(model, f)
# Now we can easily load in this pickled model and use it to predict without having to retrain!
# **NOTE**: You can run into issues if packages are updated since the model was pickled.
# e.g. you train your model using a particular version of `sklearn` and then later update it and try and read in the pickled model
# Now we can read this model in without having to retrain!
with open('model.pkl') as f:
model = pickle.load(f)
model.predict(...)
#######################################
## Pickling DataFrames
#######################################
# When working in `pandas` it is extremely easy to pickle our DataFrame to use later on.
# But why would we do this when we could easily just save our DataFrame as a CSV and load that in later?
# The issue is that we may need to do processing every time we read this CSV in such as parsing datetimes into the pandas datetime object.
# Since we are pickling the DataFrame, we no longer have to worry about recomputing that date parsing.
import pandas as pd
import numpy as np
def clean_df(df):
pass
# Read in our DataFrame
df = pd.read_csv('my_awesome_csv.csv')
# Do your data cleaning which is potentially very time intensive
clean_df(df)
# Pickle the resulting DataFrame
df.to_pickle('my_dataframe.pkl')
# We can now read in this pickled DataFrame like so
df = pd.read_pickle('my_dataframe.pkl')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment