Skip to content

Instantly share code, notes, and snippets.

@abhirk
Last active December 11, 2015 16:49
Show Gist options
  • Select an option

  • Save abhirk/4630645 to your computer and use it in GitHub Desktop.

Select an option

Save abhirk/4630645 to your computer and use it in GitHub Desktop.
Classifier object corrupted when compressed with joblib.
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
#load data vectors (vectorized with Tfidf) and target array
def train(documents,target):
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2),
smooth_idf=True, sublinear_tf=True, max_df=0.5, token_pattern=ur'\b(?!\d)\w\w+\b', use_idf=False)
data_vectors = vectorizer.fit_transform(documents)
joblib.dump("vectorizer.joblib", compress=9)
joblib.dump("data_vectors.joblib", compress=9)
joblib.dump("target.joblib", compress=9)
#all the vectors above load correctly, can be used with uncompressed classifier
#Fit using SGD
clf = SGDClassifier(loss="log", n_iter=35, alpha=0.00001)
clf.fit(data_vectors, target)
joblib.dump(clf, "compressedclassifier.joblib", compress=6)
from sklearn.externals import joblib
# Load the persisted vectorizer and the compressed classifier.
vectorizer = joblib.load("vectorizer.joblib")
clf = joblib.load("compressedclassifier.joblib")

# Vectorize the document to predict.
with open("topredict.txt", "rb") as f:
    doc = f.read()

# transform() expects an iterable of documents. A bare string would be
# iterated character by character, so wrap the single document in a list.
document_vector = vectorizer.transform([doc])
prediction = clf.predict(document_vector)
# process further using the category determined
$ ipython
Python 2.6.6 (r266:84292, Sep 11 2012, 08:34:23)
Type "copyright", "credits" or "license" for more information.
IPython 0.13 -- An enhanced Interactive Python.
? -> Introduction and overview of IPython's features.
%quickref -> Quick reference.
help -> Python's own help system.
object? -> Details about 'object', use 'object??' for extra details.
In [1]: import sklearn
In [2]: sklearn.__version__
Out[2]: '0.14-git'
In [3]: from sklearn.externals import joblib
In [4]: clf=joblib.load("compressedclassifier.joblib")
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-5-ad6b23335871> in <module>()
----> 1 clf=joblib.load("compressedclassifier.joblib")
/home/n7/newenv/lib/python2.6/site-
packages/sklearn/externals/joblib/numpy_pickle.pyc in load(filename, mmap_mode)
422
423 try:
--> 424 obj = unpickler.load()
425 finally:
426 if hasattr(unpickler, 'file_handle'):
/usr/lib64/python2.6/pickle.pyc in load(self)
856 while 1:
857 key = read(1)
--> 858 dispatch[key](self)
859 except _Stop, stopinst:
860 return stopinst.value
/home/n7/newenv/lib/python2.6/site-
packages/sklearn/externals/joblib/numpy_pickle.pyc in load_build(self)
291 "but numpy didn't import correctly")
292 nd_array_wrapper = self.stack.pop()
--> 293 array = nd_array_wrapper.read(self)
294 self.stack.append(array)
295
/home/n7/newenv/lib/python2.6/site-
packages/sklearn/externals/joblib/numpy_pickle.pyc in read(self, unpickler)
157 filename = os.path.join(unpickler._dirname, self.filename)
158 array =
unpickler.np.core.multiarray._reconstruct(*self.init_args)
--> 159 data = read_zfile(open(filename, 'rb'))
160 state = self.state + (data,)
161 array.__setstate__(state)
/home/n7/newenv/lib/python2.6/site-
packages/sklearn/externals/joblib/numpy_pickle.pyc in read_zfile(file_handle)
69 assert len(data) == length, (
70 "Incorrect data length while decompressing %s."
---> 71 "The file could be corrupted." % file_handle)
72 return data
73
AssertionError: Incorrect data length while decompressing <open file
'compressedclassifier.joblib_01.npy.z', mode 'rb' at 0x2d5b5d0>.The file could
be corrupted.
In [5]:
Vectorizer:
TfidfVectorizer(analyzer=word, binary=False, charset=utf-8,
charset_error=strict, dtype=<type 'long'>, input=content,
lowercase=True, max_df=0.5, max_features=None, max_n=None,
min_df=2, min_n=None, ngram_range=(1, 2), norm=l2,
preprocessor=None, smooth_idf=True, stop_words=english,
strip_accents=None, sublinear_tf=True,
token_pattern=\b(?!\d)\w\w+\b, tokenizer=None, use_idf=False,
vocabulary=None).
Number of samples in training data: 12440
Number of features: 484762
Classifier:
SGDClassifier(alpha=1e-05, class_weight=None, epsilon=0.1, eta0=0.0,
fit_intercept=True, l1_ratio=0.15, learning_rate=optimal, loss=log,
n_iter=35, n_jobs=1, penalty=l2, power_t=0.5, random_state=None,
rho=None, shuffle=False, verbose=0, warm_start=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment