Last active
December 11, 2015 16:49
-
-
Save abhirk/4630645 to your computer and use it in GitHub Desktop.
Classifier object corrupted when compressed with joblib.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.externals import joblib | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import SGDClassifier | |
| #load data vectors (vectorized with Tfidf) and target array | |
| def train(documents,target): | |
| vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), | |
| smooth_idf=True, sublinear_tf=True, max_df=0.5, token_pattern=ur'\b(?!\d)\w\w+\b', use_idf=False) | |
| data_vectors = vectorizer.fit_transform(documents) | |
| joblib.dump("vectorizer.joblib", compress=9) | |
| joblib.dump("data_vectors.joblib", compress=9) | |
| joblib.dump("target.joblib", compress=9) | |
| #all the vectors above load correctly, can be used with uncompressed classifier | |
| #Fit using SGD | |
| clf = SGDClassifier(loss="log", n_iter=35, alpha=0.00001) | |
| clf.fit(data_vectors, target) | |
| joblib.dump(clf, "compressedclassifier.joblib", compress=6) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.externals import joblib | |
| #load the vectorizer and classifier | |
| vectorizer = joblib.load("vectorizer.joblib") | |
| clf = joblib.load("compressedclassifier.joblib") | |
| #Vectorize the document to predict | |
| with open("topredict.txt", "rb") as f: | |
| doc=f.read() | |
| document_vector = vectorizer.transform(doc) | |
| prediction = clf.predict(document_vector) | |
| #process further using the category determined |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $ ipython | |
| Python 2.6.6 (r266:84292, Sep 11 2012, 08:34:23) | |
| Type "copyright", "credits" or "license" for more information. | |
| IPython 0.13 -- An enhanced Interactive Python. | |
| ? -> Introduction and overview of IPython's features. | |
| %quickref -> Quick reference. | |
| help -> Python's own help system. | |
| object? -> Details about 'object', use 'object??' for extra details. | |
| In [1]: import sklearn | |
| In [2]: sklearn.__version__ | |
| Out[2]: '0.14-git' | |
| In [3]: from sklearn.externals import joblib | |
| In [4]: clf=joblib.load("compressedclassifier.joblib") | |
| --------------------------------------------------------------------------- | |
| AssertionError Traceback (most recent call last) | |
| <ipython-input-5-ad6b23335871> in <module>() | |
| ----> 1 clf=joblib.load("compressedclassifier.joblib") | |
| /home/n7/newenv/lib/python2.6/site- | |
| packages/sklearn/externals/joblib/numpy_pickle.pyc in load(filename, mmap_mode) | |
| 422 | |
| 423 try: | |
| --> 424 obj = unpickler.load() | |
| 425 finally: | |
| 426 if hasattr(unpickler, 'file_handle'): | |
| /usr/lib64/python2.6/pickle.pyc in load(self) | |
| 856 while 1: | |
| 857 key = read(1) | |
| --> 858 dispatch[key](self) | |
| 859 except _Stop, stopinst: | |
| 860 return stopinst.value | |
| /home/n7/newenv/lib/python2.6/site- | |
| packages/sklearn/externals/joblib/numpy_pickle.pyc in load_build(self) | |
| 291 "but numpy didn't import correctly") | |
| 292 nd_array_wrapper = self.stack.pop() | |
| --> 293 array = nd_array_wrapper.read(self) | |
| 294 self.stack.append(array) | |
| 295 | |
| /home/n7/newenv/lib/python2.6/site- | |
| packages/sklearn/externals/joblib/numpy_pickle.pyc in read(self, unpickler) | |
| 157 filename = os.path.join(unpickler._dirname, self.filename) | |
| 158 array = | |
| unpickler.np.core.multiarray._reconstruct(*self.init_args) | |
| --> 159 data = read_zfile(open(filename, 'rb')) | |
| 160 state = self.state + (data,) | |
| 161 array.__setstate__(state) | |
| /home/n7/newenv/lib/python2.6/site- | |
| packages/sklearn/externals/joblib/numpy_pickle.pyc in read_zfile(file_handle) | |
| 69 assert len(data) == length, ( | |
| 70 "Incorrect data length while decompressing %s." | |
| ---> 71 "The file could be corrupted." % file_handle) | |
| 72 return data | |
| 73 | |
| AssertionError: Incorrect data length while decompressing <open file | |
| 'compressedclassifier.joblib_01.npy.z', mode 'rb' at 0x2d5b5d0>.The file could | |
| be corrupted. | |
| In [5]: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Vectorizer: | |
| TfidfVectorizer(analyzer=word, binary=False, charset=utf-8, | |
| charset_error=strict, dtype=<type 'long'>, input=content, | |
| lowercase=True, max_df=0.5, max_features=None, max_n=None, | |
| min_df=2, min_n=None, ngram_range=(1, 2), norm=l2, | |
| preprocessor=None, smooth_idf=True, stop_words=english, | |
| strip_accents=None, sublinear_tf=True, | |
| token_pattern=\b(?!\d)\w\w+\b, tokenizer=None, use_idf=False, | |
| vocabulary=None). | |
| Number of samples in training data: 12440 | |
| Number of features: 484762 | |
| Classifier: | |
| SGDClassifier(alpha=1e-05, class_weight=None, epsilon=0.1, eta0=0.0, | |
| fit_intercept=True, l1_ratio=0.15, learning_rate=optimal, loss=log, | |
| n_iter=35, n_jobs=1, penalty=l2, power_t=0.5, random_state=None, | |
| rho=None, shuffle=False, verbose=0, warm_start=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment