Created
December 31, 2019 18:42
-
-
Save kretes/c3e96567975a727e6e6971f440a8b0e3 to your computer and use it in GitHub Desktop.
Reproducing a problem in xgboost: it is impossible to create a DMatrix from a big sparse matrix (one with more stored entries than the int32 maximum).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Reproduce an xgboost failure when building a DMatrix from a huge CSR matrix.

The matrix is sized so that its number of stored entries is just above the
int32 maximum. Converting it to an ``xgb.DMatrix`` is the step that fails in
the recorded run (``SystemError: bad argument to internal function``).

WARNING: this script materialises a dense array of ``num_rows * num_cols``
ones before converting it to CSR, so it needs a very large amount of RAM
(well over 16 GiB, assuming NumPy's default float64 dtype).
"""
import xgboost as xgb
# Public import path; ``scipy.sparse.csr`` is a deprecated private namespace.
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.datasets import load_svmlight_file

INT32_MAX = np.iinfo(np.int32).max

# 1001 columns times (INT32_MAX // 1000) rows pushes the total entry count
# slightly past INT32_MAX.
num_rows = INT32_MAX // 1000
num_cols = 1001

# How many entries the matrix has beyond what an int32 can count.
more_than_int32_count = num_rows * num_cols - INT32_MAX
print(more_than_int32_count)

# Every entry is non-zero, so the CSR form stores num_rows * num_cols values.
data = np.ones((num_rows, num_cols))
csr = csr_matrix(data)

# Show the index dtype SciPy selected for this matrix.
print(csr.indices.dtype)

y = np.ones((num_rows,))

# This call is the one that raises in the recorded traceback: xgboost's
# c_array helper ends up unpacking every index as a positional argument.
ds = xgb.DMatrix(csr, label=y)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2146836 | |
dtype('int64') | |
--------------------------------------------------------------------------- | |
SystemError Traceback (most recent call last) | |
<ipython-input-10-c4eb335ee525> in <module> | |
----> 1 ds = xgb.DMatrix(csr, y) | |
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in __init__(self, data, label, missing, weight, silent, feature_names, feature_types, nthread) | |
398 self.handle = handle | |
399 elif isinstance(data, scipy.sparse.csr_matrix): | |
--> 400 self._init_from_csr(data) | |
401 elif isinstance(data, scipy.sparse.csc_matrix): | |
402 self._init_from_csc(data) | |
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in _init_from_csr(self, csr) | |
435 handle = ctypes.c_void_p() | |
436 _check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr), | |
--> 437 c_array(ctypes.c_uint, csr.indices), | |
438 c_array(ctypes.c_float, csr.data), | |
439 ctypes.c_size_t(len(csr.indptr)), | |
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in c_array(ctype, values) | |
215 if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype): | |
216 return (ctype * len(values)).from_buffer_copy(values) | |
--> 217 return (ctype * len(values))(*values) | |
218 | |
219 | |
SystemError: ../Objects/tupleobject.c:71: bad argument to internal function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment