Skip to content

Instantly share code, notes, and snippets.

@kretes
Created December 31, 2019 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kretes/c3e96567975a727e6e6971f440a8b0e3 to your computer and use it in GitHub Desktop.
Save kretes/c3e96567975a727e6e6971f440a8b0e3 to your computer and use it in GitHub Desktop.
Reproducing a problem in xgboost: creating a DMatrix fails for a large sparse matrix with more stored entries than fit in an int32
import xgboost as xgb
from scipy.sparse.csr import csr_matrix
import numpy as np
from sklearn.datasets import load_svmlight_file
# Size the matrix so that rows * cols lands just above the int32 maximum.
int32_max = np.iinfo(np.int32).max
num_rows = int32_max // 1000
num_cols = 1001

# Number of elements beyond what an int32 can count.
more_than_int32_count = num_rows * num_cols - int32_max
print(more_than_int32_count)

# Fully dense array of ones, so every element becomes a stored CSR entry.
data = np.ones(shape=(num_rows, num_cols))
csr = csr_matrix(data)
print(csr.indices.dtype)

# One label per row; building the DMatrix is the step that raises.
y = np.ones(num_rows)
ds = xgb.DMatrix(data=csr, label=y)
2146836
dtype('int64')
---------------------------------------------------------------------------
SystemError Traceback (most recent call last)
<ipython-input-10-c4eb335ee525> in <module>
----> 1 ds = xgb.DMatrix(csr, y)
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in __init__(self, data, label, missing, weight, silent, feature_names, feature_types, nthread)
398 self.handle = handle
399 elif isinstance(data, scipy.sparse.csr_matrix):
--> 400 self._init_from_csr(data)
401 elif isinstance(data, scipy.sparse.csc_matrix):
402 self._init_from_csc(data)
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in _init_from_csr(self, csr)
435 handle = ctypes.c_void_p()
436 _check_call(_LIB.XGDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr),
--> 437 c_array(ctypes.c_uint, csr.indices),
438 c_array(ctypes.c_float, csr.data),
439 ctypes.c_size_t(len(csr.indptr)),
/usr/local/lib/python3.5/dist-packages/xgboost/core.py in c_array(ctype, values)
215 if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype):
216 return (ctype * len(values)).from_buffer_copy(values)
--> 217 return (ctype * len(values))(*values)
218
219
SystemError: ../Objects/tupleobject.c:71: bad argument to internal function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment