Skip to content

Instantly share code, notes, and snippets.

@atemate
Created May 23, 2023 18:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atemate/9810c78d4c85fefdd7c4fa11a59444ea to your computer and use it in GitHub Desktop.
Save atemate/9810c78d4c85fefdd7c4fa11a59444ea to your computer and use it in GitHub Desktop.
Lightweight pd.DataFrame without pandas
# TODO: use np.array and dtypes there
class _MyIndexer:
    """Label-based, ``.loc``-style accessor over a column mapping.

    The wrapped object is only ever used as ``obj[column_name]``, so any
    mapping from column name to an indexable sequence works.
    """

    def __init__(self, obj) -> None:
        self._obj = obj

    def __getitem__(self, idx):
        """Return the column(s) selected by ``idx``.

        Supported index forms:

        * ``loc["col"]`` -- the column stored under ``"col"``.
        * ``loc[["a", "b"]]`` -- a list of columns in the requested order.
        * ``loc[rows, cols]`` -- a list holding ``column[rows]`` for every
          requested column; ``cols`` is a single label or a
          list / ``np.ndarray`` / ``pd.Series`` of labels.

        Raises:
            ValueError: a tuple index does not have exactly two elements.
            NotImplementedError: any other index type (e.g. a bare slice
                or an integer).
        """
        if isinstance(idx, str):
            return self._obj[idx]

        if isinstance(idx, tuple):
            if len(idx) != 2:
                raise ValueError(
                    f"Only 2-dim frames supported, got index for .loc: {idx}"
                )
            rows, columns = idx
            # A single label is promoted to a one-element list so that the
            # two-element form always returns a list of columns.
            if not isinstance(columns, (list, np.ndarray, pd.Series)):
                columns = [columns]
            return [self._obj[c][rows] for c in columns]

        if isinstance(idx, list):
            return [self._obj[c] for c in idx]

        # TODO: a bare slice (row selection over all columns) is not handled.
        raise NotImplementedError(
            f"Not implemented: .loc for index of type {type(idx)}"
        )
class MyDataFrame(dict):  # not collections.UserDict for better performance
    """Minimal column-oriented stand-in for ``pd.DataFrame``.

    The frame *is* a ``dict`` mapping column name -> ``np.ndarray``; all
    regular ``dict`` operations therefore act on whole columns.
    """

    def __init__(self, *args, **kwargs):
        """Build a frame from at most one positional source.

        Args:
            *args: nothing (empty frame), a ``pd.DataFrame``, or a mapping
                of column name -> sequence of values.
            **kwargs: not supported.

        Raises:
            NotImplementedError: keyword arguments or more than one
                positional argument were given.
        """
        if kwargs:
            raise NotImplementedError(
                f"Instantiating {self.__class__.__name__} with kwargs "
                f"not supported, got: {list(kwargs)}"
            )
        if len(args) > 1:
            raise NotImplementedError(
                f"Instantiating {self.__class__.__name__} with more "
                f"than one positional argument not supported, got: {len(args)}"
            )

        dtypes = {}
        # No argument means an empty frame, as with dict() / pd.DataFrame().
        arg = args[0] if args else {}
        if isinstance(arg, pd.DataFrame):
            dtypes = arg.dtypes.to_dict()
            arg = arg.to_dict(orient='list')
        # TODO: accept a list of records (list of dicts) as input.

        columns = {}
        for name, values in arg.items():
            dtype = dtypes.get(name)
            # pandas extension dtypes (category, nullable ints, ...) are not
            # numpy dtypes and np.array() rejects them; let numpy infer.
            if not isinstance(dtype, np.dtype):
                dtype = None
            columns[name] = np.array(values, dtype=dtype)
        dict.__init__(self, columns)

    @property
    def columns(self):
        """Column names as a sorted list."""
        return sorted(self)

    @property
    def dtypes(self):
        """Set of the distinct numpy dtypes found across all columns."""
        return {column.dtype for column in self.values()}

    @property
    def shape(self):
        """``(n_rows, n_columns)``; ``(0, 0)`` for a frame without columns.

        Raises:
            ValueError: the columns do not all have the same length.
        """
        lengths = {name: len(column) for name, column in self.items()}
        distinct = set(lengths.values())
        if len(distinct) > 1:
            raise ValueError(f"Some columns have different lengths: {lengths}")
        n_rows = distinct.pop() if distinct else 0
        return (n_rows, len(lengths))

    @property
    def ndim(self):
        """Number of dimensions; always 2."""
        return 2

    @property
    def loc(self):
        """Label-based accessor, see ``_MyIndexer.__getitem__``.

        Built on access instead of being stored on the instance: a stored
        indexer forms a reference cycle with the frame and would keep
        pointing at the original object after ``copy.copy()``.
        """
        return _MyIndexer(self)

    def iloc(self, *args, **kwargs):
        """Positional indexing; not implemented."""
        raise NotImplementedError

    # TODO: drop(columns=..., inplace=False) is not implemented yet.

    def to_df(self):
        """Convert to a ``pd.DataFrame`` with one column per key."""
        # Hand pandas a plain dict so construction does not depend on how it
        # treats dict subclasses.
        return pd.DataFrame(dict(self))

    def __array__(self, dtype=None, copy=None):
        """Conversion to a single ndarray is not supported.

        The optional parameters exist so that the call always ends in
        ``NotImplementedError`` regardless of which of them are passed.
        """
        raise NotImplementedError
# df = pd.DataFrame({'review_date': ['2022-11-12', '2022-11-13'], 'keks': [1, 2]})
# mydf = MyDataFrame(df)
# mydf.columns, mydf.dtypes, mydf.shape, mydf.to_df().dtypes
# mydf.loc['review_date'], mydf.loc[['review_date']], mydf.loc[1:, 'review_date']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment