# atemate/light_data_frame.py

## light_data_frame.py
# TODO: use np.array and dtypes there

import numpy as np
import pandas as pd


class _MyIndexer:
    """``.loc``-style accessor over an object that maps column names to columns.

    The wrapped object only has to support ``obj[column_name]``; when a row
    selector is given, each looked-up column must support ``column[rows]``.
    """

    def __init__(self, obj) -> None:
        self._obj = obj

    def __getitem__(self, idx):
        """Select columns, and optionally rows, from the wrapped object.

        Supported index forms:

        * ``"col"`` -- return that column exactly as stored.
        * ``["a", "b"]`` -- return a list with one column per name.
        * ``(rows, cols)`` -- apply ``rows`` to every selected column and
          return a list with one entry per column. ``cols`` may be a list,
          ``np.ndarray`` or ``pd.Series`` of names; anything else is treated
          as a single column name (the result is still a list).

        Raises:
            ValueError: if a tuple index does not have exactly two items.
            NotImplementedError: for any other index type (e.g. a bare slice).
        """
        if isinstance(idx, str):
            return self._obj[idx]

        if isinstance(idx, tuple):
            if len(idx) != 2:
                raise ValueError(f"Only 2-dim frames supported, got index for .loc: {idx}")
            rows, columns = idx
            if not isinstance(columns, (list, np.ndarray, pd.Series)):
                # A single column label: normalise to a one-element list.
                columns = [columns]
            return [self._obj[c][rows] for c in columns]

        if isinstance(idx, list):
            return [self._obj[c] for c in idx]

        # if isinstance(indices, slice):
        raise NotImplementedError(f"Not implemented: .loc for index of type {type(idx)}")


class MyDataFrame(dict):  # not collections.UserDict for better performance
    """Minimal column-oriented frame: a ``dict`` of column name -> ``np.ndarray``.

    Every column passed to the constructor is converted with ``np.array``.
    Values assigned later through the plain ``dict`` API are stored as given.
    """

    def __init__(self, *args, **kwargs):
        """Build the frame from a mapping of columns or from a ``pd.DataFrame``.

        Args:
            *args: at most one positional argument. Either an object with
                ``.items()`` yielding ``(column name, values)`` pairs, or a
                ``pd.DataFrame`` (its per-column dtypes are passed on to
                ``np.array``). With no argument an empty frame is created.
            **kwargs: not supported.

        Raises:
            NotImplementedError: if keyword arguments or more than one
                positional argument are given.
        """
        if kwargs:
            raise NotImplementedError(
                f"Instantiating {self.__class__.__name__} with kwargs "
                f"not supported, got: {list(kwargs)}"
            )
        if len(args) > 1:
            raise NotImplementedError(
                f"Instantiating {self.__class__.__name__} with more "
                f"than one positional argument not supported, got: {len(args)}"
            )

        dtypes = {}
        # No positional argument means an empty frame, mirroring ``dict()``.
        arg = args[0] if args else {}
        if isinstance(arg, pd.DataFrame):
            df = arg
            arg = df.to_dict(orient='list')
            dtypes = df.dtypes.to_dict()
        # elif isinstance(arg, list):
        #     # list of records
        #     arg_list = arg
        #     result = collections.defaultdict(list)
        #     for row in arg_list:
        #         if not isinstance(row, dict):
        #             raise ValueError(f"Must be a dict: {row}")
        #         for k, v in row.items():
        #             result[k].append(v)
        #     result["__dtypes"] = [None] * len(result[k])
        #     arg = result
        # dtype=None (column not taken from a DataFrame) lets numpy infer it.
        arg = {k: np.array(v, dtype=dtypes.get(k)) for k, v in arg.items()}
        dict.__init__(self, arg)

    @property
    def columns(self):
        """Column names as a sorted list (not in insertion order)."""
        return sorted(self)

    @property
    def dtypes(self):
        """Set of the distinct ``dtype`` values found across all columns."""
        return {v.dtype for v in self.values()}

    @property
    def shape(self):
        """Return ``(number of columns, common column length)``.

        An empty frame reports ``(0, 0)`` so the tuple always has ``ndim``
        entries.

        NOTE(review): the order is (columns, rows), the reverse of
        ``pd.DataFrame.shape`` -- confirm callers expect this.

        Raises:
            ValueError: if the columns do not all have the same length.
        """
        lenmap = {k: len(v) for k, v in self.items()}
        lengths = set(lenmap.values())
        if not lengths:
            return (0, 0)
        if len(lengths) > 1:
            raise ValueError(f"Some columns have different lengths: {lenmap}")
        return (len(lenmap), lengths.pop())

    @property
    def ndim(self):
        """Number of dimensions; always 2."""
        return 2

    @property
    def loc(self):
        """Label-based accessor; see ``_MyIndexer.__getitem__`` for the forms.

        The indexer is built on each access instead of being cached on the
        instance, so it always wraps this very object (also for instances
        created without running ``__init__``, such as copies) and no
        frame <-> indexer reference cycle is kept alive.
        """
        return _MyIndexer(self)

    def iloc(self, *args, **kwargs):
        """Positional indexing is not supported."""
        raise NotImplementedError

    # def drop(self, *args, columns, inplace=False, **kwargs):
    #     if args or kwargs:
    #         raise NotImplementedError()

    #     columns = set(columns)
    #     missing = columns - set(self.columns)
    #     if missing:
    #         raise KeyError(f"{list(missing)} not found in axis")
    #     if inplace:
    #         for col in columns:
    #             del self[col]
    #     else:
    #         return MyDataFrame({k: v for k, v in self.items() if k not in columns})

    def to_df(self):
        """Convert to a ``pd.DataFrame`` built from the stored columns."""
        # data = {k: v for k, v in self.items() if k not in self.RESERVED_COLUMNS}
        return pd.DataFrame(self)

    def __array__(self):
        """Conversion to a single numpy array is not supported."""
        raise NotImplementedError


# df = pd.DataFrame({'review_date': ['2022-11-12', '2022-11-13'], 'keks': [1, 2]})
# mydf = MyDataFrame(df)
# mydf.columns, mydf.dtypes, mydf.shape, mydf.to_df().dtypes
# mydf.loc['review_date'], mydf.loc[['review_date']], mydf.loc[1:, 'review_date']
	# TODO: use np.array and dtypes there


	class _MyIndexer:
	    """``.loc``-style accessor over an object that maps column names to columns.

	    The wrapped object only has to support ``obj[column_name]``; when a row
	    selector is given, each looked-up column must support ``column[rows]``.
	    """

	    def __init__(self, obj) -> None:
	        self._obj = obj

	    def __getitem__(self, idx):
	        """Select columns, and optionally rows, from the wrapped object.

	        Supported index forms:

	        * ``"col"`` -- return that column exactly as stored.
	        * ``["a", "b"]`` -- return a list with one column per name.
	        * ``(rows, cols)`` -- apply ``rows`` to every selected column and
	          return a list with one entry per column. ``cols`` may be a list,
	          ``np.ndarray`` or ``pd.Series`` of names; anything else is treated
	          as a single column name (the result is still a list).

	        Raises:
	            ValueError: if a tuple index does not have exactly two items.
	            NotImplementedError: for any other index type (e.g. a bare slice).
	        """
	        if isinstance(idx, str):
	            return self._obj[idx]

	        if isinstance(idx, tuple):
	            if len(idx) != 2:
	                raise ValueError(f"Only 2-dim frames supported, got index for .loc: {idx}")
	            rows, columns = idx
	            if not isinstance(columns, (list, np.ndarray, pd.Series)):
	                # A single column label: normalise to a one-element list.
	                columns = [columns]
	            return [self._obj[c][rows] for c in columns]

	        if isinstance(idx, list):
	            return [self._obj[c] for c in idx]

	        # if isinstance(indices, slice):
	        raise NotImplementedError(f"Not implemented: .loc for index of type {type(idx)}")


	class MyDataFrame(dict):  # not collections.UserDict for better performance
	    """Minimal column-oriented frame: a ``dict`` of column name -> ``np.ndarray``.

	    Every column passed to the constructor is converted with ``np.array``.
	    Values assigned later through the plain ``dict`` API are stored as given.
	    """

	    def __init__(self, *args, **kwargs):
	        """Build the frame from a mapping of columns or from a ``pd.DataFrame``.

	        Args:
	            *args: at most one positional argument. Either an object with
	                ``.items()`` yielding ``(column name, values)`` pairs, or a
	                ``pd.DataFrame`` (its per-column dtypes are passed on to
	                ``np.array``). With no argument an empty frame is created.
	            **kwargs: not supported.

	        Raises:
	            NotImplementedError: if keyword arguments or more than one
	                positional argument are given.
	        """
	        if kwargs:
	            raise NotImplementedError(
	                f"Instantiating {self.__class__.__name__} with kwargs "
	                f"not supported, got: {list(kwargs)}"
	            )
	        if len(args) > 1:
	            raise NotImplementedError(
	                f"Instantiating {self.__class__.__name__} with more "
	                f"than one positional argument not supported, got: {len(args)}"
	            )

	        dtypes = {}
	        # No positional argument means an empty frame, mirroring ``dict()``.
	        arg = args[0] if args else {}
	        if isinstance(arg, pd.DataFrame):
	            df = arg
	            arg = df.to_dict(orient='list')
	            dtypes = df.dtypes.to_dict()
	        # elif isinstance(arg, list):
	        #     # list of records
	        #     arg_list = arg
	        #     result = collections.defaultdict(list)
	        #     for row in arg_list:
	        #         if not isinstance(row, dict):
	        #             raise ValueError(f"Must be a dict: {row}")
	        #         for k, v in row.items():
	        #             result[k].append(v)
	        #     result["__dtypes"] = [None] * len(result[k])
	        #     arg = result
	        # dtype=None (column not taken from a DataFrame) lets numpy infer it.
	        arg = {k: np.array(v, dtype=dtypes.get(k)) for k, v in arg.items()}
	        dict.__init__(self, arg)

	    @property
	    def columns(self):
	        """Column names as a sorted list (not in insertion order)."""
	        return sorted(self)

	    @property
	    def dtypes(self):
	        """Set of the distinct ``dtype`` values found across all columns."""
	        return {v.dtype for v in self.values()}

	    @property
	    def shape(self):
	        """Return ``(number of columns, common column length)``.

	        An empty frame reports ``(0, 0)`` so the tuple always has ``ndim``
	        entries.

	        NOTE(review): the order is (columns, rows), the reverse of
	        ``pd.DataFrame.shape`` -- confirm callers expect this.

	        Raises:
	            ValueError: if the columns do not all have the same length.
	        """
	        lenmap = {k: len(v) for k, v in self.items()}
	        lengths = set(lenmap.values())
	        if not lengths:
	            return (0, 0)
	        if len(lengths) > 1:
	            raise ValueError(f"Some columns have different lengths: {lenmap}")
	        return (len(lenmap), lengths.pop())

	    @property
	    def ndim(self):
	        """Number of dimensions; always 2."""
	        return 2

	    @property
	    def loc(self):
	        """Label-based accessor; see ``_MyIndexer.__getitem__`` for the forms.

	        The indexer is built on each access instead of being cached on the
	        instance, so it always wraps this very object (also for instances
	        created without running ``__init__``, such as copies) and no
	        frame <-> indexer reference cycle is kept alive.
	        """
	        return _MyIndexer(self)

	    def iloc(self, *args, **kwargs):
	        """Positional indexing is not supported."""
	        raise NotImplementedError

	    # def drop(self, *args, columns, inplace=False, **kwargs):
	    #     if args or kwargs:
	    #         raise NotImplementedError()

	    #     columns = set(columns)
	    #     missing = columns - set(self.columns)
	    #     if missing:
	    #         raise KeyError(f"{list(missing)} not found in axis")
	    #     if inplace:
	    #         for col in columns:
	    #             del self[col]
	    #     else:
	    #         return MyDataFrame({k: v for k, v in self.items() if k not in columns})

	    def to_df(self):
	        """Convert to a ``pd.DataFrame`` built from the stored columns."""
	        # data = {k: v for k, v in self.items() if k not in self.RESERVED_COLUMNS}
	        return pd.DataFrame(self)

	    def __array__(self):
	        """Conversion to a single numpy array is not supported."""
	        raise NotImplementedError




	# df = pd.DataFrame({'review_date': ['2022-11-12', '2022-11-13'], 'keks': [1, 2]})
	# mydf = MyDataFrame(df)
	# mydf.columns, mydf.dtypes, mydf.shape, mydf.to_df().dtypes
	# mydf.loc['review_date'], mydf.loc[['review_date']], mydf.loc[1:, 'review_date']