# gregglind/dataframe_list_of_dicts.py

## dataframe_list_of_dicts.py
"""
Sketch of proposed behaviour... make 'list of dicts'
create a (potentially) 'ragged' array, with autoguessed column names,
and sensible default values, where the keys don't match.

Current behaviour

In [215]: pandas.DataFrame([dict(a=1),dict(a=2)],columns=['a'])
Out[215]:
   a
0  {'a': 1}
1  {'a': 2}

(I happen to find this very surprising/useless behaviour!)

(one) Proposed behaviour...

#

print DataFrame2([dict(a=1,c=1,d=True),dict(b=2,c='abc')])
   a    c    d
0  1    1    True
1  NaN  abc  NaN


Proposed code follows...


This is entirely a straw-man implementation.  A real one might affect the .pyx files,
and should be reviewed for sensible-ness.

Grossnesses:

* default_iget is *foul!*
* adding another potential arg for the 'default' value...
* is this *guessing*?  If so, is it unpythonic levels of guessing?
* so much gross around itemgetter and single/multiple... ugh!
* this adds a lot of potential interactions with other 'init' forms of DataFrame

Wins:
--------

* this is *super lazy* and even lazier than R data.frame


Extensions
-------------

* should this go to collections.namedtuple as well?
* rather than guessing based on data[0], get the set of all keys over data
  (so nothing will be lost)

"""

import operator
def default_iget(default=None, *fields):
    """
    Build a dict-friendly ``operator.itemgetter`` that tolerates missing keys.

    Parameters:
        default: value substituted for a field when the lookup raises
            ``KeyError``.  It has to be the first argument because
            ``*fields`` swallows every remaining positional argument.
        *fields: the keys to extract, in order.

    Returns:
        A function ``f(thing)`` that *always* returns a list with one entry
        per field -- unlike ``itemgetter``, which returns a bare value for a
        single field and a tuple for several.  When a key is missing,
        ``thing`` must provide ``.get(key, default)``.
    """
    # itemgetter() cannot be built with zero fields, so only create it when
    # there is something to fetch.
    getter = operator.itemgetter(*fields) if fields else None
    single = len(fields) == 1

    def f(thing):
        if getter is None:
            return []
        try:
            found = getter(thing)
        except KeyError:
            # at least one key is absent: fall back to per-key lookups
            return [thing.get(x, default) for x in fields]
        # A one-field itemgetter hands back the bare value, not a 1-tuple,
        # so wrap it rather than iterating over it.
        return [found] if single else list(found)

    f.__doc__ = "itemgetter with default %r for fields %r" % (default, fields)
    f.__name__ = "default_itemgetter"
    return f

import pandas
def DataFrame2(data=None, columns=None, *args, **kwargs):
    """
    Build a ``pandas.DataFrame``, unpacking a list of dicts into rows first.

    Parameters:
        data: anything ``pandas.DataFrame`` accepts.  A non-empty ``list``
            whose first element is a ``dict`` is converted into a list of
            row-lists, one value per column, with ``None`` standing in for
            keys a row lacks.
        columns: column names.  When omitted (or empty) for list-of-dicts
            input, the sorted keys of ``data[0]`` are used, so keys that
            only appear in later rows are dropped.
        *args: extra positional arguments, forwarded as ``index``, ``dtype``
            and ``copy``, in that order.
        **kwargs: forwarded to ``pandas.DataFrame`` unchanged.

    Returns:
        The ``pandas.DataFrame`` built from the (possibly preprocessed) data.

    Raises:
        TypeError: if more than three extra positional arguments are given,
            or one of them is also supplied by keyword.
    """
    # Check the type before the truthiness: ``if data`` on a numpy array or
    # a DataFrame raises "truth value ... is ambiguous".
    if isinstance(data, list) and data and isinstance(data[0], dict):
        # Same reasoning for ``columns``, which may be an array or an Index.
        if columns is None or len(columns) == 0:
            columns = sorted(data[0].keys())
        # NOTE: this makes a full copy of the data.
        f = default_iget(None, *columns)
        data = [f(x) for x in data]

    # pandas.DataFrame's positional order is (data, index, columns, dtype,
    # copy).  data and columns are passed by keyword, so the leftover
    # positionals map onto the remaining names; splatting them positionally
    # would collide with ``data=``.
    extra_names = ("index", "dtype", "copy")
    if len(args) > len(extra_names):
        raise TypeError(
            "DataFrame2() takes at most %d positional arguments (%d given)"
            % (2 + len(extra_names), 2 + len(args)))
    for name, value in zip(extra_names, args):
        if name in kwargs:
            raise TypeError(
                "DataFrame2() got multiple values for argument %r" % name)
        kwargs[name] = value
    return pandas.DataFrame(data=data, columns=columns, **kwargs)


# Demo of the proposed behaviour.  The single-argument call form prints the
# same text on Python 2 and also parses on Python 3.
print(DataFrame2([dict(a=1, c=1, d=True), dict(b=2, c='abc')]))
	"""
	Sketch of proposed behaviour... make 'list of dicts'
	create a (potentially) 'ragged' array, with autoguessed column names,
	and sensible default values, where the keys don't match.

	Current behaviour

	In [215]: pandas.DataFrame([dict(a=1),dict(a=2)],columns=['a'])
	Out[215]:
	   a
	0  {'a': 1}
	1  {'a': 2}

	(I happen to find this very surprising/useless behaviour!)

	(one) Proposed behaviour...

	#

	print DataFrame2([dict(a=1,c=1,d=True),dict(b=2,c='abc')])
	   a    c    d
	0  1    1    True
	1  NaN  abc  NaN


	Proposed code follows...


	This is entirely a straw-man implementation. A real one might affect the .pyx files,
	and should be reviewed for sensible-ness.

	Grossnesses:

	* default_iget is foul!
	* adding another potential arg for the 'default' value...
	* is this guessing? If so, is it unpythonic levels of guessing?
	* so much gross around itemgetter and single/multiple... ugh!
	* this adds a lot of potential interactions with other 'init' forms of DataFrame

	Wins:
	--------

	* this is super lazy and even lazier than R data.frame


	Extensions
	-------------

	* should this go to collections.namedtuple as well?
	* rather than guessing based on data[0], get the set of all keys over data
	(so nothing will be lost)

	"""

	import operator
	def default_iget(default=None, *fields):
		"""
		Build a dict-friendly ``operator.itemgetter`` that tolerates missing keys.

		Parameters:
			default: value substituted for a field when the lookup raises
				``KeyError``.  It has to be the first argument because
				``*fields`` swallows every remaining positional argument.
			*fields: the keys to extract, in order.

		Returns:
			A function ``f(thing)`` that always returns a list with one entry
			per field -- unlike ``itemgetter``, which returns a bare value for
			a single field and a tuple for several.  When a key is missing,
			``thing`` must provide ``.get(key, default)``.
		"""
		# itemgetter() cannot be built with zero fields, so only create it
		# when there is something to fetch.
		getter = operator.itemgetter(*fields) if fields else None
		single = len(fields) == 1

		def f(thing):
			if getter is None:
				return []
			try:
				found = getter(thing)
			except KeyError:
				# at least one key is absent: fall back to per-key lookups
				return [thing.get(x, default) for x in fields]
			# A one-field itemgetter hands back the bare value, not a
			# 1-tuple, so wrap it rather than iterating over it.
			return [found] if single else list(found)

		f.__doc__ = "itemgetter with default %r for fields %r" % (default, fields)
		f.__name__ = "default_itemgetter"
		return f

	import pandas
	def DataFrame2(data=None, columns=None, *args, **kwargs):
		"""
		Build a ``pandas.DataFrame``, unpacking a list of dicts into rows first.

		Parameters:
			data: anything ``pandas.DataFrame`` accepts.  A non-empty ``list``
				whose first element is a ``dict`` is converted into a list of
				row-lists, one value per column, with ``None`` standing in
				for keys a row lacks.
			columns: column names.  When omitted (or empty) for list-of-dicts
				input, the sorted keys of ``data[0]`` are used, so keys that
				only appear in later rows are dropped.
			*args: extra positional arguments, forwarded as ``index``,
				``dtype`` and ``copy``, in that order.
			**kwargs: forwarded to ``pandas.DataFrame`` unchanged.

		Returns:
			The ``pandas.DataFrame`` built from the (possibly preprocessed)
			data.

		Raises:
			TypeError: if more than three extra positional arguments are
				given, or one of them is also supplied by keyword.
		"""
		# Check the type before the truthiness: ``if data`` on a numpy array
		# or a DataFrame raises "truth value ... is ambiguous".
		if isinstance(data, list) and data and isinstance(data[0], dict):
			# Same reasoning for ``columns``, which may be an array or Index.
			if columns is None or len(columns) == 0:
				columns = sorted(data[0].keys())
			# NOTE: this makes a full copy of the data.
			f = default_iget(None, *columns)
			data = [f(x) for x in data]

		# pandas.DataFrame's positional order is (data, index, columns,
		# dtype, copy).  data and columns are passed by keyword, so the
		# leftover positionals map onto the remaining names; splatting them
		# positionally would collide with ``data=``.
		extra_names = ("index", "dtype", "copy")
		if len(args) > len(extra_names):
			raise TypeError(
				"DataFrame2() takes at most %d positional arguments (%d given)"
				% (2 + len(extra_names), 2 + len(args)))
		for name, value in zip(extra_names, args):
			if name in kwargs:
				raise TypeError(
					"DataFrame2() got multiple values for argument %r" % name)
			kwargs[name] = value
		return pandas.DataFrame(data=data, columns=columns, **kwargs)


	# Demo of the proposed behaviour.  The single-argument call form prints
	# the same text on Python 2 and also parses on Python 3.
	print(DataFrame2([dict(a=1, c=1, d=True), dict(b=2, c='abc')]))