adyork/get_flat_header.py

## get_flat_header.py
'''
get_flat_header(group_results)
get_flat_header(group_results,combine=True)

* For use with pandas .groupby() and .agg() results
* For when you get back multilevel headers, and you just want one header list .
* Returns list of column names.

if combine=False (default), then it returns the column name as the last name in the list for that column.
      e.g.
      .agg({
          'orig colname':{
              'my name for col':'sum',
              'num_rows': 'count'
              }
       becomes
     ('my name for col','num_rows')


if combine=True then it combines all names in list with an underscore.  This is good if you didn't provide a name for the
for the agg operation.
      e.g.
      .agg('orig colname':['sum','count'])
      becomes column names
      ('orig colname_sum','orig colname_count')

'''
def get_flat_header(group_results,combine=False):
    header_array = group_results.columns.ravel()
    #build new list
    flat_header = list()
    for colnames in list(header_array):
        colnames  = list(filter(None, colnames)) #remove blank ones
        if combine: # if you didn't provide a name for the agg, this is best.
            colname = '_'.join(colnames)
        else: #use last name in list (if you provided a name for the agg, this is what you want.)
            colname = colnames[-1] #  take last element
        flat_header.append(colname)
    return flat_header


'''
Example 1: if you didn't provide names in agg() the use combine=True

group_results = df.groupby(['Group ID'],as_index=False).agg({'n instances':['sum','first','count']})

list(group_results.columns.ravel())
# [('Group ID', ''), ('n instances', 'sum'), ('n instances', 'first'), ('n instances', 'count')]

group_results.columns = get_flat_header(group_results,combine=True)

list(group_results.columns.ravel())
# ['Group ID', 'n instances_sum', 'n instances_first', 'n instances_count']

Example 2: If you provided column names in agg()
group_results = df.groupby(['Group ID'],as_index=False)\
    .agg({
    'n instances':{
        'my name for col':'sum',
        'num_rows': 'count'
        },
    'peptides including PTMs':{
        'num_peps_PTMs': 'nunique',
        'peps_and_PTMs':lambda x :  ';'.join(list(x))
        }
    })

# e.g. you have:

list(group_results.columns.ravel())
#  [('Group ID', ''), ('n instances', 'my name for col'), ('n instances', 'num_rows'), ('peptides including PTMs', 'num_peps_PTMs'), ('peptides including PTMs', 'peps_and_PTMs')]

group_results.columns = get_flat_header(group_results)

list(group_results.columns.ravel())
# ['Group ID', 'my name for col', 'num_rows','num_peps_PTMs','peps_and_PTMs']
'''
	'''
	get_flat_header(group_results)
	get_flat_header(group_results,combine=True)

	* For use with pandas .groupby() and .agg() results
	* For when you get back multilevel headers, and you just want one header list .
	* Returns list of column names.

	if combine=False (default), then it returns the column name as the last name in the list for that column.
	e.g.
	.agg({
	'orig colname':{
	'my name for col':'sum',
	'num_rows': 'count'
	}
	becomes
	('my name for col','num_rows')


	if combine=True then it combines all names in list with an underscore. This is good if you didn't provide a name for the
	for the agg operation.
	e.g.
	.agg('orig colname':['sum','count'])
	becomes column names
	('orig colname_sum','orig colname_count')

	'''
	def get_flat_header(group_results,combine=False):
	header_array = group_results.columns.ravel()
	#build new list
	flat_header = list()
	for colnames in list(header_array):
	colnames = list(filter(None, colnames)) #remove blank ones
	if combine: # if you didn't provide a name for the agg, this is best.
	colname = '_'.join(colnames)
	else: #use last name in list (if you provided a name for the agg, this is what you want.)
	colname = colnames[-1] # take last element
	flat_header.append(colname)
	return flat_header


	'''
	Example 1: if you didn't provide names in agg() the use combine=True

	group_results = df.groupby(['Group ID'],as_index=False).agg({'n instances':['sum','first','count']})

	list(group_results.columns.ravel())
	# [('Group ID', ''), ('n instances', 'sum'), ('n instances', 'first'), ('n instances', 'count')]

	group_results.columns = get_flat_header(group_results,combine=True)

	list(group_results.columns.ravel())
	# ['Group ID', 'n instances_sum', 'n instances_first', 'n instances_count']

	Example 2: If you provided column names in agg()
	group_results = df.groupby(['Group ID'],as_index=False)\
	.agg({
	'n instances':{
	'my name for col':'sum',
	'num_rows': 'count'
	},
	'peptides including PTMs':{
	'num_peps_PTMs': 'nunique',
	'peps_and_PTMs':lambda x : ';'.join(list(x))
	}
	})

	# e.g. you have:

	list(group_results.columns.ravel())
	# [('Group ID', ''), ('n instances', 'my name for col'), ('n instances', 'num_rows'), ('peptides including PTMs', 'num_peps_PTMs'), ('peptides including PTMs', 'peps_and_PTMs')]

	group_results.columns = get_flat_header(group_results)

	list(group_results.columns.ravel())
	# ['Group ID', 'my name for col', 'num_rows','num_peps_PTMs','peps_and_PTMs']
	'''