joefutrelle/wide_to_long.py

## wide_to_long.py
def wide_to_long(df, wide_cols_list, value_cols, long_col, long_labels):
    """converts selected columns from wide to long format. params:

    - df: the input dataframe
    - wide_cols_list: for each set of wide columns, a list of their names
    - value_cols: for each set of wide columns, the name of the long column to hold the values
    - long_col: the name of the column to indicate which set of wide columns the value comes from
    - long_labels: for each set of wide columns, what to call it in the long_col values.

    For example if I have the following DataFrame:

    +-----------+-----+-----+-----+-----+
    | other_col | x_a | x_b | y_a | y_b |
    +-----------+-----+-----+-----+-----+
    | something |  1  |  2  |  10 |  20 |
    +-----------+-----+-----+-----+-----+

    And I pass these arguments:

    wide_cols_list = [['x_a','y_a'],['x_b','y_b']]
    value_cols = ['x','y']
    long_col = 'replicate'
    long_labels = ['a','b']

    It'll generate this dataframe:

    +-----------+---+----+-----------+
    | other_col | x |  y | replicate |
    +-----------+---+----+-----------+
    | something | 1 | 10 |     a     |
    | something | 2 | 20 |     b     |
    +-----------+---+----+-----------+
    """
    assert len(wide_cols_list) == len(long_labels)
    for w in wide_cols_list:
        assert len(w) == len(value_cols)
    exclude_cols = []
    for w in wide_cols_list:
        exclude_cols = exclude_cols + w
    common_cols = [c for c in df.columns if c not in exclude_cols]
    dfs = []
    for wide_cols, long_label in zip(wide_cols_list, long_labels):
        sdf = df[common_cols + wide_cols].copy()
        sdf[long_col] =  long_label
        sdf.columns = common_cols + value_cols + [long_col]
        dfs.append(sdf)
    return pd.concat(dfs).sort_index()
	def wide_to_long(df, wide_cols_list, value_cols, long_col, long_labels):
	"""converts selected columns from wide to long format. params:

	- df: the input dataframe
	- wide_cols_list: for each set of wide columns, a list of their names
	- value_cols: for each set of wide columns, the name of the long column to hold the values
	- long_col: the name of the column to indicate which set of wide columns the value comes from
	- long_labels: for each set of wide columns, what to call it in the long_col values.

	For example if I have the following DataFrame:

	+-----------+-----+-----+-----+-----+
	\| other_col \| x_a \| x_b \| y_a \| y_b \|
	+-----------+-----+-----+-----+-----+
	\| something \| 1 \| 2 \| 10 \| 20 \|
	+-----------+-----+-----+-----+-----+

	And I pass these arguments:

	wide_cols_list = [['x_a','y_a'],['x_b','y_b']]
	value_cols = ['x','y']
	long_col = 'replicate'
	long_labels = ['a','b']

	It'll generate this dataframe:

	+-----------+---+----+-----------+
	\| other_col \| x \| y \| replicate \|
	+-----------+---+----+-----------+
	\| something \| 1 \| 10 \| a \|
	\| something \| 2 \| 20 \| b \|
	+-----------+---+----+-----------+
	"""
	assert len(wide_cols_list) == len(long_labels)
	for w in wide_cols_list:
	assert len(w) == len(value_cols)
	exclude_cols = []
	for w in wide_cols_list:
	exclude_cols = exclude_cols + w
	common_cols = [c for c in df.columns if c not in exclude_cols]
	dfs = []
	for wide_cols, long_label in zip(wide_cols_list, long_labels):
	sdf = df[common_cols + wide_cols].copy()
	sdf[long_col] = long_label
	sdf.columns = common_cols + value_cols + [long_col]
	dfs.append(sdf)
	return pd.concat(dfs).sort_index()