dimitri-justeau/niamoto_link_taxonomy.py

## niamoto_link_taxonomy.py
"""
Convert a flattened taxonomy csv file to a linked taxonomy csv file.
"""

import pandas as pd


def get_linked_taxonomy(flat_taxonomy, rank_cols, parent_id_col="id_parent", drop_rank_cols=True):
    """
    Converts a flat taxonomy DataFrame (i.e. where every taxonomic rank from the root parent of each taxa is explicitly stored) to a linked
    taxonomy DataFrame (i.e. where nodes in the taxonomic tree are only linked through their parent).

    For each row, the first non-null rank column (following the order of ``rank_cols``) is taken as the taxon's own rank, and the value
    stored in the next rank column becomes its parent identifier. Rows whose own rank is the last entry of ``rank_cols``, or whose rank
    columns are all null, get a null parent.

    This method assumes that the index of the DataFrame is the taxon identifier, and that every rank column points to a valid taxon.

    :param flat_taxonomy: A pandas DataFrame describing a flat taxonomy (with index being the taxon id). It is not modified.
    :param rank_cols: Sequence of the rank columns labels, must be sorted from the highest rank (i.e. deepest level) to the lowest
        (i.e. root level).
    :param parent_id_col: Output column label for parent identifier.
    :param drop_rank_cols: If True, drop the rank columns in output DataFrame.
    :return: The linked taxonomy in a new Pandas DataFrame. The parent column is inserted as the third column, or as the last column
        when the input has fewer than two columns.
    """
    # Normalise to a list so tuples / Index objects select several columns instead of being read as a single (multi-level) key.
    rank_cols = list(rank_cols)
    taxo_linked = flat_taxonomy.copy()
    # Maps every rank column to the column holding the next rank towards the root; the root rank has no entry.
    parent_rank_of = dict(zip(rank_cols, rank_cols[1:]))
    parent_ids = []
    for _, row in taxo_linked.iterrows():
        # The first filled rank column identifies the taxon's own rank (None when every rank column is null).
        own_rank = row[rank_cols].first_valid_index()
        if own_rank in parent_rank_of:
            parent_ids.append(row[parent_rank_of[own_rank]])
        else:
            parent_ids.append(None)
    # Keep the historical position (third column) but never point past the end of a narrow frame.
    position = min(2, len(taxo_linked.columns))
    taxo_linked.insert(position, parent_id_col, parent_ids)
    if drop_rank_cols:
        taxo_linked = taxo_linked.drop(columns=rank_cols)
    return taxo_linked


if __name__ == '__main__':

    # Command-line entry point: read a flat taxonomy csv, link it, write the result to another csv.
    import argparse

    def _parse_bool(text):
        """
        Parse a boolean given on the command line.

        ``type=bool`` cannot be used with argparse because ``bool("False")`` is True (any non-empty string is truthy), which made it
        impossible to pass a false value explicitly.

        :param text: The raw command-line value.
        :return: True or False.
        """
        lowered = text.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string is accepted as False because it was the only value evaluating to False with the former ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError("Expected a boolean value (true/false), got {!r}.".format(text))

    description = \
        """
        Converts a flat taxonomy csv file (i.e. where every taxonomic rank from the root parent of each taxa is explicitly stored)
        into a linked taxonomy csv file (i.e. where nodes in the taxonomic tree are only linked through their parent).
        Note: the input csv file must have a header row.
        """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'flat_taxonomy_csv',
        type=str,
        help="Path to the input flat taxonomy csv file."
    )
    parser.add_argument(
        'linked_taxonomy_csv',
        type=str,
        help="Path to the output linked taxonomy csv file."
    )
    parser.add_argument(
        'id_col',
        type=str,
        help="Label of the taxon id column."
    )
    parser.add_argument(
        'rank_cols',
        type=str,
        help="Comma-separated list of the rank columns labels, must be sorted from the highest rank (i.e. deepest level) to the lowest (i.e. root level)."
    )

    parser.add_argument(
        '--parent_col',
        type=str,
        default="id_parent",
        help="Label of output parent column."
    )
    parser.add_argument(
        '--drop_rank_cols',
        type=_parse_bool,
        default=True,
        help="If True, drop the rank columns in output csv (accepted values: true/false, yes/no, 1/0)."
    )

    args = parser.parse_args()
    # Labels are used verbatim (no whitespace stripping) so they match the csv header exactly.
    rank_cols = args.rank_cols.split(",")
    # The taxon id column becomes the index, as required by get_linked_taxonomy.
    taxo_flat = pd.read_csv(args.flat_taxonomy_csv, index_col=args.id_col)
    linked = get_linked_taxonomy(taxo_flat, rank_cols=rank_cols, parent_id_col=args.parent_col, drop_rank_cols=args.drop_rank_cols)
    linked.to_csv(args.linked_taxonomy_csv)
	"""
	Convert a flattened taxonomy csv file to a linked taxonomy csv file.
	"""

	import pandas as pd


	def get_linked_taxonomy(flat_taxonomy, rank_cols, parent_id_col="id_parent", drop_rank_cols=True):
		"""
		Converts a flat taxonomy DataFrame (i.e. where every taxonomic rank from the root parent of each taxa is explicitly stored) to a linked
		taxonomy DataFrame (i.e. where nodes in the taxonomic tree are only linked through their parent).

		For each row, the first non-null rank column (following the order of ``rank_cols``) is taken as the taxon's own rank, and the value
		stored in the next rank column becomes its parent identifier. Rows whose own rank is the last entry of ``rank_cols``, or whose rank
		columns are all null, get a null parent.

		This method assumes that the index of the DataFrame is the taxon identifier, and that every rank column points to a valid taxon.

		:param flat_taxonomy: A pandas DataFrame describing a flat taxonomy (with index being the taxon id). It is not modified.
		:param rank_cols: Sequence of the rank columns labels, must be sorted from the highest rank (i.e. deepest level) to the lowest
			(i.e. root level).
		:param parent_id_col: Output column label for parent identifier.
		:param drop_rank_cols: If True, drop the rank columns in output DataFrame.
		:return: The linked taxonomy in a new Pandas DataFrame. The parent column is inserted as the third column, or as the last column
			when the input has fewer than two columns.
		"""
		# Normalise to a list so tuples / Index objects select several columns instead of being read as a single (multi-level) key.
		rank_cols = list(rank_cols)
		taxo_linked = flat_taxonomy.copy()
		# Maps every rank column to the column holding the next rank towards the root; the root rank has no entry.
		parent_rank_of = dict(zip(rank_cols, rank_cols[1:]))
		parent_ids = []
		for _, row in taxo_linked.iterrows():
			# The first filled rank column identifies the taxon's own rank (None when every rank column is null).
			own_rank = row[rank_cols].first_valid_index()
			if own_rank in parent_rank_of:
				parent_ids.append(row[parent_rank_of[own_rank]])
			else:
				parent_ids.append(None)
		# Keep the historical position (third column) but never point past the end of a narrow frame.
		position = min(2, len(taxo_linked.columns))
		taxo_linked.insert(position, parent_id_col, parent_ids)
		if drop_rank_cols:
			taxo_linked = taxo_linked.drop(columns=rank_cols)
		return taxo_linked


	if __name__ == '__main__':

		# Command-line entry point: read a flat taxonomy csv, link it, write the result to another csv.
		import argparse

		def _parse_bool(text):
			"""
			Parse a boolean given on the command line.

			``type=bool`` cannot be used with argparse because ``bool("False")`` is True (any non-empty string is truthy), which made it
			impossible to pass a false value explicitly.

			:param text: The raw command-line value.
			:return: True or False.
			"""
			lowered = text.strip().lower()
			if lowered in ("true", "t", "yes", "y", "1"):
				return True
			# The empty string is accepted as False because it was the only value evaluating to False with the former ``type=bool``.
			if lowered in ("false", "f", "no", "n", "0", ""):
				return False
			raise argparse.ArgumentTypeError("Expected a boolean value (true/false), got {!r}.".format(text))

		description = \
			"""
			Converts a flat taxonomy csv file (i.e. where every taxonomic rank from the root parent of each taxa is explicitly stored)
			into a linked taxonomy csv file (i.e. where nodes in the taxonomic tree are only linked through their parent).
			Note: the input csv file must have a header row.
			"""
		parser = argparse.ArgumentParser(description=description)
		parser.add_argument(
			'flat_taxonomy_csv',
			type=str,
			help="Path to the input flat taxonomy csv file."
		)
		parser.add_argument(
			'linked_taxonomy_csv',
			type=str,
			help="Path to the output linked taxonomy csv file."
		)
		parser.add_argument(
			'id_col',
			type=str,
			help="Label of the taxon id column."
		)
		parser.add_argument(
			'rank_cols',
			type=str,
			help="Comma-separated list of the rank columns labels, must be sorted from the highest rank (i.e. deepest level) to the lowest (i.e. root level)."
		)

		parser.add_argument(
			'--parent_col',
			type=str,
			default="id_parent",
			help="Label of output parent column."
		)
		parser.add_argument(
			'--drop_rank_cols',
			type=_parse_bool,
			default=True,
			help="If True, drop the rank columns in output csv (accepted values: true/false, yes/no, 1/0)."
		)

		args = parser.parse_args()
		# Labels are used verbatim (no whitespace stripping) so they match the csv header exactly.
		rank_cols = args.rank_cols.split(",")
		# The taxon id column becomes the index, as required by get_linked_taxonomy.
		taxo_flat = pd.read_csv(args.flat_taxonomy_csv, index_col=args.id_col)
		linked = get_linked_taxonomy(taxo_flat, rank_cols=rank_cols, parent_id_col=args.parent_col, drop_rank_cols=args.drop_rank_cols)
		linked.to_csv(args.linked_taxonomy_csv)