Created
February 1, 2024 18:06
-
-
Save dimitri-justeau/0b0fa091f9c506b64476b369e19e37b0 to your computer and use it in GitHub Desktop.
Convert a flattened taxonomy csv file to a linked taxonomy csv file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Convert a flattened taxonomy csv file to a linked taxonomy csv file.
"""
import pandas as pd | |
def get_linked_taxonomy(flat_taxonomy, rank_cols, parent_id_col="id_parent", drop_rank_cols=True):
    """
    Convert a flat taxonomy DataFrame to a linked taxonomy DataFrame.

    In a flat taxonomy every taxonomic rank, from the taxon up to its root parent, is stored
    explicitly in a dedicated column. In a linked taxonomy each node of the taxonomic tree
    only references its direct parent.

    For each row, the first non-null rank column (in ``rank_cols`` order) is taken as the
    taxon's own rank, and the value stored in the following rank column is taken as the
    parent identifier. Rows whose first non-null rank column is the last one of ``rank_cols``
    (root level), or that have no non-null rank column at all, get no parent.

    This method assumes that the index of the DataFrame is the taxon identifier, and that
    every rank column points to a valid taxon.

    :param flat_taxonomy: A pandas DataFrame describing a flat taxonomy (with index being the taxon id).
        It is not modified; a copy is returned.
    :param rank_cols: Rank column labels, ordered from the deepest level to the root level.
    :param parent_id_col: Output column label for parent identifier.
    :param drop_rank_cols: If True, drop the rank columns in output DataFrame.
    :return: The linked taxonomy in a pandas DataFrame.
    """
    # Normalise to a list so that any sequence of labels (e.g. a tuple) selects several columns.
    rank_cols = list(rank_cols)
    taxo_linked = flat_taxonomy.copy()

    # Map each rank column to the column holding its direct parent (the next one in rank_cols).
    parent_rank = dict(zip(rank_cols, rank_cols[1:]))

    parent_col = []
    for _, row in taxo_linked.iterrows():
        # The first non-null rank column identifies the rank of the taxon itself.
        own_rank = row[rank_cols].first_valid_index()
        parent_rank_col = parent_rank.get(own_rank)
        parent_col.append(None if parent_rank_col is None else row[parent_rank_col])

    # Keep the historical position (third column) when possible, but never go past the
    # last column: DataFrame.insert raises IndexError for out-of-range positions.
    insert_at = min(2, len(taxo_linked.columns))
    taxo_linked.insert(insert_at, parent_id_col, parent_col)

    if drop_rank_cols:
        taxo_linked.drop(rank_cols, axis=1, inplace=True)
    return taxo_linked
if __name__ == '__main__':
    # Command-line entry point: read a flat taxonomy csv, link it, write the result as csv.
    import argparse

    def _str_to_bool(value):
        """
        Parse a command-line boolean.

        ``type=bool`` must not be used with argparse: it calls ``bool()`` on the raw string,
        so any non-empty value (including "False") would be parsed as True.
        The empty string is still parsed as False, as ``bool("")`` was.
        """
        normalized = value.strip().lower()
        if normalized in ("true", "t", "yes", "y", "1"):
            return True
        if normalized in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError("Expected a boolean value, got {!r}.".format(value))

    description = \
        """
        Converts a flat taxonomy csv file (i.e. where every taxonomic rank from the root parent of each taxa is explicitly stored)
        into a linked taxonomy csv file (i.e. where nodes in the taxonomic tree are only linked through their parent).
        Note: the input csv file must have a header row.
        """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'flat_taxonomy_csv',
        type=str,
        help="Path to the input flat taxonomy csv file."
    )
    parser.add_argument(
        'linked_taxonomy_csv',
        type=str,
        help="Path to the output linked taxonomy csv file."
    )
    parser.add_argument(
        'id_col',
        type=str,
        help="Label of the taxon id column."
    )
    parser.add_argument(
        'rank_cols',
        type=str,
        help="Comma-separated list of the rank columns labels, must be sorted from the highest rank (i.e. deepest level) to the lowest (i.e. root level)."
    )
    parser.add_argument(
        '--parent_col',
        type=str,
        default="id_parent",
        help="Label of output parent column."
    )
    parser.add_argument(
        '--drop_rank_cols',
        type=_str_to_bool,
        default=True,
        help="If True, drop the rank columns in output csv."
    )
    args = parser.parse_args()

    # The rank columns are given as a single comma-separated argument.
    rank_cols = args.rank_cols.split(",")
    # The taxon id column becomes the index, as expected by get_linked_taxonomy.
    taxo_flat = pd.read_csv(args.flat_taxonomy_csv, index_col=args.id_col)
    linked = get_linked_taxonomy(taxo_flat, rank_cols=rank_cols, parent_id_col=args.parent_col, drop_rank_cols=args.drop_rank_cols)
    linked.to_csv(args.linked_taxonomy_csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment