Skip to content

Instantly share code, notes, and snippets.

@Godrigos
Last active December 14, 2022 13:45
Show Gist options
  • Save Godrigos/ab5e64157461e070ed16745aed294a49 to your computer and use it in GitHub Desktop.
Save Godrigos/ab5e64157461e070ed16745aed294a49 to your computer and use it in GitHub Desktop.
A small module to join gene data tables from GenBank
# Version: 2022/12/07 - 10:51
import pandas as pd
import os
import sys
# Collect the path of every table file (.tab extension) in the tables
# directory.
tables_dir = "./tables"
try:
    # os.listdir returns entries in arbitrary order; sorting them makes the
    # order of `files` (and anything derived from that order) reproducible
    # between runs and machines.
    entries = sorted(os.listdir(tables_dir))
except (FileNotFoundError, NotADirectoryError):
    # Exit with a readable message instead of a traceback, in the same way
    # as the empty-list case below.
    sys.exit(f"{tables_dir} is not an existing directory.")
files: list[str] = [
    os.path.join(tables_dir, entry)
    for entry in entries
    if entry.endswith(".tab")
]
# Exit if files list is empty
if not files:
    sys.exit(f"No table files in {tables_dir}.")
# Iterate through the files and stack the first two columns of each table
# (Taxa and Lineage only; no Accession Numbers for now).
frames: list[pd.DataFrame] = []
for path in files:
    raw = pd.read_table(path, header=None)
    # With header=None the columns are labelled 0, 1, 2, ...; keep the first
    # two and give them readable names.
    pair = raw[[0, 1]].rename(columns={0: "Taxa", 1: "Lineage"})
    frames.append(pair)
Taxa = pd.concat(frames)
# Map every Lineage to the list of Taxa names attributed to it
taxa_by_lineage = Taxa.groupby("Lineage")["Taxa"].apply(  # type: ignore
    list).to_dict()
# Keep the first name seen for each Lineage; prefix it with an * when the
# Lineage was given more than one distinct name
labels = {
    lineage: taxa[0] if len(set(taxa)) == 1 else f"*{taxa[0]}"
    for lineage, taxa in taxa_by_lineage.items()
}
# Turn the labels dict into a pandas dataframe
names = pd.DataFrame(list(labels.items()), columns=["Lineage", "Taxa"])
# Re-read each table and attach its third column (the Accession Numbers,
# per the original note) to the names table, one new column per file.
for file in files:
    table = pd.read_table(file, header=None)[[1, 2]]
    # Label the new column with the file name up to the first "final".
    # The base name is taken BEFORE splitting so that a "final" occurring in
    # the directory part of the path cannot truncate or empty the label.
    label = os.path.basename(file).split("final", 1)[0]
    table.columns = ["Lineage", label]  # type: ignore
    # Outer join: keep lineages that are absent from some of the tables.
    names = names.merge(table, how="outer", on="Lineage")
# Save final table with merged results as tab separated csv
names.to_csv('./JointTable.tab', sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment