Skip to content

Instantly share code, notes, and snippets.

@sminot
Created March 2, 2021 00:30
Show Gist options
  • Save sminot/7dc131a3f8382a876b60cfeb6c194dce to your computer and use it in GitHub Desktop.
Save sminot/7dc131a3f8382a876b60cfeb6c194dce to your computer and use it in GitHub Desktop.
Merge data from multiple files containing MetaPhlAn2 outputs
#!/usr/bin/env python3
import os
import pandas as pd
import sys
def read_metaphlan(fp):
"""Function to read a single file with metaphlan2 results encoded as a TSV."""
return pd.read_csv( # Function to read tabular data
fp, # Specify the file path
sep="\t", # Columns are delimited with tabs
comment="#", # Skip all columns starting with a # comment character
header=None, # Provide our own custom header (on the next line)
names=['clade', 'relative_abundance', 'coverage', 'average_genome_length_in_the_clade', 'estimated_number_of_reads_from_the_clade']
)
def read_metaphlan_folder(folder, file_suffix=".metaphlan2.tsv"):
"""Function to read in and merge all of the metaphlan2 results in a particular folder."""
# Format all data as a dict -- keys are sample names, values are the metaphlan abundances
dat = dict()
# Iterate over each of the files in the input folder
for fp in os.listdir(folder):
# If the folder ends with the specified file suffix
if fp.endswith(file_suffix):
# Parse the sample name from the file path
sample_name = fp.replace(file_suffix, "")
# Read in and save the data from this file
dat[sample_name] = read_metaphlan(os.path.join(folder, fp))
# Return all of the data that has been read in
return dat
# The user should provide two inputs, the path to the folder containing all input files, and the output path
# Make sure that there are two input arguments
assert len(sys.argv) == 3, "Please specify input folder and output path."
# Parse the arguments
input_folder = sys.argv[1]
output_path = sys.argv[2]
# Make sure the input folder exists
assert os.path.exists(input_folder), f"Input folder does not exist: {input_folder}"
# Make sure the output file does not exist (so that we don't overwrite it)
assert not os.path.exists(output_path), f"Output path already exists ({output_path}), please specify another."
# Read in all of the input data
dat = read_metaphlan_folder(input_folder)
print(f"Read in data for {len(dat):,} specimens")
# Make sure that we were able to read in some data
assert len(dat) > 0, f"No files were found to parse in the input folder: {input_folder}"
# Format the output by merging all tables into a DataFrame
df = pd.DataFrame({
sample_name: sample_dat.set_index("clade")["relative_abundance"]
for sample_name, sample_dat in dat.items()
}).fillna(0)
print(f"Read in data for {df.shape[0]:,} taxa")
# Write out to a file
df.to_csv(output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment