Created
February 21, 2022 14:03
-
-
Save SilasK/f95a2efdacb277145e878f95dcc0a497 to your computer and use it in GitHub Desktop.
A function to add long reads to an atlas sample table https://github.com/metagenome-atlas/atlas/issues/273
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, shutil | |
from atlas.color_logger import logger | |
from pathlib2 import Path | |
import pandas as pd | |
import argparse | |
from snakemake.shell import shell | |
def import_long_reads(paths, sample_table, overwrite=False):
    """Find long-read fastq files and map them 1:1 onto an atlas sample table.

    Searches every directory in `paths` recursively for fastq files, matches
    each file to a row of the sample table by substring search on the
    `Full_Name` column, writes the file list to `long_reads_list.txt`, and —
    on a successful one-to-one mapping — saves the table back with a new
    `longreads` column.

    Args:
        paths: directory or list of directories to search recursively.
        sample_table: path to the tab-separated atlas sample table
            (must contain a `Full_Name` column).
        overwrite: allow replacing an existing, non-empty `longreads` column.

    Exits with status 1 on any missing path or ambiguous/failed mapping.
    """
    # accept a single directory as well as a list of directories
    if isinstance(paths, str):
        paths = [paths]

    ## List fastq files
    fastq_files = []
    for path in paths:
        if not os.path.exists(path):
            logger.critical(f"{path} does not exist")
            exit(1)
        # ".fq*" / ".fastq*" also catches compressed files, e.g. reads.fq.gz.
        # (The old pattern "*.[fq|fastq]*" was a character class — it matched
        # any extension starting with one of f,q,|,a,s,t, e.g. .tsv or .txt.)
        for pattern in ("**/*.fq*", "**/*.fastq*"):
            fastq_files += [str(p) for p in Path(path).glob(pattern)]

    # transform to pandas Series, and dump the file list for inspection
    long_read_list_file = "long_reads_list.txt"
    logger.info(f"Found {len(fastq_files)} fastq files. Save to {long_read_list_file}")
    long_reads = pd.Series(fastq_files)
    long_reads.to_csv(long_read_list_file, sep="\t", index=False, header=False)

    # read sample table (keep the path so we can write the result back)
    sample_table_file = sample_table
    sample_table = pd.read_csv(sample_table_file, sep="\t")
    assert "Full_Name" in sample_table.columns, "Sample table must contain a column called Full_Name"

    # refuse to clobber an existing mapping unless explicitly asked
    if ("longreads" in sample_table.columns) \
            and (not sample_table["longreads"].isnull().all()) \
            and (not overwrite):
        logger.error("longreads column already exists in sample table. Use --overwrite to overwrite")
        exit(1)

    sample_table["longreads"] = pd.NA

    assert long_reads.shape[0] == sample_table.shape[0], "Sample table should have the same number of rows as the number of long reads"

    # try to map long to short reads: each sample's Full_Name must occur in
    # exactly one fastq path, and each fastq path may be used only once
    one2one_mapping = True
    for sample_id in sample_table.index:
        # find long reads for this sample
        sample_name = sample_table.loc[sample_id, "Full_Name"]
        contained = long_reads.str.contains(sample_name)
        N_mappings = contained.sum()

        if N_mappings == 0:
            logger.error(f"No long reads found for sample {sample_name}")
            one2one_mapping = False
        elif N_mappings > 1:
            logger.error(f"Multiple long reads found for sample {sample_name}")
            one2one_mapping = False
        else:
            # exact mapping
            if not pd.isna(sample_table.loc[sample_id, "longreads"]):
                logger.error(f"Sample {sample_name} already has a long read mapping")
                one2one_mapping = False
            else:
                sample_table.loc[sample_id, "longreads"] = long_reads[contained].values[0]

    # check duplicates: two samples must not claim the same fastq file
    if sample_table["longreads"].duplicated().any():
        logger.error("Duplicated long reads found in sample table")
        one2one_mapping = False

    if not one2one_mapping:
        logger.error("Long reads could not be mapped one to one to short reads")
        exit(1)

    # persist the mapping — previously the updated table was silently dropped
    sample_table.to_csv(sample_table_file, sep="\t", index=False)
    logger.info(f"Added longreads column to {sample_table_file}")
### test | |
def create_test_files(test_directory_path="test/test_long_reads/"):
    """Create a folder structure with empty fastq files for testing.

    Layout produced:
        <test_directory_path>/fastqs/{short_read,pacbio}/<sample>/reads.fq.gz

    Args:
        test_directory_path: directory to (re)create; removed first if it
            already exists.
    """
    test_directory = Path(test_directory_path)
    if test_directory.exists():
        shutil.rmtree(test_directory)
    # parents=True so the nested default path ("test/test_long_reads/")
    # works even when "test/" does not exist yet
    test_directory.mkdir(parents=True, exist_ok=True)

    for sample in ["2017.12.04_18.45.54_sample_0", "2018.01.23_11.53.11_sample_0"]:
        for read_type in ["short_read", "pacbio"]:
            subfolder = test_directory / "fastqs" / read_type / sample
            subfolder.mkdir(parents=True, exist_ok=True)
            # empty placeholder fastq file
            (subfolder / "reads.fq.gz").touch()
def test(test_directory_path="test/test_long_reads/"):
    """End-to-end test: build fixtures, run `atlas init`, import long reads.

    Requires the `atlas` command line tool on PATH; `atlas init` writes a
    samples.tsv inside the test directory.
    """
    create_test_files(test_directory_path=test_directory_path)
    test_directory = Path(test_directory_path)

    # atlas init: build the sample table from the short reads
    shell("cd {test_directory_path}; atlas init fastqs/short_read")

    # import long reads.
    # Fixes vs. original: `test_directory_path` was a str, so
    # `test_directory_path / "pacbio"` raised TypeError; the pacbio reads live
    # under fastqs/pacbio; and the mandatory sample_table argument was missing.
    import_long_reads(
        [str(test_directory / "fastqs" / "pacbio")],
        sample_table=str(test_directory / "samples.tsv"),
    )
if __name__ == "__main__": | |
parser = argparse.ArgumentParser("Add long reads to sample table") | |
parser.add_argument("long_reads_paths", | |
help="Path to directory containing long reads fastq(.gz) files", nargs="+") | |
parser.add_argument("--sample_table", help="Path to sample table", default="samples.tsv") | |
parser.add_argument("--overwrite", help="Overwrite long reads column in sample table", action="store_true") | |
args = parser.parse_args() | |
import_long_reads(args.long_reads_paths, args.sample_table, args.overwrite) | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.