Skip to content

Instantly share code, notes, and snippets.

@SilasK
Created February 21, 2022 14:03
Show Gist options
  • Save SilasK/f95a2efdacb277145e878f95dcc0a497 to your computer and use it in GitHub Desktop.
Save SilasK/f95a2efdacb277145e878f95dcc0a497 to your computer and use it in GitHub Desktop.
A function to add long reads to an atlas sample table https://github.com/metagenome-atlas/atlas/issues/273
#!/usr/bin/env python
import os, shutil
from atlas.color_logger import logger
from pathlib2 import Path
import pandas as pd
import argparse
from snakemake.shell import shell
def import_long_reads(paths, sample_table,overwrite=False):
# if type(paths) ==str: paths=[paths]
## List fastq files
fastq_files= []
for path in paths:
if not os.path.exists(path):
logger.critical(f"{path} does not exist")
exit(1)
fastq_files += [ str(p) for p in Path(path).glob("**/*.[fq|fastq]*")]
# transform to pandas Series, and dump
long_read_list_file = "long_reads_list.txt"
logger.info(f"Found {len(fastq_files)} fastq files. Save to {long_read_list_file}")
long_reads= pd.Series(fastq_files)
long_reads.to_csv(long_read_list_file,sep="\t",index=False,header=False)
# read sample table
sample_table = pd.read_csv(sample_table, sep="\t")
assert "Full_Name" in sample_table.columns, "Sample table must contain a column called Full_Name"
if ("longreads" in sample_table.columns) \
and (not sample_table["longreads"].isnull().all())\
and (not overwrite)\
:
logger.error("longreads column already exists in sample table. Use --overwrite to overwrite")
exit(1)
sample_table["longreads"]= pd.NA
assert long_reads.shape[0] == sample_table.shape[0], "Sample table should have the same number of rows as the number of long reads"
# try to map long to short reads
one2one_mapping = True
for sample_id in sample_table.index:
# find long reads for this sample
sample_name = sample_table.loc[sample_id, "Full_Name"]
contained = long_reads.str.contains(sample_name)
N_mappings = contained.sum()
if N_mappings == 0:
logger.error(f"No long reads found for sample {sample_name}")
one2one_mapping = False
elif N_mappings > 1:
logger.error(f"Multiple long reads found for sample {sample_name}")
one2one_mapping = False
else:
# exact mapping
if not pd.isna(sample_table.loc[sample_id, "longreads"]):
logger.error(f"Sample {sample_name} already has a long read mapping")
one2one_mapping = False
else:
sample_table.loc[sample_id, "longreads"]= long_reads[contained].values[0]
# check duplicates
if sample_table["longreads"].duplicated().any():
logger.error("Duplicated long reads found in sample table")
one2one_mapping = False
if not one2one_mapping:
logger.error("Long reads could not be mapped one to one to short reads")
exit(1)
### test
def create_test_files( test_directory_path= "test/test_long_reads/"):
"Create a folder structure for testing long reads and short reads"
test_directory= Path(test_directory_path)
if test_directory.exists():
shutil.rmtree( test_directory )
test_directory.mkdir(exist_ok=True)
for sample in ["2017.12.04_18.45.54_sample_0","2018.01.23_11.53.11_sample_0"]:
for read_type in ["short_read", "pacbio"]:
subfolder= test_directory /"fastqs" / read_type / sample
subfolder.mkdir(parents=True,exist_ok=True)
fastq_path= subfolder / "reads.fq.gz"
fastq_path.touch()
def test(test_directory_path= "test/test_long_reads/"):
create_test_files(test_directory_path= test_directory_path)
# atlas init
shell( "cd {test_directory_path}; atlas init fastqs/short_read" )
# import long reads
import_long_reads([str(test_directory_path / "pacbio" )])
if __name__ == "__main__":
parser = argparse.ArgumentParser("Add long reads to sample table")
parser.add_argument("long_reads_paths",
help="Path to directory containing long reads fastq(.gz) files", nargs="+")
parser.add_argument("--sample_table", help="Path to sample table", default="samples.tsv")
parser.add_argument("--overwrite", help="Overwrite long reads column in sample table", action="store_true")
args = parser.parse_args()
import_long_reads(args.long_reads_paths, args.sample_table, args.overwrite)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment