Created
February 21, 2022 14:03
-
-
Save SilasK/f95a2efdacb277145e878f95dcc0a497 to your computer and use it in GitHub Desktop.
A function to add long reads to an atlas sample table https://github.com/metagenome-atlas/atlas/issues/273
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, shutil | |
from atlas.color_logger import logger | |
from pathlib2 import Path | |
import pandas as pd | |
import argparse | |
from snakemake.shell import shell | |
def import_long_reads(paths, sample_table, overwrite=False):
    """Find long-read fastq files and map them 1:1 onto an atlas sample table.

    Searches every directory in `paths` recursively for fastq files, matches
    each file to a row of the sample table by substring search on the
    `Full_Name` column, writes the file list to `long_reads_list.txt`, and —
    on a successful one-to-one mapping — saves the table back with a new
    `longreads` column.

    Args:
        paths: directory or list of directories to search recursively.
        sample_table: path to the tab-separated atlas sample table
            (must contain a `Full_Name` column).
        overwrite: allow replacing an existing, non-empty `longreads` column.

    Exits with status 1 on any missing path or ambiguous/failed mapping.
    """
    # accept a single directory as well as a list of directories
    if isinstance(paths, str):
        paths = [paths]

    ## List fastq files
    fastq_files = []
    for path in paths:
        if not os.path.exists(path):
            logger.critical(f"{path} does not exist")
            exit(1)
        # ".fq*" / ".fastq*" also catches compressed files, e.g. reads.fq.gz.
        # (The old pattern "*.[fq|fastq]*" was a character class — it matched
        # any extension starting with one of f,q,|,a,s,t, e.g. .tsv or .txt.)
        for pattern in ("**/*.fq*", "**/*.fastq*"):
            fastq_files += [str(p) for p in Path(path).glob(pattern)]

    # transform to pandas Series, and dump the file list for inspection
    long_read_list_file = "long_reads_list.txt"
    logger.info(f"Found {len(fastq_files)} fastq files. Save to {long_read_list_file}")
    long_reads = pd.Series(fastq_files)
    long_reads.to_csv(long_read_list_file, sep="\t", index=False, header=False)

    # read sample table (keep the path so we can write the result back)
    sample_table_file = sample_table
    sample_table = pd.read_csv(sample_table_file, sep="\t")
    assert "Full_Name" in sample_table.columns, "Sample table must contain a column called Full_Name"

    # refuse to clobber an existing mapping unless explicitly asked
    if ("longreads" in sample_table.columns) \
            and (not sample_table["longreads"].isnull().all()) \
            and (not overwrite):
        logger.error("longreads column already exists in sample table. Use --overwrite to overwrite")
        exit(1)

    sample_table["longreads"] = pd.NA

    assert long_reads.shape[0] == sample_table.shape[0], "Sample table should have the same number of rows as the number of long reads"

    # try to map long to short reads: each sample's Full_Name must occur in
    # exactly one fastq path, and each fastq path may be used only once
    one2one_mapping = True
    for sample_id in sample_table.index:
        # find long reads for this sample
        sample_name = sample_table.loc[sample_id, "Full_Name"]
        contained = long_reads.str.contains(sample_name)
        N_mappings = contained.sum()

        if N_mappings == 0:
            logger.error(f"No long reads found for sample {sample_name}")
            one2one_mapping = False
        elif N_mappings > 1:
            logger.error(f"Multiple long reads found for sample {sample_name}")
            one2one_mapping = False
        else:
            # exact mapping
            if not pd.isna(sample_table.loc[sample_id, "longreads"]):
                logger.error(f"Sample {sample_name} already has a long read mapping")
                one2one_mapping = False
            else:
                sample_table.loc[sample_id, "longreads"] = long_reads[contained].values[0]

    # check duplicates: two samples must not claim the same fastq file
    if sample_table["longreads"].duplicated().any():
        logger.error("Duplicated long reads found in sample table")
        one2one_mapping = False

    if not one2one_mapping:
        logger.error("Long reads could not be mapped one to one to short reads")
        exit(1)

    # persist the mapping — previously the updated table was silently dropped
    sample_table.to_csv(sample_table_file, sep="\t", index=False)
    logger.info(f"Added longreads column to {sample_table_file}")
### test | |
def create_test_files(test_directory_path="test/test_long_reads/"):
    """Create a folder structure with empty fastq files for testing.

    Layout produced:
        <test_directory_path>/fastqs/{short_read,pacbio}/<sample>/reads.fq.gz

    Args:
        test_directory_path: directory to (re)create; removed first if it
            already exists.
    """
    test_directory = Path(test_directory_path)
    if test_directory.exists():
        shutil.rmtree(test_directory)
    # parents=True so the nested default path ("test/test_long_reads/")
    # works even when "test/" does not exist yet
    test_directory.mkdir(parents=True, exist_ok=True)

    for sample in ["2017.12.04_18.45.54_sample_0", "2018.01.23_11.53.11_sample_0"]:
        for read_type in ["short_read", "pacbio"]:
            subfolder = test_directory / "fastqs" / read_type / sample
            subfolder.mkdir(parents=True, exist_ok=True)
            # empty placeholder fastq file
            (subfolder / "reads.fq.gz").touch()
def test(test_directory_path="test/test_long_reads/"):
    """End-to-end test: build fixtures, run `atlas init`, import long reads.

    Requires the `atlas` command line tool on PATH; `atlas init` writes a
    samples.tsv inside the test directory.
    """
    create_test_files(test_directory_path=test_directory_path)
    test_directory = Path(test_directory_path)

    # atlas init: build the sample table from the short reads
    shell("cd {test_directory_path}; atlas init fastqs/short_read")

    # import long reads.
    # Fixes vs. original: `test_directory_path` was a str, so
    # `test_directory_path / "pacbio"` raised TypeError; the pacbio reads live
    # under fastqs/pacbio; and the mandatory sample_table argument was missing.
    import_long_reads(
        [str(test_directory / "fastqs" / "pacbio")],
        sample_table=str(test_directory / "samples.tsv"),
    )
if __name__ == "__main__": | |
parser = argparse.ArgumentParser("Add long reads to sample table") | |
parser.add_argument("long_reads_paths", | |
help="Path to directory containing long reads fastq(.gz) files", nargs="+") | |
parser.add_argument("--sample_table", help="Path to sample table", default="samples.tsv") | |
parser.add_argument("--overwrite", help="Overwrite long reads column in sample table", action="store_true") | |
args = parser.parse_args() | |
import_long_reads(args.long_reads_paths, args.sample_table, args.overwrite) | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.