Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This script creates reference genome indexes with mixtures of organisms/assemblies
#!/usr/bin/env python
"""
This script creates reference genomes with mixtures of organisms/assemblies.
"""
import sys
import os
import argparse
import pysam
from ngs_toolkit.utils import download_gzip_file
from ngs_toolkit.utils import submit_job
BOWTIE2_EXEC = "bowtie2-build"
# STAR_EXEC = "/home/arendeiro/workspace/STAR-2.7.0e/bin/Linux_x86_64_static/STAR"
# PICARD_JAR = "/cm/shared/apps/picard-tools/1.140/picard.jar"
def parse_arguments():
organisms = ",".join(['homo_sapiens', 'drosophila_melanogaster'])
genome_providers = ",".join(['ensembl', 'ensembl'])
genome_provider_releases = ",".join(['97', '97'])
assemblies = ",".join(['GRCh38', 'BDGP6.22'])
compute_kwargs = {"partition": "develop", "time": "08:00:00"}
compute_kwargs = "{" + ",".join(
["'{}':'{}'".format(k, v) for k, v in compute_kwargs.items()]) + "}"
parser = argparse.ArgumentParser(
description=__doc__)
parser.add_argument(
"-o", "--output-dir",
dest="output_dir",
help="Output directory"
"Will be created if not existing.")
parser.add_argument(
"--organisms",
required=True,
dest="organisms",
default=organisms,
type=str,
help="Species name for organisms. Comma-delimited list.")
parser.add_argument(
"--genome-providers",
required=True,
dest="genome_providers",
default=genome_providers,
type=str,
help="Genome provider for each organism. Comma-delimited list.")
parser.add_argument(
"--genome-provider-releases",
required=True,
dest="genome_provider_releases",
default=genome_provider_releases,
type=str,
help="Release or version of genome provider for each organism. "
"Comma-delimited list.")
parser.add_argument(
"--assemblies",
required=True,
dest="assemblies",
default=assemblies,
type=str,
help="Genome assembly of each organism. Comma-delimited list.")
parser.add_argument(
"--compute-kwargs",
dest="compute_kwargs",
default=compute_kwargs,
type=str,
help="A python dictionary with arguments to pass to `divvy`."
"""Default is: "{}" """.format(compute_kwargs))
parser.add_argument(
"-d",
"--dry-run",
dest="dry",
action="store_true",
help="Don't actually do anything.")
args = parser.parse_args()
k = [
'organisms', 'genome_providers',
'genome_provider_releases', 'assemblies']
for arg in k:
setattr(args, arg, getattr(args, arg).split(","))
args.compute_kwargs = eval(args.compute_kwargs)
if not isinstance(args.compute_kwargs, dict):
parser.error("Wrong parsing of 'compute_kwargs'!")
return args
def main():
args = parse_arguments()
genome_handle = "_".join(args.assemblies)
genome_dir = os.path.join(args.output_dir, genome_handle)
genomes_dir = [os.path.join(genome_dir, assembl) for assembl in args.assemblies]
for _dir in [args.output_dir, genome_dir] + genomes_dir:
if not os.path.exists(_dir):
os.makedirs(_dir)
provider_url = {
"ensembl": "ftp://ftp.ensembl.org/pub/"
}
args_l = list(zip(
args.organisms, args.genome_providers,
args.genome_provider_releases,
args.assemblies, genomes_dir))
# Download genome files
fasta_files = dict()
gtf_files = dict()
for organism, genome_provider, release, assembly, _dir in args_l:
# Download FASTA
fasta_file = f"{organism.capitalize()}.{assembly}.dna.toplevel.fa.gz"
fasta_file = os.path.join(_dir, fasta_file).replace(".gz", "")
url = f"{provider_url[genome_provider]}release-{release}/fasta/{organism}"
url += f"/dna_index/{fasta_file}"
if not args.dry:
download_gzip_file(url, fasta_file)
fasta_files[organism] = fasta_file
# Download GTF
gtf_file = f"{organism.capitalize()}.{assembly}.{release}.gtf.gz"
gtf_file = os.path.join(_dir, gtf_file).replace(".gz", "")
url = f"{provider_url[genome_provider]}release-{release}/gtf/{organism}"
url += f"/{gtf_file}"
if not args.dry:
download_gzip_file(url, os.path.join(_dir, gtf_file))
gtf_files[organism] = gtf_file
# prefix files if needed (configurable)
prefixed_fasta_files = dict()
prefixed_gtf_files = dict()
for organism, assembly in zip(args.organisms, args.assemblies):
output = fasta_files[organism].replace(".fa", ".prefixed.fa")
sed = f"sed 's/^>/>{organism}:{assembly}::/'"
cmd = fr"{sed} {fasta_files[organism]} > {output}"
if not args.dry:
os.system(cmd)
prefixed_fasta_files[organism] = output
output = gtf_files[organism].replace(".gtf", ".prefixed.gtf")
sed = f"sed 's/^[^#]/{organism}:{assembly}::/'"
cmd = fr"{sed} {gtf_files[organism]} > {output}"
if not args.dry:
os.system(cmd)
prefixed_gtf_files[organism] = output
# Concatenate
output_fasta = os.path.join(genome_dir, genome_handle + ".fa")
cmd = f"cat {' '.join(prefixed_fasta_files.values())} > {output_fasta}"
if not args.dry:
os.system(cmd)
output_gtf = os.path.join(genome_dir, genome_handle + ".gtf")
cmd = f"cat {' '.join(prefixed_gtf_files.values())} > {output_gtf}"
if not args.dry:
os.system(cmd)
# Get chromosome size files
pysam.samtools.faidx(output_fasta) # this takes a whiiile
chrom_sizes_file = os.path.join(genome_dir, genome_handle + ".chrom.sizes")
with open(output_fasta.replace(".fa", ".fa.fai"), "r") as read:
with open(chrom_sizes_file, "w") as write:
for line in read:
write.write("\t".join(line.split("\t")[:2]) + "\n")
# Build indexes for Bowtie2 and STAR
# # Bowtie2
cmd = f"{BOWTIE2_EXEC}"
cmd += " --threads 8"
cmd += f" {output_fasta}"
cmd += f" {output_fasta.replace('.fa', '')}"
job_name = genome_handle + ".make_index.bowtie2"
bowtie2_script = os.path.join(genome_dir, job_name + ".sh")
args.compute_kwargs.update({"jobname": job_name, "cores": 8, "mem": 80000})
submit_job(cmd, bowtie2_script, **args.compute_kwargs, dry_run=args.dry)
# # # STAR
# # cmd = f"{STAR_EXEC}"
# # cmd += " --runThreadN 8"
# cmd += " --runMode genomeGenerate"
# cmd += f" --genomeDir {genome_dir}"
# cmd += f" --genomeFastaFiles {output_fasta}"
# cmd += f" --sjdbGTFfile {output_gtf}"
# cmd += " --sjdbOverhang 74"
# job_name = genome_handle + ".make_index.STAR"
# star_script = os.path.join(genome_dir, job_name + ".sh")
# args.compute_kwargs.update({"jobname": job_name, "cores": 8, "mem": 80000})
# submit_job(cmd, star_script, **args.compute_kwargs, dry_run=args.dry)
# # Create sequence dictionaries (for piccard)
# cmd = f"srun --mem 80000 -p develop java -Xmx8g -jar {PICARD_JAR}"
# cmd += " CreateSequenceDictionary"
# cmd += " REFERENCE={}".format(os.path.join(genome_dir, "Homo_sapiens.GRCh38.dna.primary_assembly.spiked.fa"))
# cmd += " OUTPUT={}".format(os.path.join(genome_dir, "Homo_sapiens.GRCh38.dna.primary_assembly.spiked.dict"))
# cmd += " GENOME_ASSEMBLY={}".format(genome_handle)
# cmd += " SPECIES=human"
# os.system(cmd)
# # Create reflat files
# cmd = "srun --mem 80000 -p develop java -Xmx80g -jar ~/Drop-seq_tools-1.12/jar/dropseq.jar ConvertToRefFlat"
# cmd += " SEQUENCE_DICTIONARY={}".format(os.path.join(genome_dir, "Homo_sapiens.GRCh38.dna.primary_assembly.spiked.dict"))
# cmd += " ANNOTATIONS_FILE= {}".format(os.path.join(genome_dir, "Homo_sapiens.GRCh38.77.spiked.gtf"))
# cmd += " OUTPUT={}".format(os.path.join(genome_dir, "Homo_sapiens.GRCh38.dna.primary_assembly.spiked.refFlat"))
# os.system(cmd)
# # Remove vanilla genome
# os.remove(os.path.join(genome_dir, "Homo_sapiens.GRCh38.dna.primary_assembly.fa"))
# os.remove(os.path.join(genome_dir, "Homo_sapiens.GRCh38.77.gtf"))
if __name__ == "__main__":
sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment