Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created January 14, 2020 10:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/a5449450fbbe862da0c21bcde5a7d82c to your computer and use it in GitHub Desktop.
Save afrendeiro/a5449450fbbe862da0c21bcde5a7d82c to your computer and use it in GitHub Desktop.
Build a transcriptome reference with additional chromosomes for CRISPR gRNA constructs
#!/usr/bin/env python
"""
This script generates a transcriptome reference with additional
scaffolds/chromosomes for CRISPR gRNA constructs.
"""
import sys
import os
from argparse import ArgumentParser
import yaml
import pandas as pd
FASTA_HEADER_TEMPLATE = (
">{chrom}_chrom dna:chromosome chromosome:"
"GRCh38:{chrom}_chrom:1:{length}:1 REF"
)
GTF_TEMPLATE = (
"{chrom}_chrom\thavana\tgene\t1\t{length}\t.\t+\t.\t"
"""gene_id "{id}_gene"; gene_name "{id}_gene"; gene_source "ensembl_havana"; """
"""gene_biotype "lincRNA";"""
"""{chrom}_chrom\thavana\ttranscript\t1\t{length}\t.\t+\t.\t"""
"""gene_id "{id}_gene"; transcript_id "{id}_transcript"; """
"""gene_name "{id}_gene"; gene_source "ensembl_havana"; """
"""gene_biotype "lincRNA"; transcript_name "{id}_transcript"; """
"""transcript_source "havana";"""
"""{chrom}_chrom\thavana\texon\t1\t{length}\t.\t+\t.\tgene_id "{id}_gene"; """
"""transcript_id "{id}_transcript"; exon_number "1"; gene_name "{id}_gene"; """
"""gene_source "ensembl_havana"; gene_biotype "lincRNA";"""
"""transcript_name "{id}_transcript"; transcript_source "havana"; exon_id "{id}_exon";\n"""
)
CROPSEQ_VECTOR = {
"u6": "GAGGGCCTATTTCCCATGATTCCTTCATATTTGCATATACGATACAAGGCTGTTAGAGAGATAATTAGAATTAATTTGACTGTAAACACAAAGATATTAGTACAAAATACGTGACGTAGAAAGTAATAATTTCTTGGGTAGTTTGCAGTTTTAAAATTATGTTTTAAAATGGACTATCATATGCTTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG",
"rest": "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTAAGCTTGGCGTAACTAGATCTTGAGACACTGCTTTTTGCTTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGT",
"cas9": "ATGGACAAGAAGTACAGCATCGGCCTGGACATCGGCACCAACTCTGTGGGCTGGGCCGTGATCACCGACGAGTACAAGGTGCCCAGCAAGAAATTCAAGGTGCTGGGCAACACCGACCGGCACAGCATCAAGAAGAACCTGATCGGAGCCCTGCTGTTCGACAGCGGCGAAACAGCCGAGGCCACCCGGCTGAAGAGAACCGCCAGAAGAAGATACACCAGACGGAAGAACCGGATCTGCTATCTGCAAGAGATCTTCAGCAACGAGATGGCCAAGGTGGACGACAGCTTCTTCCACAGACTGGAAGAGTCCTTCCTGGTGGAAGAGGATAAGAAGCACGAGCGGCACCCCATCTTCGGCAACATCGTGGACGAGGTGGCCTACCACGAGAAGTACCCCACCATCTACCACCTGAGAAAGAAACTGGTGGACAGCACCGACAAGGCCGACCTGCGGCTGATCTATCTGGCCCTGGCCCACATGATCAAGTTCCGGGGCCACTTCCTGATCGAGGGCGACCTGAACCCCGACAACAGCGACGTGGACAAGCTGTTCATCCAGCTGGTGCAGACCTACAACCAGCTGTTCGAGGAAAACCCCATCAACGCCAGCGGCGTGGACGCCAAGGCCATCCTGTCTGCCAGACTGAGCAAGAGCAGACGGCTGGAAAATCTGATCGCCCAGCTGCCCGGCGAGAAGAAGAATGGCCTGTTCGGAAACCTGATTGCCCTGAGCCTGGGCCTGACCCCCAACTTCAAGAGCAACTTCGACCTGGCCGAGGATGCCAAACTGCAGCTGAGCAAGGACACCTACGACGACGACCTGGACAACCTGCTGGCCCAGATCGGCGACCAGTACGCCGACCTGTTTCTGGCCGCCAAGAACCTGTCCGACGCCATCCTGCTGAGCGACATCCTGAGAGTGAACACCGAGATCACCAAGGCCCCCCTGAGCGCCTCTATGATCAAGAGATACGACGAGCACCACCAGGACCTGACCCTGCTGAAAGCTCTCGTGCGGCAGCAGCTGCCTGAGAAGTACAAAGAGATTTTCTTCGACCAGAGCAAGAACGGCTACGCCGGCTACATTGACGGCGGAGCCAGCCAGGAAGAGTTCTACAAGTTCATCAAGCCCATCCTGGAAAAGATGGACGGCACCGAGGAACTGCTCGTGAAGCTGAACAGAGAGGACCTGCTGCGGAAGCAGCGGACCTTCGACAACGGCAGCATCCCCCACCAGATCCACCTGGGAGAGCTGCACGCCATTCTGCGGCGGCAGGAAGATTTTTACCCATTCCTGAAGGACAACCGGGAAAAGATCGAGAAGATCCTGACCTTCCGCATCCCCTACTACGTGGGCCCTCTGGCCAGGGGAAACAGCAGATTCGCCTGGATGACCAGAAAGAGCGAGGAAACCATCACCCCCTGGAACTTCGAGGAAGTGGTGGACAAGGGCGCTTCCGCCCAGAGCTTCATCGAGCGGATGACCAACTTCGATAAGAACCTGCCCAACGAGAAGGTGCTGCCCAAGCACAGCCTGCTGTACGAGTACTTCACCGTGTATAACGAGCTGACCAAAGTGAAATACGTGACCGAGGGAATGAGAAAGCCCGCCTTCCTGAGCGGCGAGCAGAAAAAGGCCATCGTGGACCTGCTGTTCAAGACCAACCGGAAAGTGACCGTGAAGCAGCTGAAAGAGGACTACTTCAAGAAAATCGAGTGCTTCGACTCCGTGGAAATCTCCGGCGTGGAAGATCGGTTCAACGCCTCCCTGGGCACATACCACGATCTGCTGAAAATTATCAAGGACAAGGACTTCCTGGACAATGAGGAAAACGAGGACATTCTGGAAGATATCGTGCTGACCCTGACACTGTTTGAGGACAGAGAGATGATCGAGGAACGGCTGAAAACCTATGCCCACCTGTTCGACGACAAAGTGATGAAGCAGCTGAAGCGGCGGAGATACACCGGCTGGGGCAGGCTGAGCCGGAAGCTGATCAACGGCATCCGGGACAAGCAGTCCGGCAAGACAATCCTGGATTTCCTGAAGTCCGACGGCTTCGCCAACAGAAACTTCATGCAGCTGATCCACGACGACAGCCTGACCTTTAAAGAGGACATCCAGAAAGCCCAGGTGTCCGGCCAGGGCGATAGCCTGCACGAGCACATTGCCAATCTGGCCGGCAGCCCCGCCATTAAGAAGGGCATCCTGCAGACAGTGAAGGTGGTGGACGAGCTCGTGAAAGTGATGGGCCGGCACAAGCCCGAGAACATCGTGATCGAAATGGCCAGAGAGAACCAGACCACCCAGAAGGGACAGAAGAACAGCCGCGAGAGAATGAAGCGGATCGAAGAGGGCATCAAAGAGCTGGGCAGCCAGATCCTGAAAGAACACCCCGTGGAAAACACCCAGCTGCAGAACGAGAAGCTGTACCTGTACTACCTGCAGAATGGGCGGGATATGTACGTGGACCAGGAACTGGACATCAACCGGCTGTCCGACTACGATGTGGACCATATCGTGCCTCAGAGCTTTCTGAAGGACGACTCCATCGACAACAAGGTGCTGACCAGAAGCGACAAGAACCGGGGCAAGAGCGACAACGTGCCCTCCGAAGAGGTCGTGAAGAAGATGAAGAACTACTGGCGGCAGCTGCTGAACGCCAAGCTGATTACCCAGAGAAAGTTCGACAATCTGACCAAGGCCGAGAGAGGCGGCCTGAGCGAACTGGATAAGGCCGGCTTCATCAAGAGACAGCTGGTGGAAACCCGGCAGATCACAAAGCACGTGGCACAGATCCTGGACTCCCGGATGAACACTAAGTACGACGAGAATGACAAGCTGATCCGGGAAGTGAAAGTGATCACCCTGAAGTCCAAGCTGGTGTCCGATTTCCGGAAGGATTTCCAGTTTTACAAAGTGCGCGAGATCAACAACTACCACCACGCCCACGACGCCTACCTGAACGCCGTCGTGGGAACCGCCCTGATCAAAAAGTACCCTAAGCTGGAAAGCGAGTTCGTGTACGGCGACTACAAGGTGTACGACGTGCGGAAGATGATCGCCAAGAGCGAGCAGGAAATCGGCAAGGCTACCGCCAAGTACTTCTTCTACAGCAACATCATGAACTTTTTCAAGACCGAGATTACCCTGGCCAACGGCGAGATCCGGAAGCGGCCTCTGATCGAGACAAACGGCGAAACCGGGGAGATCGTGTGGGATAAGGGCCGGGATTTTGCCACCGTGCGGAAAGTGCTGAGCATGCCCCAAGTGAATATCGTGAAAAAGACCGAGGTGCAGACAGGCGGCTTCAGCAAAGAGTCTATCCTGCCCAAGAGGAACAGCGATAAGCTGATCGCCAGAAAGAAGGACTGGGACCCTAAGAAGTACGGCGGCTTCGACAGCCCCACCGTGGCCTATTCTGTGCTGGTGGTGGCCAAAGTGGAAAAGGGCAAGTCCAAGAAACTGAAGAGTGTGAAAGAGCTACTGGGGATCACCATCATGGAAAGAAGCAGCTTCGAGAAGAATCCCATCGACTTTCTGGAAGCCAAGGGCTACAAAGAAGTGAAAAAGGACCTGATCATCAAGCTGCCTAAGTACTCCCTGTTCGAGCTGGAAAACGGCCGGAAGAGAATGCTGGCCTCTGCCGGCGAACTGCAGAAGGGAAACGAACTGGCCCTGCCCTCCAAATATGTGAACTTCCTGTACCTGGCCAGCCACTATGAGAAGCTGAAGGGCTCCCCCGAGGATAATGAGCAGAAACAGCTGTTTGTGGAACAGCACAAGCACTACCTGGACGAGATCATCGAGCAGATCAGCGAGTTCTCCAAGAGAGTGATCCTGGCCGACGCTAATCTGGACAAAGTGCTGTCCGCCTACAACAAGCACCGGGATAAGCCCATCAGAGAGCAGGCCGAGAATATCATCCACCTGTTTACCCTGACCAATCTGGGAGCCCCTGCCGCCTTCAAGTACTTTGACACCACCATCGACCGGAAGAGGTACACCAGCACCAAAGAGGTGCTGGACGCCACCCTGATCCACCAGAGCATCACCGGCCTGTACGAGACACGGATCGACCTGTCTCAGCTGGGAGGCGACA",
"nls": "AGCGACCTGCCGCCACAAAGAAGGCTGGACAGGCTAAGAAGAAGAAAG",
"flag": "ATTACAAAGACGATGACGATAAGG",
"p2a": "GATCCGGCGCAACAAACTTCTCTCTGCTGAAACAAGCCGGAGATGTCGAAGAGAATCCTGGACCGA",
"blast": "TGGCCAAGCCTTTGTCTCAAGAAGAATCCACCCTCATTGAAAGAGCAACGGCTACAATCAACAGCATCCCCATCTCTGAAGACTACAGCGTCGCCAGCGCAGCTCTCTCTAGCGACGGCCGCATCTTCACTGGTGTCAATGTATATCATTTTACTGGGGGACCTTGTGCAGAACTCGTGGTGCTGGGCACTGCTGCTGCTGCGGCAGCTGGCAACCTGACTTGTATCGTCGCGATCGGAAATGAGAACAGGGGCA",
"space": "TCTTGAGCCCCTGCGGACGGTGCCGACAGGTGCTTCTCGATCTGCATCCTGGGATCAAAGCCATAGTGAAGGACAGTGATGGACAGCCGACGGCAGTTGGGATTCGTGAATTGCTGCCCTCTGGTTATGTGTGGGAGGGCTAAGAATTCGATATCAAGCTTATCGGTAATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTCTTAACTATGTTGCTCCTTTTACGCTATGTGGATACGCTGCTTTAATGCCTTTGTATCATGCTATTGCTTCCCGTATGGCTTTCATTTTCTCCTCCTTGTATAAATCCTGGTTGCTGTCTCTTTATGAGGAGTTGTGGCCCGTTGTCAGGCAACGTGGCGTGGTGTGCACTGTGTTTGCTGACGCAACCCCCACTGGTTGGGGCATTGCCACCACCTGTCAGCTCCTTTCCGGGACTTTCGCTTTCCCCCTCCCTATTGCCACGGCGGAACTCATCGCCGCCTGCCTTGCCCGCTGCTGGACAGGGGCTCGGCTGTTGGGCACTGACAATTCCGTGGTGTTGTCGGGGAAATCATCGTCCTTTCCTTGGCTGCTCGCCTGTGTTGCCACCTGGATTCTGCGCGGGACGTCCTTCTGCTACGTCCCTTCGGCCCTCAATCCAGCGGACCTTCCTTCCCGCGGCCTGCTGCCGGCTCTGCGGCCTCTTCCGCGTCTTCGCCTTCGCCCTCAGACGAGTCGGATCTCCCTTTGGGCCGCCTCCCCGCATCGATACCGTCGACCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGATTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCAAGAGAAGGTAGAAGAAGCCAATGAAGGAGAGAACACCCGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTATTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAC",
"virus_ltr": "TGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGG",
}
def parse_args():
parser = ArgumentParser()
parser.add_argument("--genome", dest="genome", default="hg38", choices=["hg38"])
parser.add_argument("--assembly-name", dest="assembly", default="hg38_spiked")
parser.add_argument("-o", "--output-dir", dest="output_dir", default="spiked_genomes")
parser.add_argument("--config", dest="config", default=None)
cols = ['library', 'oligo_name', 'sequence']
help_ = "A CSV file with gRNA information. Required columns are: {}".format(", ".join(cols))
parser.add_argument("--annotation", dest="annotation", help=help_)
parser.add_argument(
"--build-indexes", dest="build_indexes", action="store_true",
help="Whether to generate indexes of the genome reference. Default is not to.")
parser.add_argument(
"--scheduler-command",
dest="scheduler_command",
default="srun",
help="The command used to submit a job to a scheduler. E.g. 'sh', 'srun'. Default is 'srun'")
parser.add_argument(
"--scheduler-parameters",
dest="scheduler_params",
default="--mem 80000 -p develop",
help="A string with parameters for the scheduler. Can be empty. Default is '--mem 80000 -p develop'.")
parser.add_argument(
"--picard-jar",
dest="picard_jar",
default="/cm/shared/apps/picard-tools/1.140/picard.jar",
help="Path to picard .jar. Default is /cm/shared/apps/picard-tools/1.140/picard.jar")
parser.add_argument(
"--dropseq-jar",
dest="dropseq_jar",
default="~/Drop-seq_tools-1.12/jar/dropseq.jar",
help="Path to Dropseq .jar. Default it ~/Drop-seq_tools-1.12/jar/dropseq.jar")
args = parser.parse_args()
print(args)
if args.config is None:
args.config = CROPSEQ_VECTOR
else:
args.config = yaml.safe_load(open(args.config, "r"))['crop-seq']
return args
def main():
args = parse_args()
# read in guide annotation
annotation = pd.read_csv(os.path.join("metadata", "guide_annotation.csv"))
output_dir = os.path.join("spiked_genomes")
for library in annotation["library"].drop_duplicates():
spiked_dir = os.path.join(output_dir, args.assembly) + "_" + library
for _dir in [output_dir, spiked_dir]:
if not os.path.exists(_dir):
os.makedirs(_dir)
output_fasta = os.path.join(spiked_dir, "gRNA_spikes.fa")
output_gtf = os.path.join(spiked_dir, "gRNA_spikes.gtf")
# write gRNA library annotation
write_annotation(
annotation.loc[annotation["library"] == library],
args.config, output_fasta, output_gtf)
if not args.build_indexes:
return
# Make STAR index and supporting files
# Get fasta genome
prefix = "Homo_sapiens.GRCh38"
dir_prefix = os.path.join(spiked_dir, prefix)
os.system(
"wget -O {} ftp://ftp.ensembl.org/pub/release-77/fasta/homo_sapiens/dna/{}.dna.primary_assembly.fa.gz".format(
dir_prefix + ".dna.primary_assembly.fa.gz", prefix
)
)
os.system(
"gzip -d {}".format(dir_prefix + ".dna.primary_assembly.fa.gz")
)
# Get gtf annotation
os.system(
"wget -O {} ftp://ftp.ensembl.org/pub/release-77/gtf/homo_sapiens/{}.77.gtf.gz".format(
dir_prefix + ".77.gtf.gz", prefix
)
)
os.system("gzip -d {}".format(dir_prefix + ".77.gtf.gz"))
# Add extra chromosomes (CROP-seq constructs) to genome
params = [
dir_prefix + ".dna.primary_assembly.fa",
output_fasta,
dir_prefix + ".dna.primary_assembly.spiked.fa",
]
os.system("cat {} {} > {}".format(*params))
params = [
dir_prefix + ".77.gtf",
output_gtf,
dir_prefix + ".77.spiked.gtf",
]
os.system("cat {} {} > {}".format(*params))
# Build STAR index (contruct + spiked with gRNAs)
cmd = "{} {} /cm/shared/apps/star/2.4.2a/STAR".format(
args.scheduler_command, args.scheduler_params
)
cmd += " --runThreadN 8"
cmd += " --runMode genomeGenerate"
cmd += " --genomeDir {}".format(spiked_dir)
cmd += " --genomeFastaFiles {}".format(
dir_prefix + ".dna.primary_assembly.spiked.fa"
)
cmd += " --sjdbGTFfile {}".format(dir_prefix + ".77.spiked.gtf")
cmd += " --sjdbOverhang 74"
os.system(cmd)
# Create sequence dictionaries (for piccard)
cmd = "{} {} java -Xmx8g -jar {}".format(
args.scheduler_command, args.scheduler_params, args.picard_jar
)
cmd += " CreateSequenceDictionary"
cmd += " REFERENCE={}".format(
dir_prefix + ".dna.primary_assembly.spiked.fa"
)
cmd += " OUTPUT={}".format(
dir_prefix + ".dna.primary_assembly.spiked.dict"
)
cmd += " GENOME_ASSEMBLY={}".format(args.assembly)
cmd += " SPECIES=human"
os.system(cmd)
# Create reflat files
cmd = "{} {} java -Xmx80g -jar {} ConvertToRefFlat".format(
args.scheduler_command, args.scheduler_params, args.dropseq_jar
)
cmd += " SEQUENCE_DICTIONARY={}".format(
dir_prefix + ".dna.primary_assembly.spiked.dict"
)
cmd += " ANNOTATIONS_FILE= {}".format(dir_prefix + ".77.spiked.gtf")
cmd += " OUTPUT={}".format(
dir_prefix + ".dna.primary_assembly.spiked.refFlat"
)
os.system(cmd)
# Remove vanilla genome
os.remove(dir_prefix + ".dna.primary_assembly.fa")
os.remove(dir_prefix + ".77.gtf")
def write_annotation(df, config, output_fasta, output_gtf, cas9=True):
# create fasta and gtf entries for each gRNA
fasta_entries = list()
gtf_entries = list()
for i in df.index:
oligo_name = df.loc[i, "oligo_name"]
guide_sequence = df.loc[i, "sequence"]
# add fasta entry
sequence = (
config["u6"]
+ guide_sequence
+ config["rest"]
)
header = FASTA_HEADER_TEMPLATE.format(
chrom=oligo_name, length=len(sequence)
)
fasta_entries.append(header)
fasta_entries.append(sequence)
# add gtf entry
gtf_entries.append(
GTF_TEMPLATE.format(
chrom=oligo_name, id=oligo_name, length=len(sequence)
)
)
if cas9:
# add Cas9 vector
seq_name = "Cas9_blast"
sequence = "".join(
[
config["cas9"],
config["nls"],
config["flag"],
config["p2a"],
config["blast"],
config["space"],
config["virus_ltr"],
]
)
header = FASTA_HEADER_TEMPLATE.format(
chrom=seq_name, length=len(sequence)
)
# add cas9 entry
fasta_entries.append(header)
fasta_entries.append(sequence)
gtf_entries.append(
GTF_TEMPLATE.format(
chrom=seq_name, id=seq_name, length=len(sequence)
)
)
# write to file
with open(output_fasta, "w") as fasta_handle:
fasta_handle.writelines("\n".join(fasta_entries))
with open(output_gtf, "w") as gtf_handle:
gtf_handle.writelines(gtf_entries)
if __name__ == "__main__":
try:
sys.exit(main())
except KeyboardInterrupt:
sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment