Created
January 14, 2020 10:59
-
-
Save afrendeiro/a5449450fbbe862da0c21bcde5a7d82c to your computer and use it in GitHub Desktop.
Build a transcriptome reference with additional chromosomes for CRISPR gRNA constructs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This script generates a transcriptome reference with additional | |
scaffolds/chromosomes for CRISPR gRNA constructs. | |
""" | |
import sys | |
import os | |
from argparse import ArgumentParser | |
import yaml | |
import pandas as pd | |
FASTA_HEADER_TEMPLATE = ( | |
">{chrom}_chrom dna:chromosome chromosome:" | |
"GRCh38:{chrom}_chrom:1:{length}:1 REF" | |
) | |
GTF_TEMPLATE = ( | |
"{chrom}_chrom\thavana\tgene\t1\t{length}\t.\t+\t.\t" | |
"""gene_id "{id}_gene"; gene_name "{id}_gene"; gene_source "ensembl_havana"; """ | |
"""gene_biotype "lincRNA";""" | |
"""{chrom}_chrom\thavana\ttranscript\t1\t{length}\t.\t+\t.\t""" | |
"""gene_id "{id}_gene"; transcript_id "{id}_transcript"; """ | |
"""gene_name "{id}_gene"; gene_source "ensembl_havana"; """ | |
"""gene_biotype "lincRNA"; transcript_name "{id}_transcript"; """ | |
"""transcript_source "havana";""" | |
"""{chrom}_chrom\thavana\texon\t1\t{length}\t.\t+\t.\tgene_id "{id}_gene"; """ | |
"""transcript_id "{id}_transcript"; exon_number "1"; gene_name "{id}_gene"; """ | |
"""gene_source "ensembl_havana"; gene_biotype "lincRNA";""" | |
"""transcript_name "{id}_transcript"; transcript_source "havana"; exon_id "{id}_exon";\n""" | |
) | |
CROPSEQ_VECTOR = { | |
"u6": "GAGGGCCTATTTCCCATGATTCCTTCATATTTGCATATACGATACAAGGCTGTTAGAGAGATAATTAGAATTAATTTGACTGTAAACACAAAGATATTAGTACAAAATACGTGACGTAGAAAGTAATAATTTCTTGGGTAGTTTGCAGTTTTAAAATTATGTTTTAAAATGGACTATCATATGCTTACCGTAACTTGAAAGTATTTCGATTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCG", | |
"rest": "GTTTTAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGTTATCAACTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTAAGCTTGGCGTAACTAGATCTTGAGACACTGCTTTTTGCTTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGT", | |
"cas9": "ATGGACAAGAAGTACAGCATCGGCCTGGACATCGGCACCAACTCTGTGGGCTGGGCCGTGATCACCGACGAGTACAAGGTGCCCAGCAAGAAATTCAAGGTGCTGGGCAACACCGACCGGCACAGCATCAAGAAGAACCTGATCGGAGCCCTGCTGTTCGACAGCGGCGAAACAGCCGAGGCCACCCGGCTGAAGAGAACCGCCAGAAGAAGATACACCAGACGGAAGAACCGGATCTGCTATCTGCAAGAGATCTTCAGCAACGAGATGGCCAAGGTGGACGACAGCTTCTTCCACAGACTGGAAGAGTCCTTCCTGGTGGAAGAGGATAAGAAGCACGAGCGGCACCCCATCTTCGGCAACATCGTGGACGAGGTGGCCTACCACGAGAAGTACCCCACCATCTACCACCTGAGAAAGAAACTGGTGGACAGCACCGACAAGGCCGACCTGCGGCTGATCTATCTGGCCCTGGCCCACATGATCAAGTTCCGGGGCCACTTCCTGATCGAGGGCGACCTGAACCCCGACAACAGCGACGTGGACAAGCTGTTCATCCAGCTGGTGCAGACCTACAACCAGCTGTTCGAGGAAAACCCCATCAACGCCAGCGGCGTGGACGCCAAGGCCATCCTGTCTGCCAGACTGAGCAAGAGCAGACGGCTGGAAAATCTGATCGCCCAGCTGCCCGGCGAGAAGAAGAATGGCCTGTTCGGAAACCTGATTGCCCTGAGCCTGGGCCTGACCCCCAACTTCAAGAGCAACTTCGACCTGGCCGAGGATGCCAAACTGCAGCTGAGCAAGGACACCTACGACGACGACCTGGACAACCTGCTGGCCCAGATCGGCGACCAGTACGCCGACCTGTTTCTGGCCGCCAAGAACCTGTCCGACGCCATCCTGCTGAGCGACATCCTGAGAGTGAACACCGAGATCACCAAGGCCCCCCTGAGCGCCTCTATGATCAAGAGATACGACGAGCACCACCAGGACCTGACCCTGCTGAAAGCTCTCGTGCGGCAGCAGCTGCCTGAGAAGTACAAAGAGATTTTCTTCGACCAGAGCAAGAACGGCTACGCCGGCTACATTGACGGCGGAGCCAGCCAGGAAGAGTTCTACAAGTTCATCAAGCCCATCCTGGAAAAGATGGACGGCACCGAGGAACTGCTCGTGAAGCTGAACAGAGAGGACCTGCTGCGGAAGCAGCGGACCTTCGACAACGGCAGCATCCCCCACCAGATCCACCTGGGAGAGCTGCACGCCATTCTGCGGCGGCAGGAAGATTTTTACCCATTCCTGAAGGACAACCGGGAAAAGATCGAGAAGATCCTGACCTTCCGCATCCCCTACTACGTGGGCCCTCTGGCCAGGGGAAACAGCAGATTCGCCTGGATGACCAGAAAGAGCGAGGAAACCATCACCCCCTGGAACTTCGAGGAAGTGGTGGACAAGGGCGCTTCCGCCCAGAGCTTCATCGAGCGGATGACCAACTTCGATAAGAACCTGCCCAACGAGAAGGTGCTGCCCAAGCACAGCCTGCTGTACGAGTACTTCACCGTGTATAACGAGCTGACCAAAGTGAAATACGTGACCGAGGGAATGAGAAAGCCCGCCTTCCTGAGCGGCGAGCAGAAAAAGGCCATCGTGGACCTGCTGTTCAAGACCAACCGGAAAGTGACCGTGAAGCAGCTGAAAGAGGACTACTTCAAGAAAATCGAGTGCTTCGACTCCGTGGAAATCTCCGGCGTGGAAGATCGGTTCAACGCCTCCCTGGGCACATACCACGATCTGCTGAAAATTATCAAGGACAAGGACTTCCTGGACAATGAGGAAAACGAGGACATTCTGGAAGATATCGTGCTGACCCTGACACTGTTTGAGGACAGAGAGATGATCGAGGAACGGCTGAAAACCTATGCCCACCTGTTCGACGACAAAGTGATGAAGCAGCTGAAGCGGCGGAGATACACCGGCTGGGGCAGGCTGAGCCGGAAGCTGATCAACGGCATCCGGGACAAGCAGTCCGGCAAGACAATCCTGGATTTCCTGAAGTCCGACGGCTTCGCCAACAGAAACTTCATGCAGCTGATCCACGACGACAGCCTGACCTTTAAAGAGGACATCCAGAAAGCCCAGGTGTCCGGCCAGGGCGATAGCCTGCACGAGCACATTGCCAATCTGGCCGGCAGCCCCGCCATTAAGAAGGGCATCCTGCAGACAGTGAAGGTGGTGGACGAGCTCGTGAAAGTGATGGGCCGGCACAAGCCCGAGAACATCGTGATCGAAATGGCCAGAGAGAACCAGACCACCCAGAAGGGACAGAAGAACAGCCGCGAGAGAATGAAGCGGATCGAAGAGGGCATCAAAGAGCTGGGCAGCCAGATCCTGAAAGAACACCCCGTGGAAAACACCCAGCTGCAGAACGAGAAGCTGTACCTGTACTACCTGCAGAATGGGCGGGATATGTACGTGGACCAGGAACTGGACATCAACCGGCTGTCCGACTACGATGTGGACCATATCGTGCCTCAGAGCTTTCTGAAGGACGACTCCATCGACAACAAGGTGCTGACCAGAAGCGACAAGAACCGGGGCAAGAGCGACAACGTGCCCTCCGAAGAGGTCGTGAAGAAGATGAAGAACTACTGGCGGCAGCTGCTGAACGCCAAGCTGATTACCCAGAGAAAGTTCGACAATCTGACCAAGGCCGAGAGAGGCGGCCTGAGCGAACTGGATAAGGCCGGCTTCATCAAGAGACAGCTGGTGGAAACCCGGCAGATCACAAAGCACGTGGCACAGATCCTGGACTCCCGGATGAACACTAAGTACGACGAGAATGACAAGCTGATCCGGGAAGTGAAAGTGATCACCCTGAAGTCCAAGCTGGTGTCCGATTTCCGGAAGGATTTCCAGTTTTACAAAGTGCGCGAGATCAACAACTACCACCACGCCCACGACGCCTACCTGAACGCCGTCGTGGGAACCGCCCTGATCAAAAAGTACCCTAAGCTGGAAAGCGAGTTCGTGTACGGCGACTACAAGGTGTACGACGTGCGGAAGATGATCGCCAAGAGCGAGCAGGAAATCGGCAAGGCTACCGCCAAGTACTTCTTCTACAGCAACATCATGAACTTTTTCAAGACCGAGATTACCCTGGCCAACGGCGAGATCCGGAAGCGGCCTCTGATCGAGACAAACGGCGAAACCGGGGAGATCGTGTGGGATAAGGGCCGGGATTTTGCCACCGTGCGGAAAGTGCTGAGCATGCCCCAAGTGAATATCGTGAAAAAGACCGAGGTGCAGACAGGCGGCTTCAGCAAAGAGTCTATCCTGCCCAAGAGGAACAGCGATAAGCTGATCGCCAGAAAGAAGGACTGGGACCCTAAGAAGTACGGCGGCTTCGACAGCCCCACCGTGGCCTATTCTGTGCTGGTGGTGGCCAAAGTGGAAAAGGGCAAGTCCAAGAAACTGAAGAGTGTGAAAGAGCTACTGGGGATCACCATCATGGAAAGAAGCAGCTTCGAGAAGAATCCCATCGACTTTCTGGAAGCCAAGGGCTACAAAGAAGTGAAAAAGGACCTGATCATCAAGCTGCCTAAGTACTCCCTGTTCGAGCTGGAAAACGGCCGGAAGAGAATGCTGGCCTCTGCCGGCGAACTGCAGAAGGGAAACGAACTGGCCCTGCCCTCCAAATATGTGAACTTCCTGTACCTGGCCAGCCACTATGAGAAGCTGAAGGGCTCCCCCGAGGATAATGAGCAGAAACAGCTGTTTGTGGAACAGCACAAGCACTACCTGGACGAGATCATCGAGCAGATCAGCGAGTTCTCCAAGAGAGTGATCCTGGCCGACGCTAATCTGGACAAAGTGCTGTCCGCCTACAACAAGCACCGGGATAAGCCCATCAGAGAGCAGGCCGAGAATATCATCCACCTGTTTACCCTGACCAATCTGGGAGCCCCTGCCGCCTTCAAGTACTTTGACACCACCATCGACCGGAAGAGGTACACCAGCACCAAAGAGGTGCTGGACGCCACCCTGATCCACCAGAGCATCACCGGCCTGTACGAGACACGGATCGACCTGTCTCAGCTGGGAGGCGACA", | |
"nls": "AGCGACCTGCCGCCACAAAGAAGGCTGGACAGGCTAAGAAGAAGAAAG", | |
"flag": "ATTACAAAGACGATGACGATAAGG", | |
"p2a": "GATCCGGCGCAACAAACTTCTCTCTGCTGAAACAAGCCGGAGATGTCGAAGAGAATCCTGGACCGA", | |
"blast": "TGGCCAAGCCTTTGTCTCAAGAAGAATCCACCCTCATTGAAAGAGCAACGGCTACAATCAACAGCATCCCCATCTCTGAAGACTACAGCGTCGCCAGCGCAGCTCTCTCTAGCGACGGCCGCATCTTCACTGGTGTCAATGTATATCATTTTACTGGGGGACCTTGTGCAGAACTCGTGGTGCTGGGCACTGCTGCTGCTGCGGCAGCTGGCAACCTGACTTGTATCGTCGCGATCGGAAATGAGAACAGGGGCA", | |
"space": "TCTTGAGCCCCTGCGGACGGTGCCGACAGGTGCTTCTCGATCTGCATCCTGGGATCAAAGCCATAGTGAAGGACAGTGATGGACAGCCGACGGCAGTTGGGATTCGTGAATTGCTGCCCTCTGGTTATGTGTGGGAGGGCTAAGAATTCGATATCAAGCTTATCGGTAATCAACCTCTGGATTACAAAATTTGTGAAAGATTGACTGGTATTCTTAACTATGTTGCTCCTTTTACGCTATGTGGATACGCTGCTTTAATGCCTTTGTATCATGCTATTGCTTCCCGTATGGCTTTCATTTTCTCCTCCTTGTATAAATCCTGGTTGCTGTCTCTTTATGAGGAGTTGTGGCCCGTTGTCAGGCAACGTGGCGTGGTGTGCACTGTGTTTGCTGACGCAACCCCCACTGGTTGGGGCATTGCCACCACCTGTCAGCTCCTTTCCGGGACTTTCGCTTTCCCCCTCCCTATTGCCACGGCGGAACTCATCGCCGCCTGCCTTGCCCGCTGCTGGACAGGGGCTCGGCTGTTGGGCACTGACAATTCCGTGGTGTTGTCGGGGAAATCATCGTCCTTTCCTTGGCTGCTCGCCTGTGTTGCCACCTGGATTCTGCGCGGGACGTCCTTCTGCTACGTCCCTTCGGCCCTCAATCCAGCGGACCTTCCTTCCCGCGGCCTGCTGCCGGCTCTGCGGCCTCTTCCGCGTCTTCGCCTTCGCCCTCAGACGAGTCGGATCTCCCTTTGGGCCGCCTCCCCGCATCGATACCGTCGACCTCGAGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGATTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCAAGAGAAGGTAGAAGAAGCCAATGAAGGAGAGAACACCCGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTATTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAC", | |
"virus_ltr": "TGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGG", | |
} | |
def parse_args(): | |
parser = ArgumentParser() | |
parser.add_argument("--genome", dest="genome", default="hg38", choices=["hg38"]) | |
parser.add_argument("--assembly-name", dest="assembly", default="hg38_spiked") | |
parser.add_argument("-o", "--output-dir", dest="output_dir", default="spiked_genomes") | |
parser.add_argument("--config", dest="config", default=None) | |
cols = ['library', 'oligo_name', 'sequence'] | |
help_ = "A CSV file with gRNA information. Required columns are: {}".format(", ".join(cols)) | |
parser.add_argument("--annotation", dest="annotation", help=help_) | |
parser.add_argument( | |
"--build-indexes", dest="build_indexes", action="store_true", | |
help="Whether to generate indexes of the genome reference. Default is not to.") | |
parser.add_argument( | |
"--scheduler-command", | |
dest="scheduler_command", | |
default="srun", | |
help="The command used to submit a job to a scheduler. E.g. 'sh', 'srun'. Default is 'srun'") | |
parser.add_argument( | |
"--scheduler-parameters", | |
dest="scheduler_params", | |
default="--mem 80000 -p develop", | |
help="A string with parameters for the scheduler. Can be empty. Default is '--mem 80000 -p develop'.") | |
parser.add_argument( | |
"--picard-jar", | |
dest="picard_jar", | |
default="/cm/shared/apps/picard-tools/1.140/picard.jar", | |
help="Path to picard .jar. Default is /cm/shared/apps/picard-tools/1.140/picard.jar") | |
parser.add_argument( | |
"--dropseq-jar", | |
dest="dropseq_jar", | |
default="~/Drop-seq_tools-1.12/jar/dropseq.jar", | |
help="Path to Dropseq .jar. Default it ~/Drop-seq_tools-1.12/jar/dropseq.jar") | |
args = parser.parse_args() | |
print(args) | |
if args.config is None: | |
args.config = CROPSEQ_VECTOR | |
else: | |
args.config = yaml.safe_load(open(args.config, "r"))['crop-seq'] | |
return args | |
def main(): | |
args = parse_args() | |
# read in guide annotation | |
annotation = pd.read_csv(os.path.join("metadata", "guide_annotation.csv")) | |
output_dir = os.path.join("spiked_genomes") | |
for library in annotation["library"].drop_duplicates(): | |
spiked_dir = os.path.join(output_dir, args.assembly) + "_" + library | |
for _dir in [output_dir, spiked_dir]: | |
if not os.path.exists(_dir): | |
os.makedirs(_dir) | |
output_fasta = os.path.join(spiked_dir, "gRNA_spikes.fa") | |
output_gtf = os.path.join(spiked_dir, "gRNA_spikes.gtf") | |
# write gRNA library annotation | |
write_annotation( | |
annotation.loc[annotation["library"] == library], | |
args.config, output_fasta, output_gtf) | |
if not args.build_indexes: | |
return | |
# Make STAR index and supporting files | |
# Get fasta genome | |
prefix = "Homo_sapiens.GRCh38" | |
dir_prefix = os.path.join(spiked_dir, prefix) | |
os.system( | |
"wget -O {} ftp://ftp.ensembl.org/pub/release-77/fasta/homo_sapiens/dna/{}.dna.primary_assembly.fa.gz".format( | |
dir_prefix + ".dna.primary_assembly.fa.gz", prefix | |
) | |
) | |
os.system( | |
"gzip -d {}".format(dir_prefix + ".dna.primary_assembly.fa.gz") | |
) | |
# Get gtf annotation | |
os.system( | |
"wget -O {} ftp://ftp.ensembl.org/pub/release-77/gtf/homo_sapiens/{}.77.gtf.gz".format( | |
dir_prefix + ".77.gtf.gz", prefix | |
) | |
) | |
os.system("gzip -d {}".format(dir_prefix + ".77.gtf.gz")) | |
# Add extra chromosomes (CROP-seq constructs) to genome | |
params = [ | |
dir_prefix + ".dna.primary_assembly.fa", | |
output_fasta, | |
dir_prefix + ".dna.primary_assembly.spiked.fa", | |
] | |
os.system("cat {} {} > {}".format(*params)) | |
params = [ | |
dir_prefix + ".77.gtf", | |
output_gtf, | |
dir_prefix + ".77.spiked.gtf", | |
] | |
os.system("cat {} {} > {}".format(*params)) | |
# Build STAR index (contruct + spiked with gRNAs) | |
cmd = "{} {} /cm/shared/apps/star/2.4.2a/STAR".format( | |
args.scheduler_command, args.scheduler_params | |
) | |
cmd += " --runThreadN 8" | |
cmd += " --runMode genomeGenerate" | |
cmd += " --genomeDir {}".format(spiked_dir) | |
cmd += " --genomeFastaFiles {}".format( | |
dir_prefix + ".dna.primary_assembly.spiked.fa" | |
) | |
cmd += " --sjdbGTFfile {}".format(dir_prefix + ".77.spiked.gtf") | |
cmd += " --sjdbOverhang 74" | |
os.system(cmd) | |
# Create sequence dictionaries (for piccard) | |
cmd = "{} {} java -Xmx8g -jar {}".format( | |
args.scheduler_command, args.scheduler_params, args.picard_jar | |
) | |
cmd += " CreateSequenceDictionary" | |
cmd += " REFERENCE={}".format( | |
dir_prefix + ".dna.primary_assembly.spiked.fa" | |
) | |
cmd += " OUTPUT={}".format( | |
dir_prefix + ".dna.primary_assembly.spiked.dict" | |
) | |
cmd += " GENOME_ASSEMBLY={}".format(args.assembly) | |
cmd += " SPECIES=human" | |
os.system(cmd) | |
# Create reflat files | |
cmd = "{} {} java -Xmx80g -jar {} ConvertToRefFlat".format( | |
args.scheduler_command, args.scheduler_params, args.dropseq_jar | |
) | |
cmd += " SEQUENCE_DICTIONARY={}".format( | |
dir_prefix + ".dna.primary_assembly.spiked.dict" | |
) | |
cmd += " ANNOTATIONS_FILE= {}".format(dir_prefix + ".77.spiked.gtf") | |
cmd += " OUTPUT={}".format( | |
dir_prefix + ".dna.primary_assembly.spiked.refFlat" | |
) | |
os.system(cmd) | |
# Remove vanilla genome | |
os.remove(dir_prefix + ".dna.primary_assembly.fa") | |
os.remove(dir_prefix + ".77.gtf") | |
def write_annotation(df, config, output_fasta, output_gtf, cas9=True): | |
# create fasta and gtf entries for each gRNA | |
fasta_entries = list() | |
gtf_entries = list() | |
for i in df.index: | |
oligo_name = df.loc[i, "oligo_name"] | |
guide_sequence = df.loc[i, "sequence"] | |
# add fasta entry | |
sequence = ( | |
config["u6"] | |
+ guide_sequence | |
+ config["rest"] | |
) | |
header = FASTA_HEADER_TEMPLATE.format( | |
chrom=oligo_name, length=len(sequence) | |
) | |
fasta_entries.append(header) | |
fasta_entries.append(sequence) | |
# add gtf entry | |
gtf_entries.append( | |
GTF_TEMPLATE.format( | |
chrom=oligo_name, id=oligo_name, length=len(sequence) | |
) | |
) | |
if cas9: | |
# add Cas9 vector | |
seq_name = "Cas9_blast" | |
sequence = "".join( | |
[ | |
config["cas9"], | |
config["nls"], | |
config["flag"], | |
config["p2a"], | |
config["blast"], | |
config["space"], | |
config["virus_ltr"], | |
] | |
) | |
header = FASTA_HEADER_TEMPLATE.format( | |
chrom=seq_name, length=len(sequence) | |
) | |
# add cas9 entry | |
fasta_entries.append(header) | |
fasta_entries.append(sequence) | |
gtf_entries.append( | |
GTF_TEMPLATE.format( | |
chrom=seq_name, id=seq_name, length=len(sequence) | |
) | |
) | |
# write to file | |
with open(output_fasta, "w") as fasta_handle: | |
fasta_handle.writelines("\n".join(fasta_entries)) | |
with open(output_gtf, "w") as gtf_handle: | |
gtf_handle.writelines(gtf_entries) | |
if __name__ == "__main__": | |
try: | |
sys.exit(main()) | |
except KeyboardInterrupt: | |
sys.exit(1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment