Skip to content

Instantly share code, notes, and snippets.

@martinholub
Last active October 10, 2018 11:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.
Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.
Generating Transcriptomic SNV coordinates and annotation
class TranscriptomicSNP(object):
""" Generate a VCF file with transcriptomic coordinates
Gets transcriptomic snp loci in format <transc> <SNPpos> <additional_info...>.
# available:
# <chrom> <transc> <start> <end>
# <chrom> <SNVpos>
"""
def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True):
self.anno_path = anno_path
self.vcf_path = vcf_path
self.out_path = out_path
self.do_add_transcript_version = do_add_transcript_version
@property
def anno_path(self):
"""Path to annotation GTF file"""
return self._anno_path
@anno_path.setter
def anno_path(self, value):
value = value.rstrip("/")
value = os.path.expanduser(value)
assert os.path.isfile(value), "File {} doesn't exist".format(value)
self._anno_path = value
@property
def vcf_path(self):
"""Path to annotation VCF file"""
return self._vcf_path
@vcf_path.setter
def vcf_path(self, value):
value = value.rstrip("/")
value = os.path.expanduser(value)
assert os.path.isfile(value), "File {} doesn't exist".format(value)
self._vcf_path = value
@property
def out_path(self):
return self._out_path
@out_path.setter
def out_path(self, value):
value = rm_ext(value, "gz")
value = value.rstrip("/")
value = os.path.expanduser(value)
self._out_path = value
def get_transcript_snvs(self):
"""Generate a VCF file with transcriptomic coordinates
Refernces:
https://docs.python.org/3/library/subprocess.html
https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed
http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
"""
# Extract transcripts from GTF file, sort
_, transcripts_gtf = tempfile.mkstemp()
command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf)
_ = subprocess.run(command, check = True, shell = True)
# Get only SNVs from VCF, to save some memory/time
_, snv_vcf = tempfile.mkstemp()
command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf)
subprocess.run(command, check = True, shell = True)
# Get such transcripts, that have an overlap with a known SNV
# Exploits fact that both coordinates are chromosome-based
# writes information from both files next to each other
_, transcripts_vcf = tempfile.mkstemp()
command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}"
command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf)
subprocess.run(command, check = True, shell = True)
# Pull out name of transcripts as obtained in previous step
_, transcripts = tempfile.mkstemp()
if self.do_add_transcript_version:
command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; "
command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}"
else:
command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}"
command = command.format(transcripts_vcf, transcripts)
subprocess.run(command, check = True, shell = True)
# Combine transcript name with the SNV information
# Converts the coordinates to transcriptomic ones
# `$11-$4+1` computes coordinate of SNV along transcript,where
# $4 start of transcript and $11 postion of SNV both in chromosme-based coords.
# transcripts file carries information on transcript
# columns $12..$17 carry information on the SNV itself
command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}"
command = command.format(transcripts_vcf, transcripts, self.out_path)
subprocess.run(command, shell = True, check = True)
# Zip and index
command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path)
subprocess.run(command, shell = True, check = True)
# Cleanup
for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]:
os.remove(f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment