Last active
October 10, 2018 11:18
-
-
Save martinholub/2dbb4cfcb28d12aed0e0302afc58cb57 to your computer and use it in GitHub Desktop.
Generating Transcriptomic SNV coordinates and annotation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TranscriptomicSNP(object): | |
""" Generate a VCF file with transcriptomic coordinates | |
Gets transcriptomic snp loci in format <transc> <SNPpos> <additional_info...>. | |
# available: | |
# <chrom> <transc> <start> <end> | |
# <chrom> <SNVpos> | |
""" | |
def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version = True): | |
self.anno_path = anno_path | |
self.vcf_path = vcf_path | |
self.out_path = out_path | |
self.do_add_transcript_version = do_add_transcript_version | |
@property | |
def anno_path(self): | |
"""Path to annotation GTF file""" | |
return self._anno_path | |
@anno_path.setter | |
def anno_path(self, value): | |
value = value.rstrip("/") | |
value = os.path.expanduser(value) | |
assert os.path.isfile(value), "File {} doesn't exist".format(value) | |
self._anno_path = value | |
@property | |
def vcf_path(self): | |
"""Path to annotation VCF file""" | |
return self._vcf_path | |
@vcf_path.setter | |
def vcf_path(self, value): | |
value = value.rstrip("/") | |
value = os.path.expanduser(value) | |
assert os.path.isfile(value), "File {} doesn't exist".format(value) | |
self._vcf_path = value | |
@property | |
def out_path(self): | |
return self._out_path | |
@out_path.setter | |
def out_path(self, value): | |
value = rm_ext(value, "gz") | |
value = value.rstrip("/") | |
value = os.path.expanduser(value) | |
self._out_path = value | |
def get_transcript_snvs(self): | |
"""Generate a VCF file with transcriptomic coordinates | |
Refernces: | |
https://docs.python.org/3/library/subprocess.html | |
https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed | |
http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html | |
""" | |
# Extract transcripts from GTF file, sort | |
_, transcripts_gtf = tempfile.mkstemp() | |
command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'.format(self.anno_path,transcripts_gtf) | |
_ = subprocess.run(command, check = True, shell = True) | |
# Get only SNVs from VCF, to save some memory/time | |
_, snv_vcf = tempfile.mkstemp() | |
command = 'grep -P "(^#|TSA=SNV)" {} > {}'.format(self.vcf_path, snv_vcf) | |
subprocess.run(command, check = True, shell = True) | |
# Get such transcripts, that have an overlap with a known SNV | |
# Exploits fact that both coordinates are chromosome-based | |
# writes information from both files next to each other | |
_, transcripts_vcf = tempfile.mkstemp() | |
command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}" | |
command = command.format(transcripts_gtf, snv_vcf, transcripts_vcf) | |
subprocess.run(command, check = True, shell = True) | |
# Pull out name of transcripts as obtained in previous step | |
_, transcripts = tempfile.mkstemp() | |
if self.do_add_transcript_version: | |
command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\"; " | |
command += "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}" | |
else: | |
command = "cut -f9 {} | sed -E 's/.*\stranscript_id \"([^;]+)\".*/\\1/' > {}" | |
command = command.format(transcripts_vcf, transcripts) | |
subprocess.run(command, check = True, shell = True) | |
# Combine transcript name with the SNV information | |
# Converts the coordinates to transcriptomic ones | |
# `$11-$4+1` computes coordinate of SNV along transcript,where | |
# $4 start of transcript and $11 postion of SNV both in chromosme-based coords. | |
# transcripts file carries information on transcript | |
# columns $12..$17 carry information on the SNV itself | |
command = "awk -F\"\t\" -v OFS=\"\t\" '{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} | paste {} - > {}" | |
command = command.format(transcripts_vcf, transcripts, self.out_path) | |
subprocess.run(command, shell = True, check = True) | |
# Zip and index | |
command = "bgzip --force {0} && tabix -p vcf {0}.gz".format(self.out_path) | |
subprocess.run(command, shell = True, check = True) | |
# Cleanup | |
for f in [transcripts_gtf, snv_vcf, transcripts, transcripts_vcf]: | |
os.remove(f) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment