# martinholub/transcript_snp.py

## transcript_snp.py
class TranscriptomicSNP(object):
    """Generate a VCF file with transcriptomic coordinates.

    Gets transcriptomic SNP loci in format
    <transc> <SNPpos> <additional_info...>.

    # available:
    # <chrom> <transc> <start> <end>
    # <chrom> <SNVpos>

    Args:
        anno_path: Path to an existing annotation GTF file.
        vcf_path: Path to an existing VCF file.
        out_path: Path of the output file. It is passed through
            ``rm_ext(value, "gz")`` (presumably dropping a ``.gz`` suffix --
            confirm against the helper); ``get_transcript_snvs`` compresses
            the result, which ends up at ``<out_path>.gz``.
        do_add_transcript_version: When true, transcripts are named
            ``<transcript_id>.<transcript_version>``; otherwise only
            ``<transcript_id>`` is used.

    Raises:
        AssertionError: If ``anno_path`` or ``vcf_path`` is not an existing
            file.
    """

    def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version=True):
        self.anno_path = anno_path
        self.vcf_path = vcf_path
        self.out_path = out_path
        self.do_add_transcript_version = do_add_transcript_version

    @staticmethod
    def _normalize_path(value):
        """Drop trailing slashes from ``value`` and expand a leading ``~``."""
        return os.path.expanduser(value.rstrip("/"))

    @classmethod
    def _existing_file(cls, value):
        """Return the normalized ``value``; fail if it is not a file."""
        value = cls._normalize_path(value)
        if not os.path.isfile(value):
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under `python -O`; the exception type is kept for
            # callers that already catch AssertionError.
            raise AssertionError("File {} doesn't exist".format(value))
        return value

    @property
    def anno_path(self):
        """Path to annotation GTF file"""
        return self._anno_path

    @anno_path.setter
    def anno_path(self, value):
        self._anno_path = self._existing_file(value)

    @property
    def vcf_path(self):
        """Path to annotation VCF file"""
        return self._vcf_path

    @vcf_path.setter
    def vcf_path(self, value):
        self._vcf_path = self._existing_file(value)

    @property
    def out_path(self):
        """Path of the (uncompressed) output file"""
        return self._out_path

    @out_path.setter
    def out_path(self, value):
        # The "gz" extension is handled first, then the path is normalized;
        # the file itself does not have to exist yet.
        self._out_path = self._normalize_path(rm_ext(value, "gz"))

    def get_transcript_snvs(self):
        """Generate a VCF file with transcriptomic coordinates.

        Runs a sequence of shell pipelines:

        1. keep the ``transcript`` records of the GTF, sorted by chromosome
           and start;
        2. keep the header lines and ``TSA=SNV`` records of the VCF;
        3. ``bedtools intersect`` the two, writing the GTF record and the
           overlapping VCF record next to each other;
        4. pull the transcript name out of the GTF attribute column;
        5. write ``<transcript> <position> <VCF columns 3-8>`` to
           ``out_path``;
        6. compress with ``bgzip`` and index with ``tabix``.

        Temporary files are removed even when one of the steps fails.

        Raises:
            subprocess.CalledProcessError: If a shell command exits with a
                non-zero status.

        References:
            https://docs.python.org/3/library/subprocess.html
            https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed
            http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
        """
        # Local import: keeps this block self-contained without touching the
        # module-level imports.
        import shlex

        # Every path is quoted before it is placed in a shell command, so
        # paths with spaces or shell metacharacters are passed verbatim.
        quote = shlex.quote
        temp_paths = []

        def new_temp_path():
            # mkstemp() returns an open descriptor; close it right away, the
            # shell commands only need the path.
            handle, path = tempfile.mkstemp()
            os.close(handle)
            temp_paths.append(path)
            return path

        def run(command):
            subprocess.run(command, check=True, shell=True)

        try:
            # Extract transcripts from GTF file, sort
            transcripts_gtf = new_temp_path()
            command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'
            run(command.format(quote(self.anno_path), quote(transcripts_gtf)))

            # Get only SNVs from VCF, to save some memory/time
            snv_vcf = new_temp_path()
            command = 'grep -P "(^#|TSA=SNV)" {} > {}'
            run(command.format(quote(self.vcf_path), quote(snv_vcf)))

            # Get such transcripts that have an overlap with a known SNV.
            # Exploits the fact that both coordinates are chromosome-based;
            # writes information from both files next to each other.
            transcripts_vcf = new_temp_path()
            command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}"
            run(command.format(
                quote(transcripts_gtf), quote(snv_vcf), quote(transcripts_vcf)))

            # Pull out name of transcripts as obtained in previous step
            transcripts = new_temp_path()
            if self.do_add_transcript_version:
                command = (
                    "cut -f9 {} | sed -E 's/.*\\stranscript_id \"([^;]+)\"; "
                    "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}"
                )
            else:
                command = (
                    "cut -f9 {} | sed -E "
                    "'s/.*\\stranscript_id \"([^;]+)\".*/\\1/' > {}"
                )
            run(command.format(quote(transcripts_vcf), quote(transcripts)))

            # Combine transcript name with the SNV information and convert
            # the coordinates to transcriptomic ones.
            #
            # `$11-$4+1` computes the coordinate of the SNV along the
            # transcript, where $4 is the start of the transcript and $11 the
            # position of the SNV, both in chromosome-based coordinates.
            # The transcripts file carries the transcript name; columns
            # $12..$17 carry information on the SNV itself.
            # NOTE(review): the offset is taken from the genomic transcript
            # start; strand and introns are not considered here -- confirm
            # that this is intended.
            command = (
                "awk -F\"\t\" -v OFS=\"\t\" "
                "'{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} "
                "| paste {} - > {}"
            )
            run(command.format(
                quote(transcripts_vcf), quote(transcripts),
                quote(self.out_path)))

            # Zip and index
            command = "bgzip --force {0} && tabix -p vcf {0}.gz"
            run(command.format(quote(self.out_path)))
        finally:
            # Cleanup, also after a failed step
            for path in temp_paths:
                if os.path.exists(path):
                    os.remove(path)
	class TranscriptomicSNP(object):
	    """Generate a VCF file with transcriptomic coordinates.

	    Gets transcriptomic SNP loci in format
	    <transc> <SNPpos> <additional_info...>.

	    # available:
	    # <chrom> <transc> <start> <end>
	    # <chrom> <SNVpos>

	    Args:
	        anno_path: Path to an existing annotation GTF file.
	        vcf_path: Path to an existing VCF file.
	        out_path: Path of the output file. It is passed through
	            ``rm_ext(value, "gz")`` (presumably dropping a ``.gz`` suffix --
	            confirm against the helper); ``get_transcript_snvs`` compresses
	            the result, which ends up at ``<out_path>.gz``.
	        do_add_transcript_version: When true, transcripts are named
	            ``<transcript_id>.<transcript_version>``; otherwise only
	            ``<transcript_id>`` is used.

	    Raises:
	        AssertionError: If ``anno_path`` or ``vcf_path`` is not an existing
	            file.
	    """

	    def __init__(self, anno_path, vcf_path, out_path, do_add_transcript_version=True):
	        self.anno_path = anno_path
	        self.vcf_path = vcf_path
	        self.out_path = out_path
	        self.do_add_transcript_version = do_add_transcript_version

	    @staticmethod
	    def _normalize_path(value):
	        """Drop trailing slashes from ``value`` and expand a leading ``~``."""
	        return os.path.expanduser(value.rstrip("/"))

	    @classmethod
	    def _existing_file(cls, value):
	        """Return the normalized ``value``; fail if it is not a file."""
	        value = cls._normalize_path(value)
	        if not os.path.isfile(value):
	            # Raised explicitly (rather than via `assert`) so the check is
	            # not stripped under `python -O`; the exception type is kept for
	            # callers that already catch AssertionError.
	            raise AssertionError("File {} doesn't exist".format(value))
	        return value

	    @property
	    def anno_path(self):
	        """Path to annotation GTF file"""
	        return self._anno_path

	    @anno_path.setter
	    def anno_path(self, value):
	        self._anno_path = self._existing_file(value)

	    @property
	    def vcf_path(self):
	        """Path to annotation VCF file"""
	        return self._vcf_path

	    @vcf_path.setter
	    def vcf_path(self, value):
	        self._vcf_path = self._existing_file(value)

	    @property
	    def out_path(self):
	        """Path of the (uncompressed) output file"""
	        return self._out_path

	    @out_path.setter
	    def out_path(self, value):
	        # The "gz" extension is handled first, then the path is normalized;
	        # the file itself does not have to exist yet.
	        self._out_path = self._normalize_path(rm_ext(value, "gz"))

	    def get_transcript_snvs(self):
	        """Generate a VCF file with transcriptomic coordinates.

	        Runs a sequence of shell pipelines:

	        1. keep the ``transcript`` records of the GTF, sorted by chromosome
	           and start;
	        2. keep the header lines and ``TSA=SNV`` records of the VCF;
	        3. ``bedtools intersect`` the two, writing the GTF record and the
	           overlapping VCF record next to each other;
	        4. pull the transcript name out of the GTF attribute column;
	        5. write ``<transcript> <position> <VCF columns 3-8>`` to
	           ``out_path``;
	        6. compress with ``bgzip`` and index with ``tabix``.

	        Temporary files are removed even when one of the steps fails.

	        Raises:
	            subprocess.CalledProcessError: If a shell command exits with a
	                non-zero status.

	        References:
	            https://docs.python.org/3/library/subprocess.html
	            https://bedops.readthedocs.io/en/latest/content/reference/file-management/conversion/convert2bed.html#convert2bed
	            http://bedtools.readthedocs.io/en/latest/content/tools/intersect.html
	        """
	        # Local import: keeps this block self-contained without touching the
	        # module-level imports.
	        import shlex

	        # Every path is quoted before it is placed in a shell command, so
	        # paths with spaces or shell metacharacters are passed verbatim.
	        quote = shlex.quote
	        temp_paths = []

	        def new_temp_path():
	            # mkstemp() returns an open descriptor; close it right away, the
	            # shell commands only need the path.
	            handle, path = tempfile.mkstemp()
	            os.close(handle)
	            temp_paths.append(path)
	            return path

	        def run(command):
	            subprocess.run(command, check=True, shell=True)

	        try:
	            # Extract transcripts from GTF file, sort.
	            # The `|` must reach the shell unescaped so it acts as a pipe.
	            transcripts_gtf = new_temp_path()
	            command = 'grep -P "\ttranscript\t" {} | sort -k1,1 -k4,4n > {}'
	            run(command.format(quote(self.anno_path), quote(transcripts_gtf)))

	            # Get only SNVs from VCF, to save some memory/time
	            snv_vcf = new_temp_path()
	            command = 'grep -P "(^#|TSA=SNV)" {} > {}'
	            run(command.format(quote(self.vcf_path), quote(snv_vcf)))

	            # Get such transcripts that have an overlap with a known SNV.
	            # Exploits the fact that both coordinates are chromosome-based;
	            # writes information from both files next to each other.
	            transcripts_vcf = new_temp_path()
	            command = "bedtools intersect -sorted -F 1 -wo -a {} -b {} > {}"
	            run(command.format(
	                quote(transcripts_gtf), quote(snv_vcf), quote(transcripts_vcf)))

	            # Pull out name of transcripts as obtained in previous step
	            transcripts = new_temp_path()
	            if self.do_add_transcript_version:
	                command = (
	                    "cut -f9 {} | sed -E 's/.*\\stranscript_id \"([^;]+)\"; "
	                    "transcript_version \"([^;]+)\";.*/\\1.\\2/' > {}"
	                )
	            else:
	                # `.*` on both sides so the whole attribute column is
	                # replaced by the transcript id alone.
	                command = (
	                    "cut -f9 {} | sed -E "
	                    "'s/.*\\stranscript_id \"([^;]+)\".*/\\1/' > {}"
	                )
	            run(command.format(quote(transcripts_vcf), quote(transcripts)))

	            # Combine transcript name with the SNV information and convert
	            # the coordinates to transcriptomic ones.
	            #
	            # `$11-$4+1` computes the coordinate of the SNV along the
	            # transcript, where $4 is the start of the transcript and $11 the
	            # position of the SNV, both in chromosome-based coordinates.
	            # The transcripts file carries the transcript name; columns
	            # $12..$17 carry information on the SNV itself.
	            # NOTE(review): the offset is taken from the genomic transcript
	            # start; strand and introns are not considered here -- confirm
	            # that this is intended.
	            command = (
	                "awk -F\"\t\" -v OFS=\"\t\" "
	                "'{{print $11-$4+1,$12,$13,$14,$15,$16,$17}}' {} "
	                "| paste {} - > {}"
	            )
	            run(command.format(
	                quote(transcripts_vcf), quote(transcripts),
	                quote(self.out_path)))

	            # Zip and index
	            command = "bgzip --force {0} && tabix -p vcf {0}.gz"
	            run(command.format(quote(self.out_path)))
	        finally:
	            # Cleanup, also after a failed step
	            for path in temp_paths:
	                if os.path.exists(path):
	                    os.remove(path)