Skip to content

Instantly share code, notes, and snippets.

@lennax
Last active August 11, 2021 16:14
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lennax/3172753 to your computer and use it in GitHub Desktop.
Save lennax/3172753 to your computer and use it in GitHub Desktop.
# CoordinateMapper -- map between genomic, cds, and protein coordinates
# AUTHOR: Reece Hart <reecehart@gmail.com>
# Modifications: Lenna X. Peterson <arklenna@gmail.com>
# LICENSE: BioPython
# Examples:
# AB026906.1:g.7872G>T
# AB026906.1:c.274G>T
# BA...:p.Asp92Tyr
#
# All refer to congruent variants. A coordinate mapper is needed to
# translate between at least these three coordinate frames. The mapper
# should deal with specialized syntax for splicing and UTR (e.g., 88+1,
# 89-2, -14, *46). In addition, care should be taken to ensure consistent 0-
# or 1-based numbering (0 internally, as with Python/BioPython and
# Perl/BioPerl).
#
# g -----------00000000000-----1111111----------22222222222*222-----
# s0 e0 s1 e1 s2 e2
# \ \ | | / /
# +--+ +--+ | | +-------+ +-------+
# \ \| |/ /
# c 00000000000111111122222222222*222
# c0 c1 c2
# aaabbbcccdddeeefffggghhhiiijj*kkk
# p A B C D E F G H I J K
# p0 p1 p2 ... pn
#
#
# TODO:
# * g2c returns index + extended syntax
# * c2g accepts index + extended syntax
from math import floor
import re
import warnings
# FIXME change to absolute import once CompoundLocation is merged to master
import SeqFeature
class CMConfig(object):
#index = 0
index = 1
pre_CDS_fmt = "%+d"
post_CDS_fmt = "%+d"
#post_CDS_fmt = "*%d"
class InvalidPositionError(ValueError):
"Exception for bad coordinates"
class GenomePositionError(InvalidPositionError):
"Exception for bad genome coordinates"
class CDSPositionError(InvalidPositionError):
"Exception for bad CDS coordinates"
class ProteinPositionError(InvalidPositionError):
"Exception for bad protein coordinates"
class NonExonPosition(object):
"""
Represent a non-exon position relative to CDS coords.
offset: distance to anchor
anchor: index of closest CDS base (default None, required for introns)
(Note: Python slices are right-open,
so [4:15] includes indices 4 through 14 but not 15)
out_idx: index for display (default: None is dynamic CMConfig.index)
"""
def __init__(self, offset, anchor=None, out_idx=None):
self.anchor = anchor
self.offset = offset
self.out_idx = out_idx
@property
def offset(self):
return self._offset
@offset.setter
def offset(self, val):
if val == 0:
raise CDSPositionError("Offset cannot be zero.")
self._offset = val
@property
def anchor(self):
return self._anchor
@anchor.setter
def anchor(self, val):
if val is not None and val < 0:
raise CDSPositionError("Anchor cannot be negative.")
self._anchor = val
@property
def out_idx(self):
"Default value of None allows dynamic use of CMConfig.index"
if self._out_idx is not None:
return self._out_idx
return CMConfig.index
@out_idx.setter
def out_idx(self, val):
self._out_idx = val
def __str__(self):
# intron
if self.anchor is not None:
return "%d%+d" % (self.anchor + self.out_idx, self.offset)
else:
# pre-CDS
if self.offset < 0:
return CMConfig.pre_CDS_fmt % self.offset
# post-CDS
if self.offset > 0:
return CMConfig.post_CDS_fmt % self.offset
assert False
def __repr__(self):
return self.__str__()
def __eq__(self, other):
"""Allow equality testing"""
return self.anchor == other.anchor and self.offset == other.offset
class CoordinateMapper(object):
def __init__(self, selist=None, seqrecord=None):
# Note: seqrecord parameter will take SeqRecord or SeqFeature
if seqrecord is not None:
self._exons = self._CDS_from_Seq(seqrecord)
else:
self._exons = self._exons_from_list(selist)
@property # read-only
def exons(self):
return self._exons
@property # read-only
def exon_list(self):
return list(self.exons)
@staticmethod
def _CDS_from_Seq(seq):
if hasattr(seq, 'features'):
# generator
cdsf = (f for f in seq.features if f.type == 'CDS').next()
return cdsf.location
if hasattr(seq, 'location'):
if seq.type != 'CDS':
# XXX should this be a fatal error?
warnings.warn("Provided SeqFeature should be CDS",
RuntimeWarning)
return seq.location
@staticmethod
def _exons_from_list(selist):
return sum([SeqFeature.FeatureLocation(*pair) for pair in selist])
def g2c(self, gpos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert integer from genomic to CDS coordinates"
if gpos < in_idx:
raise GenomePositionError("Genome position cannot be negative.")
def _simple_g2c(g):
return self.exon_list.index(g - in_idx) + out_idx
# within exon
if gpos in self.exons:
return _simple_g2c(gpos)
# before CDS
if gpos < self.exons.start:
return NonExonPosition(offset=gpos - self.exons.start)
# after CDS
if gpos >= self.exons.end:
return NonExonPosition(offset=gpos - self.exons.end + 1)
# intron
# set start of first intron
prev_end = self.exons.parts[0].end
for part in self.exons.parts[1:]:
# not in this intron
if gpos > part.start:
prev_end = part.end
continue
len_intron = part.start - prev_end
# second half (exclusive) of intron
# offset > middle of intron
if gpos - prev_end > floor(len_intron / 2.0):
anchor = _simple_g2c(part.start)
offset = gpos - part.start
assert offset < 0
# first half (inclusive) of intron
else:
anchor = _simple_g2c(prev_end - 1)
offset = gpos - prev_end + 1
assert offset > 0
return NonExonPosition(anchor=anchor, offset=offset,
out_idx=out_idx)
assert False # function should return for every integer
# TODO verify that values of offset are sane
def check_intron(self, anchor, offset):
"""Verify that CDS-relative intron position is valid with given exons."""
for exon in self.exons.parts:
start = self.g2c(exon.start)
assert isinstance(start, int)
if anchor == start:
if offset > 0:
raise CDSPositionError("Invalid intron: offset from exon start must be negative.")
return True
end = self.g2c(exon.end - 1)
assert isinstance(end, int)
if anchor == end:
if offset < 0:
raise CDSPositionError("Invalid intron: offset from exon end must be positive.")
return True
raise CDSPositionError("Invalid intron: anchor must be start or end of an exon.")
def c2g(self, cpos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert from CDS to genomic coordinates"
def _simple_c2g(cpos):
return self.exon_list[cpos - in_idx] + out_idx
if isinstance(cpos, int):
# only allow non-negative integers as indices
if cpos >= in_idx:
#return self.exon_list[cpos - in_idx] + out_idx
return _simple_c2g(cpos)
if in_idx == 1 and cpos == 0:
raise CDSPositionError("base 0 does not exist in 1-index coordinates")
# cast negative integers to str (assuming pre-CDS)
cpos = str(cpos)
if isinstance(cpos, str):
# Note: () in re pattern cause separator to be captured
parsed = re.split("([*+-])", cpos, 1)
if len(parsed) != 3:
raise CDSPositionError("String '%s' not parseable for this position. Please provide int or NonExonPosition object." % cpos)
if parsed[0] == "":
anchor = None
else:
anchor = int(parsed[0])
if CMConfig.post_CDS_fmt[0] == "*" == parsed[1]:
parsed[1] = parsed[1].replace("*", "+")
offset = int("".join(parsed[1:]))
# internal use only, out_idx is irrelevant
cpos = NonExonPosition(offset=offset, anchor=anchor)
if isinstance(cpos, NonExonPosition):
# intron
if cpos.anchor:
if self.check_intron(cpos.anchor, cpos.offset):
return _simple_c2g(cpos.anchor) + cpos.offset
# pre-CDS
if cpos.offset < 0:
return self.exons.start + cpos.offset
# post-CDS
if cpos.offset > 0:
# Note: end is right-open
return self.exons.end - 1 + cpos.offset
raise CDSPositionError("String '%s' not parsed" % cpos)
def c2p(self, cpos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert from CDS to protein coordinates"
if isinstance(cpos, int):
return (int(cpos) - in_idx) / 3 + out_idx
elif isinstance(cpos, NonExonPosition):
raise CDSPositionError("'%s' does not correspond to a protein" % cpos)
raise CDSPositionError("Arg must be int")
def g2p(self, gpos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert integer from genomic to protein coordinates"
# inner function takes in_idx from parent
# inner function passes 0 to outer function
# outer function takes out_idx from parent
return self.c2p(self.g2c(gpos, in_idx, 0), 0, out_idx)
def p2c(self, ppos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert integer from protein coordinate to CDS closed range"
if isinstance(ppos, int):
if ppos < in_idx:
raise ProteinPositionError("'%s' should not be negative")
# FIXME is CDS guaranteed to have len % 3 == 0?
first_base = (ppos - in_idx) * 3 + out_idx
last_base = first_base + 2
if last_base > len(self.exons):
raise ProteinPositionError("'%s' is too large")
return (first_base, last_base)
def p2g(self, ppos, in_idx=CMConfig.index, out_idx=CMConfig.index):
"Convert integer from protein to genomic coordinates"
# p2c takes in_idx from parent
# p2c passes 0 to c2g
# c2g takes out_idx from parent
first, last = self.p2c(ppos, in_idx, 0)
c2g_args = (0, out_idx)
return (self.c2g(first, *c2g_args), self.c2g(last, *c2g_args))
if __name__ == '__main__':
# The following exons are from AB026906.1.
# test case: g.7872 -> c.274 -> p.92
# N.B. These are python counting coordinates (0-based)
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)]
def test_list():
cm = CoordinateMapper(exons)
for g1 in (7870, 7871, 7872, 7873, 7874):
print g1,
c1 = cm.g2c(g1)
print c1,
p1 = cm.c2p(c1)
print p1,
c2 = cm.p2c(p1)[0]
print ' | ', c2,
g2 = cm.c2g(c2)
print g2
#print g1, c1, p1, ' | ', c2, g2
print cm.g2p(7872)
print cm.p2g(92)
def test_sf():
sf_exons = [SeqFeature.FeatureLocation(*pr) for pr in exons]
fl = SeqFeature.SeqFeature(sum(sf_exons), type="CDS")
cm = CoordinateMapper(seqrecord=fl)
for g1 in (5807, 5808, 5809,
6871, 6872, 6873, 6874, 6875,
7766, 7767, 7768, 7769,
13784, 13785, 13786):
print g1,
c1 = cm.g2c(g1)
print c1,
p1 = cm.c2p(c1)
print p1,
if p1:
c2 = cm.p2c(p1)[0]
else:
c2 = c1
print ' | ', c2,
g2 = cm.c2g(c2)
print g2,
print
#print g1, c1, p1, ' | ', c2, g2
print cm.g2p(7872)
print cm.p2g(92)
def test_simple():
simple_exons = SeqFeature.SeqFeature(sum([SeqFeature.FeatureLocation(2, 4), SeqFeature.FeatureLocation(8, 11), SeqFeature.FeatureLocation(16, 18)]), type="CDS")
#print simple_exons
cm = CoordinateMapper(seqrecord=simple_exons)
print cm.exons
print list(cm.exons)
print range(len(cm.exons))
for i in range(len(cm.exons)):
print "%3s" % cm.c2g(i),
print
for i in xrange(20):
print "%3d" % i,
print
for i in xrange(20):
print "%3s" % cm.g2c(i),
print
test_list()
#test_sf()
test_simple()
#!/usr/bin/python
import unittest
from CoordinateMapper import *
from SeqFeature import FeatureLocation, SeqFeature
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)]
cmap = CoordinateMapper(exons)
g2c_exons = {7870: 272, 7871: 273, 7872: 274, 7873: 275, 7874: 276}
c2p_exons = {270: 90, 271: 90, 272: 90,
273: 91, 274: 91, 275: 91,
276: 92, 277: 92, 278: 92}
p2c_exons = {90: (270, 272), 91: (273, 275), 92: (276, 278)}
# check that c2p_exons and p2c_exons have correct correspondence
for p, c_tup in p2c_exons.iteritems():
for c in xrange(c_tup[0], c_tup[1] + 1):
assert c2p_exons[c] == p
g2c_introns = {5860: '51+1', 5861: '51+2', 6308: '51+449',
6309: '52-448', 6755: '52-2', 6756: '52-1'}
g2c_intron_tups = {5860: (1, 51), 5861: (2, 51), 6308: (449, 51),
6309: (-448, 52), 6755: (-2, 52), 6756: (-1, 52)}
g2c_outside = {0: '-5808', 13785: '+1', 14000: '+216'}
g2c_outside_tups = {0: (-5808, None), 13785: (1, None), 14000: (216, None)}
class TestNonExonPosition(unittest.TestCase):
"""Test that NonExonPosition works properly"""
def setUp(self):
pass
def testGoodIntron(self):
"""NonExonPosition should match good intron values"""
self.assertEqual(CMConfig.index, 0)
for k, args in g2c_intron_tups.iteritems():
self.assertEqual(str(NonExonPosition(*args)), g2c_introns[k])
def testGoodOutside(self):
"""NonExonPosition should match good outside-CDS values"""
for k, args in g2c_outside_tups.iteritems():
self.assertEqual(str(NonExonPosition(*args)), g2c_outside[k])
def testEqual(self):
"""NonExonPosition should test equal with same args"""
for args in g2c_intron_tups.itervalues():
NEPos = NonExonPosition
self.assertEqual(NEPos(*args), NEPos(*args))
self.assertEqual(str(NEPos(*args)), str(NEPos(*args)))
def testBadAnchor(self):
"""NonExonPosition should fail with negative CDS anchor"""
self.assertRaises(CDSPositionError, NonExonPosition, 1, -1)
def testBadOffset(self):
"""NonExonPosition should fail with zero offset"""
self.assertRaises(CDSPositionError, NonExonPosition, 0)
class TestCoordinateMapper(unittest.TestCase):
"""Test that CoordinateMapper works properly"""
def testListInit(self):
"""CoordinateMapper should take list of exon pairs"""
CoordinateMapper(exons)
# TODO
def testSeqRecordInit(self):
pass
def testSeqFeatureInit(self):
"""CoordinateMapper should take a CDS SeqFeature"""
sf = SeqFeature(sum((FeatureLocation(*pr) for pr in exons)),
type="CDS")
CoordinateMapper(seqrecord=sf)
# FIXME Maybe RuntimeWarning printed to stderr isn't captured?
#def testNotCDS(self):
#sf = SeqFeature(sum((FeatureLocation(*pr) for pr in exons)))
#self.assertRaises(RuntimeWarning, CoordinateMapper._CDS_from_Seq, sf)
def testEmptyInit(self):
"""CoordinateMapper should fail with no arguments"""
self.assertRaises(Exception, CoordinateMapper)
# TODO HGVS and GenBank coords
# TODO 0 and 1 index should both work
# \*\d+ # XXX only allowed if * is in Config.post_CDS_fmt
# TODO in HGVS, base 0 doesn't exist
class Testg2c(unittest.TestCase):
"""Test success of good input and failure of bad input to g2c"""
# what it should do
# any integer should return
def testGoodExons(self):
"""g2c should work for exon positions"""
for arg, expected in g2c_exons.iteritems():
self.assertEqual(cmap.g2c(arg), expected)
def testGoodIntronsStr(self):
"""g2c should work for intron positions (string)"""
for arg, expected in g2c_introns.iteritems():
self.assertEqual(str(cmap.g2c(arg)), expected)
def testGoodIntrons(self):
"""g2c should work for intron positions (NonExonPosition)"""
for arg, tup in g2c_intron_tups.iteritems():
actual = cmap.g2c(arg)
expect = NonExonPosition(*tup)
self.assertEqual(actual, expect)
self.assertEqual(str(actual), str(expect))
def testGoodOutsideStr(self):
"""g2c should work for outside positions (string)"""
for arg, expected in g2c_outside.iteritems():
self.assertEqual(str(cmap.g2c(arg)), expected)
def testGoodOutside(self):
"""g2c should work for outside positions (NonExonPosition)"""
for arg, tup in g2c_outside_tups.iteritems():
actual = cmap.g2c(arg)
expect = NonExonPosition(*tup)
self.assertEqual(actual, expect)
self.assertEqual(str(actual), str(expect))
# what it shouldn't do
# should it handle non-exact positions?
def testBadArg(self):
"""g2c should fail on string, float, None, or negative"""
bad = ("string", None, 3.14, )
for arg in bad:
self.assertRaises(Exception, cmap.g2c, arg)
self.assertRaises(GenomePositionError, cmap.g2c, -5)
class Testc2p(unittest.TestCase):
"""Test success of good input and failure of bad input to c2p"""
# what it should do
# integer within length of exon should return correct protein
def testGoodExons(self):
"""c2p should work for exon positions"""
for arg, expect in c2p_exons.iteritems():
self.assertEqual(cmap.c2p(arg), expect)
# FIXME should NonExonPosition return None or raise error?
def testBadNotProtein(self):
"""c2p should fail for NonExonPosition or str"""
bad = ("string", NonExonPosition(-5, 7))
for arg in bad:
self.assertRaises(CDSPositionError, cmap.c2p, arg)
class Testp2c(unittest.TestCase):
"""Test success of good input and failure of bad input to p2c"""
# what it should do
# integer within length of protein should return correct range
def testGoodExons(self):
"""p2c should work for exon positions"""
for arg, expect in p2c_exons.iteritems():
self.assertEqual(cmap.p2c(arg), expect)
def testBadTooLarge(self):
"""p2c should fail for positions longer than the max length"""
self.assertRaises(ProteinPositionError, cmap.p2c, 500)
def testBadNegative(self):
"""p2c should fail for negative protein coordinate"""
self.assertRaises(ProteinPositionError, cmap.p2c, -50)
class Testc2g(unittest.TestCase):
"""Test success of good input and failure of bad input to c2g"""
# what it should do
# integer within length of exon should return correct value
def testGoodExons(self):
"""c2g should work for exon positions"""
for expected, arg in g2c_exons.iteritems():
self.assertEqual(cmap.c2g(arg), expected)
# NonExonPosition object should return correct value
def testGoodIntrons(self):
"""c2g should work for introns (NonExonPosition)"""
# dict -- genomic integer: arg tuple for NEP
for expect, args in g2c_intron_tups.iteritems():
self.assertEqual(cmap.c2g(NonExonPosition(*args)), expect)
def testGoodOutside(self):
"""c2g should work for outside (NonExonPosition)"""
for expect, args in g2c_outside_tups.iteritems():
self.assertEqual(cmap.c2g(NonExonPosition(*args)), expect)
# simple string should return correct value
# \d*[+-]\d+
def testGoodIntronsStr(self):
"""c2g should work for intron positions (string)"""
for expected, arg in g2c_introns.iteritems():
self.assertEqual(cmap.c2g(arg), expected)
def testGoodOutsideStr(self):
"""c2g should work for outside positions (string)"""
for expected, arg in g2c_outside.iteritems():
self.assertEqual(cmap.c2g(arg), expected)
def testNegativeInt(self):
"""c2g should treat negative integer as pre-CDS"""
# XXX negative integers are treated as pre-CDS
self.assertEqual(cmap.c2g(-2), 5806)
# what it shouldn't do
# bad positions in form 123+4 where 123 is not end of an exon
# or where sign is wrong
def testBadNotEnd(self):
"""c2g should fail on invalid introns"""
bad_introns = ("160+5", # 160 is exonic
"51-6", # 51 is end
"52+10", # 52 is start
)
for arg in bad_introns:
self.assertRaises(CDSPositionError, cmap.c2g, arg)
def testBadString(self):
"""c2g should fail on arbitrary string"""
# for non-HGVS, treat * as meaningless
assert CMConfig.post_CDS_fmt[0] != "*"
bad_vals = ["xxx", "4-5'", "*10"]
for arg in bad_vals:
self.assertRaises(ValueError, cmap.c2g, arg)
# integers outside length of exon should raise IndexError
def testBadTooLarge(self):
"""c2g should fail on values beyond the exon"""
assert 400 > len(cmap.exons)
self.assertRaises(IndexError, cmap.c2g, 400)
self.assertRaises(Exception, cmap.c2g, "400-12")
if __name__ == "__main__":
#unittest.main()
suite0 = unittest.TestLoader().loadTestsFromTestCase(TestNonExonPosition)
suite1 = unittest.TestLoader().loadTestsFromTestCase(TestCoordinateMapper)
suite2 = unittest.TestLoader().loadTestsFromTestCase(Testc2g)
suite3 = unittest.TestLoader().loadTestsFromTestCase(Testg2c)
suite4 = unittest.TestLoader().loadTestsFromTestCase(Testc2p)
suite5 = unittest.TestLoader().loadTestsFromTestCase(Testp2c)
all_suites = [suite0, suite1, suite2, suite3, suite4, suite5]
all = unittest.TestSuite(all_suites)
unittest.TextTestRunner(verbosity=3).run(all)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment