Last active
August 11, 2021 16:14
-
-
Save lennax/3172753 to your computer and use it in GitHub Desktop.
coordinate mapper by Reece Hart: http://lists.open-bio.org/pipermail/biopython/2010-June/006598.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CoordinateMapper -- map between genomic, cds, and protein coordinates | |
# AUTHOR: Reece Hart <reecehart@gmail.com> | |
# Modifications: Lenna X. Peterson <arklenna@gmail.com> | |
# LICENSE: BioPython | |
# Examples: | |
# AB026906.1:g.7872G>T | |
# AB026906.1:c.274G>T | |
# BA...:p.Asp92Tyr | |
# | |
# All refer to congruent variants. A coordinate mapper is needed to | |
# translate between at least these three coordinate frames. The mapper | |
# should deal with specialized syntax for splicing and UTR (e.g., 88+1, | |
# 89-2, -14, *46). In addition, care should be taken to ensure consistent 0- | |
# or 1-based numbering (0 internally, as with Python/BioPython and | |
# Perl/BioPerl). | |
# | |
# g -----------00000000000-----1111111----------22222222222*222----- | |
# s0 e0 s1 e1 s2 e2 | |
# \ \ | | / / | |
# +--+ +--+ | | +-------+ +-------+ | |
# \ \| |/ / | |
# c 00000000000111111122222222222*222 | |
# c0 c1 c2 | |
# aaabbbcccdddeeefffggghhhiiijj*kkk | |
# p A B C D E F G H I J K | |
# p0 p1 p2 ... pn | |
# | |
# | |
# TODO: | |
# * g2c returns index + extended syntax | |
# * c2g accepts index + extended syntax | |
from math import floor | |
import re | |
import warnings | |
# FIXME change to absolute import once CompoundLocation is merged to master | |
import SeqFeature | |
class CMConfig(object): | |
#index = 0 | |
index = 1 | |
pre_CDS_fmt = "%+d" | |
post_CDS_fmt = "%+d" | |
#post_CDS_fmt = "*%d" | |
class InvalidPositionError(ValueError): | |
"Exception for bad coordinates" | |
class GenomePositionError(InvalidPositionError): | |
"Exception for bad genome coordinates" | |
class CDSPositionError(InvalidPositionError): | |
"Exception for bad CDS coordinates" | |
class ProteinPositionError(InvalidPositionError): | |
"Exception for bad protein coordinates" | |
class NonExonPosition(object): | |
""" | |
Represent a non-exon position relative to CDS coords. | |
offset: distance to anchor | |
anchor: index of closest CDS base (default None, required for introns) | |
(Note: Python slices are right-open, | |
so [4:15] includes indices 4 through 14 but not 15) | |
out_idx: index for display (default: None is dynamic CMConfig.index) | |
""" | |
def __init__(self, offset, anchor=None, out_idx=None): | |
self.anchor = anchor | |
self.offset = offset | |
self.out_idx = out_idx | |
@property | |
def offset(self): | |
return self._offset | |
@offset.setter | |
def offset(self, val): | |
if val == 0: | |
raise CDSPositionError("Offset cannot be zero.") | |
self._offset = val | |
@property | |
def anchor(self): | |
return self._anchor | |
@anchor.setter | |
def anchor(self, val): | |
if val is not None and val < 0: | |
raise CDSPositionError("Anchor cannot be negative.") | |
self._anchor = val | |
@property | |
def out_idx(self): | |
"Default value of None allows dynamic use of CMConfig.index" | |
if self._out_idx is not None: | |
return self._out_idx | |
return CMConfig.index | |
@out_idx.setter | |
def out_idx(self, val): | |
self._out_idx = val | |
def __str__(self): | |
# intron | |
if self.anchor is not None: | |
return "%d%+d" % (self.anchor + self.out_idx, self.offset) | |
else: | |
# pre-CDS | |
if self.offset < 0: | |
return CMConfig.pre_CDS_fmt % self.offset | |
# post-CDS | |
if self.offset > 0: | |
return CMConfig.post_CDS_fmt % self.offset | |
assert False | |
def __repr__(self): | |
return self.__str__() | |
def __eq__(self, other): | |
"""Allow equality testing""" | |
return self.anchor == other.anchor and self.offset == other.offset | |
class CoordinateMapper(object): | |
def __init__(self, selist=None, seqrecord=None): | |
# Note: seqrecord parameter will take SeqRecord or SeqFeature | |
if seqrecord is not None: | |
self._exons = self._CDS_from_Seq(seqrecord) | |
else: | |
self._exons = self._exons_from_list(selist) | |
@property # read-only | |
def exons(self): | |
return self._exons | |
@property # read-only | |
def exon_list(self): | |
return list(self.exons) | |
@staticmethod | |
def _CDS_from_Seq(seq): | |
if hasattr(seq, 'features'): | |
# generator | |
cdsf = (f for f in seq.features if f.type == 'CDS').next() | |
return cdsf.location | |
if hasattr(seq, 'location'): | |
if seq.type != 'CDS': | |
# XXX should this be a fatal error? | |
warnings.warn("Provided SeqFeature should be CDS", | |
RuntimeWarning) | |
return seq.location | |
@staticmethod | |
def _exons_from_list(selist): | |
return sum([SeqFeature.FeatureLocation(*pair) for pair in selist]) | |
def g2c(self, gpos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert integer from genomic to CDS coordinates" | |
if gpos < in_idx: | |
raise GenomePositionError("Genome position cannot be negative.") | |
def _simple_g2c(g): | |
return self.exon_list.index(g - in_idx) + out_idx | |
# within exon | |
if gpos in self.exons: | |
return _simple_g2c(gpos) | |
# before CDS | |
if gpos < self.exons.start: | |
return NonExonPosition(offset=gpos - self.exons.start) | |
# after CDS | |
if gpos >= self.exons.end: | |
return NonExonPosition(offset=gpos - self.exons.end + 1) | |
# intron | |
# set start of first intron | |
prev_end = self.exons.parts[0].end | |
for part in self.exons.parts[1:]: | |
# not in this intron | |
if gpos > part.start: | |
prev_end = part.end | |
continue | |
len_intron = part.start - prev_end | |
# second half (exclusive) of intron | |
# offset > middle of intron | |
if gpos - prev_end > floor(len_intron / 2.0): | |
anchor = _simple_g2c(part.start) | |
offset = gpos - part.start | |
assert offset < 0 | |
# first half (inclusive) of intron | |
else: | |
anchor = _simple_g2c(prev_end - 1) | |
offset = gpos - prev_end + 1 | |
assert offset > 0 | |
return NonExonPosition(anchor=anchor, offset=offset, | |
out_idx=out_idx) | |
assert False # function should return for every integer | |
# TODO verify that values of offset are sane | |
def check_intron(self, anchor, offset): | |
"""Verify that CDS-relative intron position is valid with given exons.""" | |
for exon in self.exons.parts: | |
start = self.g2c(exon.start) | |
assert isinstance(start, int) | |
if anchor == start: | |
if offset > 0: | |
raise CDSPositionError("Invalid intron: offset from exon start must be negative.") | |
return True | |
end = self.g2c(exon.end - 1) | |
assert isinstance(end, int) | |
if anchor == end: | |
if offset < 0: | |
raise CDSPositionError("Invalid intron: offset from exon end must be positive.") | |
return True | |
raise CDSPositionError("Invalid intron: anchor must be start or end of an exon.") | |
def c2g(self, cpos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert from CDS to genomic coordinates" | |
def _simple_c2g(cpos): | |
return self.exon_list[cpos - in_idx] + out_idx | |
if isinstance(cpos, int): | |
# only allow non-negative integers as indices | |
if cpos >= in_idx: | |
#return self.exon_list[cpos - in_idx] + out_idx | |
return _simple_c2g(cpos) | |
if in_idx == 1 and cpos == 0: | |
raise CDSPositionError("base 0 does not exist in 1-index coordinates") | |
# cast negative integers to str (assuming pre-CDS) | |
cpos = str(cpos) | |
if isinstance(cpos, str): | |
# Note: () in re pattern cause separator to be captured | |
parsed = re.split("([*+-])", cpos, 1) | |
if len(parsed) != 3: | |
raise CDSPositionError("String '%s' not parseable for this position. Please provide int or NonExonPosition object." % cpos) | |
if parsed[0] == "": | |
anchor = None | |
else: | |
anchor = int(parsed[0]) | |
if CMConfig.post_CDS_fmt[0] == "*" == parsed[1]: | |
parsed[1] = parsed[1].replace("*", "+") | |
offset = int("".join(parsed[1:])) | |
# internal use only, out_idx is irrelevant | |
cpos = NonExonPosition(offset=offset, anchor=anchor) | |
if isinstance(cpos, NonExonPosition): | |
# intron | |
if cpos.anchor: | |
if self.check_intron(cpos.anchor, cpos.offset): | |
return _simple_c2g(cpos.anchor) + cpos.offset | |
# pre-CDS | |
if cpos.offset < 0: | |
return self.exons.start + cpos.offset | |
# post-CDS | |
if cpos.offset > 0: | |
# Note: end is right-open | |
return self.exons.end - 1 + cpos.offset | |
raise CDSPositionError("String '%s' not parsed" % cpos) | |
def c2p(self, cpos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert from CDS to protein coordinates" | |
if isinstance(cpos, int): | |
return (int(cpos) - in_idx) / 3 + out_idx | |
elif isinstance(cpos, NonExonPosition): | |
raise CDSPositionError("'%s' does not correspond to a protein" % cpos) | |
raise CDSPositionError("Arg must be int") | |
def g2p(self, gpos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert integer from genomic to protein coordinates" | |
# inner function takes in_idx from parent | |
# inner function passes 0 to outer function | |
# outer function takes out_idx from parent | |
return self.c2p(self.g2c(gpos, in_idx, 0), 0, out_idx) | |
def p2c(self, ppos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert integer from protein coordinate to CDS closed range" | |
if isinstance(ppos, int): | |
if ppos < in_idx: | |
raise ProteinPositionError("'%s' should not be negative") | |
# FIXME is CDS guaranteed to have len % 3 == 0? | |
first_base = (ppos - in_idx) * 3 + out_idx | |
last_base = first_base + 2 | |
if last_base > len(self.exons): | |
raise ProteinPositionError("'%s' is too large") | |
return (first_base, last_base) | |
def p2g(self, ppos, in_idx=CMConfig.index, out_idx=CMConfig.index): | |
"Convert integer from protein to genomic coordinates" | |
# p2c takes in_idx from parent | |
# p2c passes 0 to c2g | |
# c2g takes out_idx from parent | |
first, last = self.p2c(ppos, in_idx, 0) | |
c2g_args = (0, out_idx) | |
return (self.c2g(first, *c2g_args), self.c2g(last, *c2g_args)) | |
if __name__ == '__main__': | |
# The following exons are from AB026906.1. | |
# test case: g.7872 -> c.274 -> p.92 | |
# N.B. These are python counting coordinates (0-based) | |
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)] | |
def test_list(): | |
cm = CoordinateMapper(exons) | |
for g1 in (7870, 7871, 7872, 7873, 7874): | |
print g1, | |
c1 = cm.g2c(g1) | |
print c1, | |
p1 = cm.c2p(c1) | |
print p1, | |
c2 = cm.p2c(p1)[0] | |
print ' | ', c2, | |
g2 = cm.c2g(c2) | |
print g2 | |
#print g1, c1, p1, ' | ', c2, g2 | |
print cm.g2p(7872) | |
print cm.p2g(92) | |
def test_sf(): | |
sf_exons = [SeqFeature.FeatureLocation(*pr) for pr in exons] | |
fl = SeqFeature.SeqFeature(sum(sf_exons), type="CDS") | |
cm = CoordinateMapper(seqrecord=fl) | |
for g1 in (5807, 5808, 5809, | |
6871, 6872, 6873, 6874, 6875, | |
7766, 7767, 7768, 7769, | |
13784, 13785, 13786): | |
print g1, | |
c1 = cm.g2c(g1) | |
print c1, | |
p1 = cm.c2p(c1) | |
print p1, | |
if p1: | |
c2 = cm.p2c(p1)[0] | |
else: | |
c2 = c1 | |
print ' | ', c2, | |
g2 = cm.c2g(c2) | |
print g2, | |
#print g1, c1, p1, ' | ', c2, g2 | |
print cm.g2p(7872) | |
print cm.p2g(92) | |
def test_simple(): | |
simple_exons = SeqFeature.SeqFeature(sum([SeqFeature.FeatureLocation(2, 4), SeqFeature.FeatureLocation(8, 11), SeqFeature.FeatureLocation(16, 18)]), type="CDS") | |
#print simple_exons | |
cm = CoordinateMapper(seqrecord=simple_exons) | |
print cm.exons | |
print list(cm.exons) | |
print range(len(cm.exons)) | |
for i in range(len(cm.exons)): | |
print "%3s" % cm.c2g(i), | |
for i in xrange(20): | |
print "%3d" % i, | |
for i in xrange(20): | |
print "%3s" % cm.g2c(i), | |
test_list() | |
#test_sf() | |
test_simple() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import unittest | |
from CoordinateMapper import * | |
from SeqFeature import FeatureLocation, SeqFeature | |
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)] | |
cmap = CoordinateMapper(exons) | |
g2c_exons = {7870: 272, 7871: 273, 7872: 274, 7873: 275, 7874: 276} | |
c2p_exons = {270: 90, 271: 90, 272: 90, | |
273: 91, 274: 91, 275: 91, | |
276: 92, 277: 92, 278: 92} | |
p2c_exons = {90: (270, 272), 91: (273, 275), 92: (276, 278)} | |
# check that c2p_exons and p2c_exons have correct correspondence | |
for p, c_tup in p2c_exons.iteritems(): | |
for c in xrange(c_tup[0], c_tup[1] + 1): | |
assert c2p_exons[c] == p | |
g2c_introns = {5860: '51+1', 5861: '51+2', 6308: '51+449', | |
6309: '52-448', 6755: '52-2', 6756: '52-1'} | |
g2c_intron_tups = {5860: (1, 51), 5861: (2, 51), 6308: (449, 51), | |
6309: (-448, 52), 6755: (-2, 52), 6756: (-1, 52)} | |
g2c_outside = {0: '-5808', 13785: '+1', 14000: '+216'} | |
g2c_outside_tups = {0: (-5808, None), 13785: (1, None), 14000: (216, None)} | |
class TestNonExonPosition(unittest.TestCase): | |
"""Test that NonExonPosition works properly""" | |
def setUp(self): | |
pass | |
def testGoodIntron(self): | |
"""NonExonPosition should match good intron values""" | |
self.assertEqual(CMConfig.index, 0) | |
for k, args in g2c_intron_tups.iteritems(): | |
self.assertEqual(str(NonExonPosition(*args)), g2c_introns[k]) | |
def testGoodOutside(self): | |
"""NonExonPosition should match good outside-CDS values""" | |
for k, args in g2c_outside_tups.iteritems(): | |
self.assertEqual(str(NonExonPosition(*args)), g2c_outside[k]) | |
def testEqual(self): | |
"""NonExonPosition should test equal with same args""" | |
for args in g2c_intron_tups.itervalues(): | |
NEPos = NonExonPosition | |
self.assertEqual(NEPos(*args), NEPos(*args)) | |
self.assertEqual(str(NEPos(*args)), str(NEPos(*args))) | |
def testBadAnchor(self): | |
"""NonExonPosition should fail with negative CDS anchor""" | |
self.assertRaises(CDSPositionError, NonExonPosition, 1, -1) | |
def testBadOffset(self): | |
"""NonExonPosition should fail with zero offset""" | |
self.assertRaises(CDSPositionError, NonExonPosition, 0) | |
class TestCoordinateMapper(unittest.TestCase): | |
"""Test that CoordinateMapper works properly""" | |
def testListInit(self): | |
"""CoordinateMapper should take list of exon pairs""" | |
CoordinateMapper(exons) | |
# TODO | |
def testSeqRecordInit(self): | |
pass | |
def testSeqFeatureInit(self): | |
"""CoordinateMapper should take a CDS SeqFeature""" | |
sf = SeqFeature(sum((FeatureLocation(*pr) for pr in exons)), | |
type="CDS") | |
CoordinateMapper(seqrecord=sf) | |
# FIXME Maybe RuntimeWarning printed to stderr isn't captured? | |
#def testNotCDS(self): | |
#sf = SeqFeature(sum((FeatureLocation(*pr) for pr in exons))) | |
#self.assertRaises(RuntimeWarning, CoordinateMapper._CDS_from_Seq, sf) | |
def testEmptyInit(self): | |
"""CoordinateMapper should fail with no arguments""" | |
self.assertRaises(Exception, CoordinateMapper) | |
# TODO HGVS and GenBank coords | |
# TODO 0 and 1 index should both work | |
# \*\d+ # XXX only allowed if * is in Config.post_CDS_fmt | |
# TODO in HGVS, base 0 doesn't exist | |
class Testg2c(unittest.TestCase): | |
"""Test success of good input and failure of bad input to g2c""" | |
# what it should do | |
# any integer should return | |
def testGoodExons(self): | |
"""g2c should work for exon positions""" | |
for arg, expected in g2c_exons.iteritems(): | |
self.assertEqual(cmap.g2c(arg), expected) | |
def testGoodIntronsStr(self): | |
"""g2c should work for intron positions (string)""" | |
for arg, expected in g2c_introns.iteritems(): | |
self.assertEqual(str(cmap.g2c(arg)), expected) | |
def testGoodIntrons(self): | |
"""g2c should work for intron positions (NonExonPosition)""" | |
for arg, tup in g2c_intron_tups.iteritems(): | |
actual = cmap.g2c(arg) | |
expect = NonExonPosition(*tup) | |
self.assertEqual(actual, expect) | |
self.assertEqual(str(actual), str(expect)) | |
def testGoodOutsideStr(self): | |
"""g2c should work for outside positions (string)""" | |
for arg, expected in g2c_outside.iteritems(): | |
self.assertEqual(str(cmap.g2c(arg)), expected) | |
def testGoodOutside(self): | |
"""g2c should work for outside positions (NonExonPosition)""" | |
for arg, tup in g2c_outside_tups.iteritems(): | |
actual = cmap.g2c(arg) | |
expect = NonExonPosition(*tup) | |
self.assertEqual(actual, expect) | |
self.assertEqual(str(actual), str(expect)) | |
# what it shouldn't do | |
# should it handle non-exact positions? | |
def testBadArg(self): | |
"""g2c should fail on string, float, None, or negative""" | |
bad = ("string", None, 3.14, ) | |
for arg in bad: | |
self.assertRaises(Exception, cmap.g2c, arg) | |
self.assertRaises(GenomePositionError, cmap.g2c, -5) | |
class Testc2p(unittest.TestCase): | |
"""Test success of good input and failure of bad input to c2p""" | |
# what it should do | |
# integer within length of exon should return correct protein | |
def testGoodExons(self): | |
"""c2p should work for exon positions""" | |
for arg, expect in c2p_exons.iteritems(): | |
self.assertEqual(cmap.c2p(arg), expect) | |
# FIXME should NonExonPosition return None or raise error? | |
def testBadNotProtein(self): | |
"""c2p should fail for NonExonPosition or str""" | |
bad = ("string", NonExonPosition(-5, 7)) | |
for arg in bad: | |
self.assertRaises(CDSPositionError, cmap.c2p, arg) | |
class Testp2c(unittest.TestCase): | |
"""Test success of good input and failure of bad input to p2c""" | |
# what it should do | |
# integer within length of protein should return correct range | |
def testGoodExons(self): | |
"""p2c should work for exon positions""" | |
for arg, expect in p2c_exons.iteritems(): | |
self.assertEqual(cmap.p2c(arg), expect) | |
def testBadTooLarge(self): | |
"""p2c should fail for positions longer than the max length""" | |
self.assertRaises(ProteinPositionError, cmap.p2c, 500) | |
def testBadNegative(self): | |
"""p2c should fail for negative protein coordinate""" | |
self.assertRaises(ProteinPositionError, cmap.p2c, -50) | |
class Testc2g(unittest.TestCase): | |
"""Test success of good input and failure of bad input to c2g""" | |
# what it should do | |
# integer within length of exon should return correct value | |
def testGoodExons(self): | |
"""c2g should work for exon positions""" | |
for expected, arg in g2c_exons.iteritems(): | |
self.assertEqual(cmap.c2g(arg), expected) | |
# NonExonPosition object should return correct value | |
def testGoodIntrons(self): | |
"""c2g should work for introns (NonExonPosition)""" | |
# dict -- genomic integer: arg tuple for NEP | |
for expect, args in g2c_intron_tups.iteritems(): | |
self.assertEqual(cmap.c2g(NonExonPosition(*args)), expect) | |
def testGoodOutside(self): | |
"""c2g should work for outside (NonExonPosition)""" | |
for expect, args in g2c_outside_tups.iteritems(): | |
self.assertEqual(cmap.c2g(NonExonPosition(*args)), expect) | |
# simple string should return correct value | |
# \d*[+-]\d+ | |
def testGoodIntronsStr(self): | |
"""c2g should work for intron positions (string)""" | |
for expected, arg in g2c_introns.iteritems(): | |
self.assertEqual(cmap.c2g(arg), expected) | |
def testGoodOutsideStr(self): | |
"""c2g should work for outside positions (string)""" | |
for expected, arg in g2c_outside.iteritems(): | |
self.assertEqual(cmap.c2g(arg), expected) | |
def testNegativeInt(self): | |
"""c2g should treat negative integer as pre-CDS""" | |
# XXX negative integers are treated as pre-CDS | |
self.assertEqual(cmap.c2g(-2), 5806) | |
# what it shouldn't do | |
# bad positions in form 123+4 where 123 is not end of an exon | |
# or where sign is wrong | |
def testBadNotEnd(self): | |
"""c2g should fail on invalid introns""" | |
bad_introns = ("160+5", # 160 is exonic | |
"51-6", # 51 is end | |
"52+10", # 52 is start | |
) | |
for arg in bad_introns: | |
self.assertRaises(CDSPositionError, cmap.c2g, arg) | |
def testBadString(self): | |
"""c2g should fail on arbitrary string""" | |
# for non-HGVS, treat * as meaningless | |
assert CMConfig.post_CDS_fmt[0] != "*" | |
bad_vals = ["xxx", "4-5'", "*10"] | |
for arg in bad_vals: | |
self.assertRaises(ValueError, cmap.c2g, arg) | |
# integers outside length of exon should raise IndexError | |
def testBadTooLarge(self): | |
"""c2g should fail on values beyond the exon""" | |
assert 400 > len(cmap.exons) | |
self.assertRaises(IndexError, cmap.c2g, 400) | |
self.assertRaises(Exception, cmap.c2g, "400-12") | |
if __name__ == "__main__": | |
#unittest.main() | |
suite0 = unittest.TestLoader().loadTestsFromTestCase(TestNonExonPosition) | |
suite1 = unittest.TestLoader().loadTestsFromTestCase(TestCoordinateMapper) | |
suite2 = unittest.TestLoader().loadTestsFromTestCase(Testc2g) | |
suite3 = unittest.TestLoader().loadTestsFromTestCase(Testg2c) | |
suite4 = unittest.TestLoader().loadTestsFromTestCase(Testc2p) | |
suite5 = unittest.TestLoader().loadTestsFromTestCase(Testp2c) | |
all_suites = [suite0, suite1, suite2, suite3, suite4, suite5] | |
all = unittest.TestSuite(all_suites) | |
unittest.TextTestRunner(verbosity=3).run(all) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment