Skip to content

Instantly share code, notes, and snippets.

@dansmith01
Last active October 28, 2016 00:42
Show Gist options
  • Save dansmith01/666144ea34af3dafaaca79a7b950e16b to your computer and use it in GitHub Desktop.
Save dansmith01/666144ea34af3dafaaca79a7b950e16b to your computer and use it in GitHub Desktop.
Generate an amino acid alignment with phylogenetic tree and detected peptides overlay.
#!/usr/bin/perl
#--------------------------------------------------------------#
# Author: Daniel Patrick Smith #
# Version: 1.0 / April 21st, 2010 #
# Affiliation: Giovannoni Laboratory, Oregon State University #
# Copyright: GNU General Public License v3 #
# Please Cite: Sowell et al, ISME (2009) 3, 93-105 #
#--------------------------------------------------------------#
use strict;
use warnings;
use PostScript::Simple;
# Change the following input data as needed:
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
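# The amino acid sequence alignment, given as whitespace-separated "name sequence" pairs.
# Names must not contain '(', ')', ':', ' ', or ','. Names must contain at least one non-numerical character.
# All sequences must have the same length; use '-' for gaps and the 20 standard one-letter amino acid codes otherwise.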
our %proteins = qw(
M_extorquens_moxF ----MSRFVTSVSALAMLALAPAAL---SSGAYANDKLVELSKSDDNWVMPGKNYDSNNFSDLKQINKGNVKQLRPAWTFSTGLLNGHEGAPLVVDGKMYIHTSFPNNTFALGLDDPGTILWQDK-----PKQNPAARAVACCDLVNRGLAYWPGDGKTPALILKTQLDGNVAALNAETGETVWKVENSDIKVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVKTGEQVWRAYATGPDKDLLLASDFNI----KNPHY-----------------------------GQKGLGTGTW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFGRDADTGEAKFGYQKTPHDEWDYAGVNVMMLSEQK---DKDGKARKLLTHPDRNGIVYTLDRTDGALVSANKLDDTVNVFKSVDLKT-----GQPVRDPEYGTRM---DHLAKDICPSAMGYHNQGHDSYDPKRELFFMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNYEGLGQIKAYNAITGDYKWEKMERFAVWGGTMATAGDLVFYGTLDGYLKARDSDT-GDLLWKFKIPSGAIGYPMTYTHKGTQYVAIYYGVGGWPGVGLVFDLADPTAGLGAVGAFKKLANYTQM--GGGVVVFSLDGKGPYDDPNVGEWKSAAK-
M_extorquens_putMdhLarge -----MRAVHLLALGAGLAAASPAL--------ANESVLKGVANPAEQVLQTVDYANTRYSKLDQINASNVKNLQVAWTFSTGVLRGHEGSPLVVGNIMYVHTPFPNIVYALDLDQGAKIVWKYE-----PKQDPSVIPVMCCDTVNRGLAY--ADGA----ILLHQADTTLVSLDAKSGKVNWSVKNGDPSKGETNTATVLPVKDKVIVGISGGEFGVQCHVTAYDLKSGKKVWRGYSIGPDDQLIVDPEKTT----SLGKP-----------------------------IAKDSSLKTW-----EGDQWKTGGGCTWGWFSYDPKLDLMYYGSGNPSTWNPKQRPGDNKWSMTIWARNPDTGMAKWVYQMTPHDEWDFDGINEMILTDQ----KFDGKDRPLLTHFDRNGFGYTLDRATGEVLVAEKFDPVVNWATKVDLDKGSKTYGRPLVVSKYSTEQNGEDVNSKGICPAALGTKDQQPAAFSPKTGLFYVPTNHVCMDYEPFRVTYTPGQPYVGATLSMYPA----PGSH----GGMGNFIAWDNLQGKIKWSNPEQFSAWGGALATSGDVLFYGTLEGFLKAVDSKT-GKELYKFKTPSGIIGNVMTYEHKGKQHVAVLSGVGGWAGIGLAAGLTDPNAGLGAVGGYAALSSYTNL--GGQLTVFSLPNN-----------------
M_methylotrophus_MdhLarge --------------------------------MADADLDKQVNTAGAWPIATGGYYSQHNSPLAQINKSNVKNVKAAWSFSTGVLNGHEGAPLVIGDMMYVHSAFPNNTYALNLNDPGKIVWQHK-----PKQDASTKAVMCCDVVDRGLAY--GAGQ----IVKKQANGHLLALDAKTGKINWEVEVCDPKVGSTLTQAPFVAKDTVLMGCSGAELGVRGAVNAFDLKTGELKWRAFATGSDDSVRLAKDFNS----ANPHY-----------------------------GQFGLGTKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWGRDLDTGMAKWGYQKTPHDEWDFAGVNQMVLTDQ----PVNGKMTPLLSHIDRNGILYTLNRENGNLIVAEKVDPAVNVFKKVDLKT-----GTPVRDPEFATRM---DHKGTNICPSAMGFHNQGVDSYDPESRTLYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PNGP--TKKEMGQIRAFDLTTGKAKWTKWEKFAAWGGTLYTKGGLVWYATLDGYLKALDNKD-GKELWNFKMPSGGIGSPMTYSFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFRELQNHTQM--GGGLMVFSL--------------------
M_organophilum_MdhLarge ----MSRFVTSVSALAMLALAPAAL---SSVAYANDKLVELSKSDDNWVMPGKNYDSNNYSELKQVNKSNVKQLRPAWTFSTGLLNGHEGAPLVVDGKMYVHTSFPNNTFALDLDDPGHILWQDK-----PKQNPAARAVACCDLVNRGLAYWPGDGKTPALILKTQLDRHVVALNAETGETVWKVENSDIKVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVKTGGQVWRAYATGPDKDLLLADDFNV----KNAHY-----------------------------GQKGLGTATW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFGRDADTGEAKFGYQKTPHDEWDYAGVNVMMPSEQK---DKDGKTRKLLTHPDRNGIVYTLDRTDGALVSANKLDDTVNVFKTVDLKT-----GQPVRDPEYGTRM---DHLAKDVCPSAMGYHNQGHDSYDPKRELFFMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNYEGLGQIKAYNAITGSYKWEKMERFAVWGGTLATAGDLVFYGTLDGYLKARDSDT-GDLLWKFKIPSGAIGYPMTYTHKGTQYVAIYYGVGGWPGVGLVFDLADPTAGLGAVGAFKKLANYTQQ--GGGVIVFSLDGKGPYDDPNVGEWKSASK-
M_nodulans_MxaF MLGKIVVRCRTGVSVAALAALLPLA---GPSALANDKLVELSKSDGNWVMPGKNYDSDNYSKLKQINAENVKNLKVSWQFSTGLLNGHEGAPLVVDGTMYVHTSFPNNTFALGLDDPGKILWQDK-----PKQNPAARSVACCDLVNRGLAYWPGDGKTPSLILKTLLDGHVVALNAQTGETVWKIENSDIRVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVRTGEQKWRAYATGPDSDLLLAKDFNI----HNAHY-----------------------------GQKGLGTSTW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFARDVDTGEAKFGYQKTPHDEWDYAGVNVMMLSTQK---DRSGKERKLLTHPDRNGIVYTLDRTNGDLISAHKIDDTVNVFKTVDLKS-----GLPVRDPEYGTRM---DHLAKDICPSAMGYHNQGHDSYDPERKLFYMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNAEGLGQIKAYDAITGKFKWEKMERFAVWGGTLATAGNVVFYGTLDGFIKARHSDT-GELLWKAKLPSGAIGYPVTYTHKGTQYVAIYYGVGGWPGVGLVFDLQDPTAGLGAVGAFKKLANYTQM--GGGVTVFSLDGKGPYDDPNTGEYVAAN--
P_denitrificans_MdhLarge -MNRNTPKARGASSLAMAVAMGLAV-LTTAPATANDQLVELAKDPANWVMTGRDYNAQNYSEMTDINKENVKQLRPAWSFSTGVLHGHEGTPLVVGDRMFIHTPFPNTTFALDLNEPGKILWQNK-----PKQNPTARTVACCDVVNRGLAYWPGDDQVKPLIFRTQLDGHIVAMDAETGETRWIMENSDIKVGSTLTIAPYVIKDLVLVGSSGAELGVRGYVTAYDVKSGEMRWRAFATGPDEELLLAEDFNA----PNPHY-----------------------------GQKNLGLETW-----EGDAWKIGGGTNWGWYAYDPEVDLFYYGSGNPAPWNETMRPGDNKWTMAIWGREATTGEAKFAYQKTPHDEWDYAGVNVMMLSEQE---DKQGQMRKLLTHPDRNGIVYTLDRTNGDLISADKMDDTVNWVKEVQLDT-----GLPVRDPEFGTRM---DHKARDICPSAMGYHNQGHDSYDPERKVFMLGINHICMDWEPFMLPYRAGQFFVGATLTMYPG----PKGDRQNALGLGQIKAYDAISGEMKWEKMERFSVWGGTMATAGGLTFYGTLDGFIKARDSDT-GDLLWKFKLPSGVIGHPMTYKHDGRQYVAIMYGVGGWPGVGLVFDLADPTAGLGSVGAFKRLQEFTQM--GGGVMVFSLDGESPYSDPNVGEYAPGEPT
M_capsulatus_Bath_MdhLarge -----MQICKLASGCGGSMLAMAAVLA-AQSTHANSELDRLSKDDRNWVMQTKDYSATHFSRLTEINSHNVKNLKVAWTLSTGTLHGHEGAPLVVDGIMYIHTPFPNNVYAVDLNDTRKMLWQYK-----PKQNPAARAVACCDVVNRGLAYVPAGEHGPAKIFLNQLDGHIVALNAKTGEEIWKMENSDIAMGSTLTGAPFVVKDKVLVGSAGAELGVRGYVTAYNIKDGKQEWRAYATGPDEDLLLDKDFNK----DNPHY-----------------------------GQFGLGLSTW-----EGDAWKIGGGTNWGWYAYDPKLDMIYYGSGNPAPWNETMRPGDNKWTMTIWGRDADTGRAKFGYQKTPHDEWDYAGVNYMGLSEQ----EVDGKLTPLLTHPDRNGLVYTLNRETGALVNAFKIDDTVNWVKKVDLKT-----GLPIRDPEYSTRM---DHNAKGICPSAMGYHNQGIESYDPDKKLFFMGVNHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGM------LGQVKAMNAVTGKMEWEVPEKFAVWGGTLATAGDLVFYGTLDGFIKARDTRT-GELKWQFQLPSGVIGHPITYQHNGKQYIAIYSGVGGWPGVGLVFDLKDPTAGLGAVGAFRELAHYTQM--GGSVFVFSL--------------------
M_capsulatus_Bath_putMdhLarge -MKKPVKSWLIASSIASLLAVPGVS-------FANAEVEALTKDPKNFATWGGNYAGTRYSTLDQINFKNAKHLQPVWTFSTGMLRGHEGGPLVVNDVIYIHTGYPHKVYALDQATQ-SVIWEYVYAPDKGTDQSQVISVMCCDVVNRGLAY--GDGK----IFLAQGDATLVALDAKTGKIVWKVKNGDPKTGMTATNAPLVVKDKVLTGISGGEFGVRGFLAAYNIKDGSLVWKKYSMGPDDEVGLDPEHTMTWTDGKMAP-----------------------------VGKDSSLKTW-----QGDQWKIGGGTTWGWYSYDPDLNLVYYGSGNPSTWNPVQRPGDNKWSMTIWARDVDTGEAKWVYQMTPHDEWDYDGINEMMLIDQEMTAKDGSKHSKLLTHFDRNGFGYTLDRVTGELLVAEKFDKAVNWATHVDMKT-----GRPQVNPKYSTQHGGQDVDTKGICPSAMGAKNEPPVTYSPRTKLIYIPGNHTCMNYEPFEVEYTAGQPYVGATLNIFPARANVKTGEKESSNHMGSFTAWDPTTGTIAWQFDEPFSLWSGMVSTAGDIVIYGTLEGYLKVRDAKT-GEELYRFKTPSGIIGNVSTWTYNGKQYIGVLSGIGGWAGVGMAAGLEGDTEGLGAVGAYKGLSSHTKL--GGVFTVFALP-------------------
S_meliloti_1021_MdhLarge -------MKRLLTMLAIMSIGGGAQ-----VAFANDELQKLIDDPNQWAIQTGDYANLRYSKLDQINKDNVGKLQVAWTFSTGVLRGHEGSPLVIGDLMYVHTPFPNTVYALDLSKDGQIVWKYE-----PKQDPNVIPVMCCDTVNRGVAY--ADNK----IFLHQADTTVVALDAKTGKVIWSVKNGDATKGETNTATVMPVKDKILVGISGGEFGVRGHVTAYSMADGKVLWRGYSMGPDSDTLIDPEKTT----HLGKP-----------------------------VGKDSGLTTW-----EGDQWKIGGGTTWGWYSYDPEENLVYYGTGNPSTWNPTQRPGDNRWSMTIFARDVDTGMAKWLYQMTPHDEWDYDGVNEMILTEQ----QIDGKDRKLLTHFDRNGFGYTMDRVTGELLVAEKYDPTVNWATEVVMDPKSDKYGRPQVVAQYSTEQNGEDTNTTGVCPAALGTKDQQPAAYSPKTELFYVPTNHVCMDYEPFRVSYTAGQPYVGATLSMYPP----KDSH----GGMGNFIAWDNKEGKIKWSLPEPFSVWSGALATAGDVVFYGTLEGYLKAVDAAT-GKELYRFKTPSGVIGNVMTYAREGKQYVAVLSGVGGWAGIGLAAGLTNPTEGLGAVGGYSALSNYTAL--GGTLTVFKLPE------------------
S_meliloti_MdhLarge -------MNRLLTMLAIMSIGGGAQ-----VAFANDELQKLIDDPNQWAIQTGDYANLRYSKLDQINKDNVGKLQVAWTFSTGVLRGHEGSPLVIGDLMYVHTPFPNTVYALDLSKDGQIVWKYE-----PKQDPNVIPVMCCDTVNRGVAY--ADNK----IFLHQADTTVVALDAKTGKVIWSVKNGDATKGETNTATVMPVKDKILVGISGGEFGVRGHVTAYSMADGKVLWRGYSMGPDSDTLIDPEKTT----HLGKP-----------------------------VGKDSGLTTW-----EGDQWKIGGGTTWGWYSYDPEENLVYYGTGNPSTWNPTQRPGDNRWSMTIFARDVDTGMAKWLYQMTPHDEWDYDGVNEMILTGQ----HIDGKDRKLLTHFDRNGFGYTMDRVTGELLVAEKYDPTVNWATEVVMDPKSDKYGRPQVVAQYSTEQNGEDTNTTGVCPAALGTKDQQPAAYSPKTELFYVPTNHVCMDYEPFRVSYTAGQPYVGATLSMYPP----KDSH----GGMGNFIACDNKEGKIKWSLPEPFSVWSGALATAGDVVFYGTLEGYLKAVDAAT-GKELYRFKTPSGVIGNVMTYAREGKQYVAVLSGVGGWAGIGLAAGLTNPTEGLGAVGGYSDLSNYNAL--GGTLTEFKLPE------------------
B_japonicum_USDA_110_MdhLarge -----MRKVLLATYLGSAAALAVGS------ASANDELIKMSQNPKDWVMPAGDYANTRYSKLNQINAQNVGKLQVAWTFSTGVLRGHEGGPLIIGNMMYVHTPFPNKVYAIDLSNENKIVWKYE-----PKQDPNVIPVMCCDTVNRGLSY--GDGK----IILHQADTNLVALDAKTGQVAWSATNGDPSKGQTGTSAALVVKDKVLVGISGGEFGVQCHVTAYDLKSGKQVWRAFSEGPDDQIKVDPAKTT----SLGKP-----------------------------VGADSSLKTW-----QGDQWKIGGGCTWGWMSYDPALNLVYYGSGNPSTWNPKQRPGDNKWSMTIFARDADTGMAKWVYQMTPHDEWDYDGVNEMILSDQ----QINGQARKLLTHFDRNGLGYTMDRESGELLVAEKYDPKVNWTSGVDMDKNSPTYGRPKVLDAASTDKAGEDHNVKGICPAALGTKDEQPAAYSPDTQLFYVPTNHVCMDYEPFKVSYTAGQPYVGATLSMYP-----PQGE----SHMGNFIAWDGKTGKIVWSNKEQFSVWSGALATAGGVVFYGTLEGYLKAVDAKS-GKELYKFKTPSGIIGNVTTYENGGKQYVAVLSGVGGWAGIGLAAGLTDPTAGLGAVGGYAALSNYTAL--GGTLTVFSLPAN-----------------
HTCC2181_putMdhLarge -MEINMQLNKIKLALGFAAAATMAM---PMVASAAADQEKAMSNANNWAHPRGQHDNQAYSKLTQLNKGNVKNLKAAWTFATGVNRGHEGSPLVIGNMMYVHTAFPNNVYALDLNNDQKIVWSYF-----PKQDPSVQAVLCCDNVNRGLGF--GDGK----IFLQQNDGMLVALDAKTGAKVWDASNVDPKVGATNTNAPHVIKDKVLTGCSGAEFGVRCFMAAYNIADGSLAWKAMSTGPDSEVLIGADFNK----ENPLYSALSVYEDVNGGNKQGGSFKKIPTDQLQGGVADLGVKTWLKPQAVKDGWQHGGGSVWGWWPYDAKTNLVYYGTGNPSVWNPDVRPGDNKWSMTVFARDLDTGMARWGMQMTPHDEWDYDGINEVILFDK------GGK--TYAWHHDRNGFAYTWTAAEGTLVAAEKVHPFVNWATDVDLKS-----GVPNKLAEHSTHQ---DYNTKGTCPAALGTKDQQPAAYSPKTGLIYSPLNHVCMTYEPVESKYVAGQPWVGATLTMFAG----PDGV------MGGFAAYDPMTNKKVWYNKEKFSAWGGAMTTASDLVFYGTLDRWFKALDAKS-GKELWKFQVGSGVIGNAFTYAHKGKQHVGVLSGIGGWAGVAMNLGLTNDTDALGAAGGYKELTKYNAAPGGGGLTVFSL--------------------
M_flagellatus_KT_Mfla_2314 -----MEMSKIKLALGVAFGMAMAV---PMVASAAADQEAAMKDPNNWVHPRGQHNNQGYSTLNQINKSNVKNVKAAWQFATGVNRGHEGSPVVIGNMMYVHTAFPNNVYALDLNDNQKIVWAYF-----PKQDPSVQAVLCCDNVSRGLGY--GDGK----IFLQQNDGVLVALDAKTGAKVWDVKVNDPKVGATNTNAPHVIKDKVLTGCSGAEFGVRCFLVAYNIKDGSQVWKAYSTGPDSEVLIGKDFNK----DNPHYSALSVYEDINGGNKEGGSFKALPKEKLKFPETDLGVKTWLKPQAVKDGWQHGGGSTWGWWPYDHRTNLVYYGTGNPSVWNPDVRPGDNKWSMTIFARDVDTGIAKWGYQMTPHDEWDYDGINEVILFDK------GGK--TYAWHHDRNGFAYTFDAHTGSIIAAEKVHPFVNWATHVDKAT-----GIPHKDGKYSTHQ---DYNTKGICPAALGTKDQQPAAYSPRTGLVYSPLNHVCMTYEPVESKYIAGQPWVGATLTMFAG----PDGV------MGGFAAYDPMTNKAKWYNKEKFSAWGGALATASDLVFYGTLDRWFKALDAQT-GKELWKFQVGSGVIGNAFTYSHKGKQYVGVLSGIGGWAGVAMNLGLENETDALGAAGGYKELTKYNAAPGGGALNVFSL--------------------
M_flagellatus_KT_Mfla_0344 -----MKAKHLKIALGASLIGLASI-----PGFAATDLEKLMQDDNQWATQRKDYANTGYSKLSQINQSNIKNLKAAWTFATGVNRGHEGAPLVVDDTLYFTTAFPNNVYALDLNNEEKIKWSYF-----PKQDPSVQALLCCDNVTRGLAY--GDGK----IFLQQNDGQLVALDAKTGAKVWEVGVVDVKQGATTTNAPHVFKDKVITGCSGGEYGVRCYLTAYNIKDGSIAWRAYATGPDSDVLIGKDFNK----DNPHYSALSVYEDINGGNQEGGSFKALPKEKLKFPETDLGVKTWLKPQAAKNGWEHGGGGTWGWYSYDPALNLVYYATGNPSVWNPDVRPGDNKWSMTIFARDLDTGLAKWGYQLTPHDEWDYDAVNETILWDA------DGK--KLATHFDRNGFGYTLDRQTGKLLVAEKMHPFVNWATGIDLTT-----GTPIKDPKYATHQ---DVQISGICPAALGVKDQQPAAYSPKTKLFYVPLNHVCMDYEPVEVKYVAGQPWVGATLSMYPG----PDGV------MGGFMAWDGLKGKQVWYKKEKFSVWSGALATATDIVFYGTLDRWFKAVDAKS-GKELWKFRVGSGVVGNPITYTHKGKQYVALLSGIGGWAGVAMNLGLTNDNEGLGAAGGYKELKEWNAAPGGGALNVFSL--------------------
M_flagellatus_KT_Mfla_1717 -----MRISPFLIVVSLNLVSAVSS--------AADPLLPLLQDDKQWVSPRKDYYNQGYSRLSQINHRNVANLKLAWSFSTGVQRGHEGAPLVVGNVMYVHTAFPNNVYALDLDHDQKILWAYF-----PKQTSDIEAILCCDSVSRGLAY--GDGK----ILLQQNNGILLALDARDGRKVWEVQVNDPRMGATNTNAPYVFRDKVLTGCSGGEFGVRCFLAAYALQDGRLQWKAYSTGPDSEVLIGPGFNA----ENPHYSALSTYEDVNGGNREGGSFRALPKERLKFPETELGVKTWLKPQAAANGWQHGGGPVWGWFSYDPKLNLVYYGTGNPSVWNPDIRPGDNKWAMTLFARDLDTGMARWGYQLTPHDEWDYDSVNELILWDQ-------GS-RKLATHFDRNGFAYTLDRQNGKLLAAEKMHPFVNWATGIDLQT-----GIPLKDGRYATHE---DQETSAICPSAIGVKNIQPAAYSPQNGWFYVPMNHLCMTYEAVEAKYVAGQPWAGASLSMYPG----PDGD------MGAFMAWDALKHKPVWYTREKYPVWSGALATAGQLAFYGTLDRWFKALDARN-GKELWRFQVGSGIVGNPMTYQHAGKQYVAVFSGIGGWAGVALNQGLTGDSDGAGTAGAFRGLSQENAAPGSGALNVFAL--------------------
M_flagellatus_KT_MxaF ---MKGRVSHVGISAAVSSLLVLAT---VQGAQANQEVLNLQKNADNWALQTGNYTGQHNSTLSQINKSNVKNLKAAWSFSTGVLHGHEGAPLVIGDMMYVHSAFPNNTFAINLNDPGVIAWQHK-----PKQIASVKAVACCDIVNRGLAY--GDGK----IVKTQLDGRLVALDAKSGKIVWEIEVCDPKVGATLTQAPFVAKNSVLVGCSGAELGVRGAVNSFNLKTGELQWRAFATGPDEEVRLAKNFNS----DNPHY-----------------------------GQFGLGLKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWARDLDTGEAKWGYQKTPHDEWDFAGVNQMILSDH----KVDGKVTPLLTHIDRNGIMYTLNRDNGNLIQAAKVDPAVNVFKKVDLKT-----GTPVRDPEFSTRM---DHKSTNVCPSAMGFHNQGLDALDLDEPIVYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PSGP--TKKEMGQVRAMDIVTGKYKWTKWEKFAVWGGTLATKGGLVVYNTLDGYIKALDKDN-GKDLWKFKMPSGGIGAPMTYQFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFKELQNHTQM--GGGLMVFSL--------------------
P_denitrificans_xoxF -------MKNLMNGACLALLMSGTA------ALANEQRAGRDRQAPQWAIQMGDYANTRYSTLDQINKDNVKDLRVAWTFSTGVLRGHEGSPLVIGDVMYVHTPFPNRVFALDLNDNGKILWRYE-----PQQDPNVIAVMSCDTVYRGLSY--ADGM----ILLGQADTTVVALDATSGEVKWSTKIGDPGIGETLTATVVPVKDKVLVGISGGEYGVRGRMTALNLTDGSEAWKAWSTGPDEELLVDPETTT----HLGKP-----------------------------IGADSSLNSW-----EGDQWQIGGGTIWGWFSYDPDLNLVYYGTGNPSTWNPSQRPGDNKWSMTIMARDADTGMAKWFYQMTPHDEWDYDGVNEMILTNQ----TVDGQERKLLTHFDRNGLAYTMDRETGELLVAEKYDPVVNWTTGVDMDPNSETYGRPAVVAEYSTAQNGEDENTTGVCPAALGTKDQQPAAFSPKTNLFYVPTNHVCMDYEPFRVAYTAGQPYVGATLSMYPA----PNSH----GGMGNFIAWHNTTGEIKWSVPEQFSVWSGALATAGDVVFYGTLEGYLKPVDAQT-GEELYKFKTPSGIIGNVMTYEHGGKQYVGILSGVGGWAGIGLAAGLTNPNDGLGAVGGYASLSQYTEL--GGQLTVFELPG------------------
B_fungorum_GluDH -----MNLRTTVLGLAILASAALSS----FVAQADSQLDGLMKNPSNWAAQAGDYANHRYSPLKQINENNVGKLQVAWTMSTGVLRGHEGAPLVIGDTMYIHSPFPNKVIAINLKDQ-TFIWQYL-----PKQDDQVVSVMCCDTVNRGLAY--GDGK----IFLQQADTKLVALNAKTGDVVWTAQNGNPKAGETNTNAPHVFGDKVLTGISGGEFGVRGRLIAYDIKTGKPAWTAYSTGPDKDMLIDPDKTTTYADGKMVP-----------------------------VGADSSLKSW-----KGDQWKLGGGTTWGWYAWDPKLNLVYYGTGNPGTWNPTQRPGDNKWSMSIFARDLNTGQARWVYQMTPHDEWDYDGVNEMILSDL----SIDGKKVPAIVHFDRNGFGYTLNRETGQLLVAQKFDPAVNWADHVDMKS-----GKPIRNAAYSTQAAGSDHNVKGICPAALGSKDQQPAAYDPGSSLFLVPTNHVCMDYEPFDVDYVSGQPYVGATLSMYPG----PNDN----NSMGNFIAWDASKGKIVWSKPERFSVWSGVLATAGGVAFYGTLEGYIKAVRIKD-GKELWRFKTPSGIIGNVFTYEYQGKQFIGVYSGIGGWAGIGMAAGLEKSTEGLGAVGGYRELAKYTAL--GGTLFVFAIPGGNS---------------
H_methylovorum_mxaF MGMKKVVSTPMLMSASCMAIAVALQVGVASSAYANDKLIELSKSNENWVMPGKNYDSNNYSESTQVNAENVKQLKHAWSFSTGELHGHEGAPLVIGDMMYVHSSFPNKTFALNLNDPGHILWQHS-----PKQDPAARSVACCDLVNRGLAYWPGDDKTPALVIKTQLDGHLVALNAKTGEEFWKVENGDIKVGQTLTQAPYVVHDLAIVGSSGAELGVRGHVTAYNVKTGEQAWRYYATGPDEEIGLADDFNS----ANPHY-----------------------------GQKGLGTATW-----EGDAWKIGGGTNWGWYAYDPQANLIYYGSGNPAPWNETMRPGDNKWTMTITARDADTGKMKFGYQKTPHDEWDFAGVNVIMLSEQT---DKEGKKRKLLTHPDRNGIVYTLDRENGDLISADKLDDTVNVFKHVDLKS-----GLPVRDPEFGTRM---DHKGTEICPSAMGYHNQGHDSYDPTKQLFFMGINHICMDWEPFMLPYRAGQFFVGATLWMYPG----PKGDRQNYLGLGQIKAYNAITNTYKWEHMERFSVWGGTLATAGNLVFYGTLDGFLKARNSDT-GELLWKHKLPSGVIGYPMTYEHKGVQYIAVMSGVGGWPGVGLVFDLQDPTAGLGAVGAFKNLQRYTQM--GGSLEVFSLDGKNPYDDVNVANWTKGCI-
R_sphaeroides_GluDH -------MKMLKRGLAATLLLSSAP------VYANDSVMQAIGDSTQWAIQTGDYANTRYSELDQINRENVGKLQVAWTFSTGVLRGHEGSPLVIDGIMYVHTPFPNNVYALDLNNEGRILWRYE-----PQQNPDVVGVMCCDTVNRGVAY--ADGM----IFLHQADTTIVALDAKSGEVKWSVVNGDPTKGETNTATVLPVKDKILVGISGGEFGVRGHLTAYDMQTGEQVWRAFSTGPDEEMLVDPEQTT----HLGKP-----------------------------IGPNSSLESW-----EGDQWQIGGGTTWGWYSYDPDLNLVYYGTGNPSTWNPSQRPGDNKWSMTIMARDVDTGMAKWFYQKTPHDEWDYDGVNENILVDQ----EIDGQMRKLLVNLDRNGFGYTLDRETGELLVAEKYDPAVNWATEVVMDPESDQYGRPQVVAEYSTAQNGEDTNTTGVCPAALGSKDQQPAAFSPKTGLFYVPTNHVCMDYEPYRVSYTAGQPYVGATLSMYPA----PDSH----GGMGNFIAWDATKGEIKWSLPEQFSVWSGALATAGDVVFYGTLEGHLKAIDAET-GELLYRFKTPSGVIGNVMTYELNGKQYIGILSGVGGWAGIGLAAGLTNPNEGLGAVGGYAALSDYTEL--GGQLTVFAVPD------------------
Methylobacillus_sp_SK5_MdhLarge ---MKGRVTHVGISAAVSSLLVLAT---MQGAQANQDLQNLTKNADNWALQTGNYTGQHNSTLSQINKGNVKNLKAAWSFSTGVLHGHEGAPLVIGDMMYIHSAFPNNTFAVNLNDPGVIAWQHK-----PKQIASVKAVACCDIVNRGLAY--GDGK----IVKTQLDGKLVALDAKSGKVVWEIEVCDPKVGATLTQAPFIVKNTVLVGCSGAELGVRGAVNSFNLKTGELQWRAFATGPDEEVRLAKNFNS----DNPHY-----------------------------GQFGLGLKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWARDLDTGEAKWGYQKTPHDEWDFAGVNQMILSDH----KVDGKVTPLLTHIDRNGIMYTLNRDNGNLVQAAKVDPAVNVFKKVDLKT-----GTPVRDPEFSTRM---DHKSTNVCPSAMGFHNQGLDALDLDEPIVYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PSGP--TKKEMGQVRAMDIVTGKYKWTKWEKFAVWGGTLATKGGLVAYNTLDGYIKALDKDN-GKELWKFKMPSGGIGAPMTYQFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFKELQNHTQM--GGGLMVFSL--------------------
M_petroleiphilum_PM1_Mpe_A3393 MKVSKHSGWRLMRPLGLALLAIPAV------VQANADVEKNIANSKNWAMQAGDMFNQRYSKLDQINKGNVGKMQVAWTFSTGVLRGHEGSPLVIDGTMYLHSPFPNKVFAIDLNTQ-KILWKYE-----PKQDPAVIPQMCCDTVNRGLAY--AEGK----VILQQADSNLVALDAKSGKVVWSVKNGDPKLGAVNTNAPHVFKDKVITGISGGEWGVRGFIAAYNLKDGKPAWKGYSVGPDAEMLIDPAKTTTWIDGKVAP-----------------------------VGADSSLKTW-----KGDQWKIGGGTTWGWYSYDKALNAMYYGTGNPSTWNPSQRPGDNKWSMSIWSRDVDTGKVNWVYQMTPFDEWDFDGINEMILADI----NVKGKPTKALVHFDRNGFAYTMDRTNGALLVAEKYDPKVNWATHVDMKT-----GRPQVVKQYSTAQNGPDVNTKGICPAALGSKDQQPASFDPNTKLFYVPTNHVCMDYEPFKVEYTAGQPYVGATLSMFPA----PGSH----GGMGNYITWDAGTGKIVQSKAEKFSVWSGSLNTAGGLSCYGTLEGYFKCVDAKDISKELFKFKTPSGIIGNVFTYEHKGKQYMGVFSGIGGWAGIGMAAGLEKDQDGLGAVGGYKELNQYTEL--GGSLTVFALPN------------------
);
# The list of peptides that are to be highlighted.
# Entries are whitespace-separated; duplicates are removed by unique().
# Each peptide is later upper-cased and located by exact substring search in
# the ungapped (gap characters removed) form of every aligned sequence, so a
# peptide may span alignment gaps but must otherwise match letter for letter.
our @peptides = unique(qw(
TNLVYYGTGNPSVWNPDVRPGDNK AMSNSANWAHPR VITGCSGAEFGVR LTQVNKGNVK EAAHSTHQDYNTK
GLGFGNGKIFLQQNDGNLVALNAK QDPSVQAVLCCDNVNR NGFAYTWVASNGTLVSAEK KIPTDQLQGGVADLGVK
VGATNTNAPHVIKDK VLTGCSGAEFGVR AAWTISTGVNR GQHDNQAYSK RIPNDQLQAPVADLGIK
WGMQMTPHDEWDYDGINEVILFDKGGK IFLQQNDGNLVALNAK VGATNTNAPHVINDK LTQLNKGNVK
DQQPAAYSPK VVWSYFPKQDPSVQAVLCCDNVNR DGWQHGGGSTWGWWPYDAK TGLIYSPLNHVCMTYEPVESK
AAWTLSTGVNR WGMQMTPHDEWDYDGINEVILFDK RIPNDQLQGGVADLGLK DGWQHGGGSVWGWWPYDAK
VGATNTNAPHVIK LTQINKGNVK VWSTLNTDPK ELTKYNAAPGGGALTVFAL TWLKPQAVK VVWSYFPK
));
# The phylogenetic tree in Newick (parenthetical) format, i.e. the tree string as it appears inside a Nexus file.
# Names must be identical to those in the alignment.
# Make sure the string ends in "):0.0;", as in the example below.
our $tree = "((((M_capsulatus_Bath_MdhLarge:0.25935233153341924472,((H_methylovorum_mxaF:0.20878168596712515237,(M_nodulans_MxaF:0.12190741832194466887,(M_extorquens_moxF:0.02274678088090198447,M_organophilum_MdhLarge:0.02307171873930398062):0.06011530645090353009):0.08366283963156732895):0.06995643960127288785,P_denitrificans_MdhLarge:0.20442045141664449970):0.12242880488028372310):0.14514292177919754789,((Methylobacillus_sp_SK5_MdhLarge:0.01600304631180017589,M_flagellatus_KT_MxaF:0.01526122172147443765):0.10767993129317506018,M_methylotrophus_MdhLarge:0.14280191029943056780):0.19923880678630695451):0.42551609073153434659,((M_flagellatus_KT_Mfla_1717:0.35231344928419872087,M_flagellatus_KT_Mfla_0344:0.14695574411891779154):0.06730097283858139934,(M_flagellatus_KT_Mfla_2314:0.06839777440088301574,HTCC2181_putMdhLarge:0.14290213930651829433):0.15771954238829591355):0.22366507158804760369):0.19229690543073024722,((M_petroleiphilum_PM1_Mpe_A3393:0.27375376371657816899,B_fungorum_GluDH:0.32568960184059214669):0.07065861624495217708,((M_extorquens_putMdhLarge:0.20052498854951039675,B_japonicum_USDA_110_MdhLarge:0.20384942278865467169):0.06929269248036029616,((R_sphaeroides_GluDH:0.13212536758338755405,P_denitrificans_xoxF:0.18460378179484654426):0.08680998014121153739,(S_meliloti_1021_MdhLarge:0.00000087024666238173,S_meliloti_MdhLarge:0.01447501190371387053):0.11910797865841461773):0.04195425485701579499):0.13380076964851062016):0.02532900152403316091,M_capsulatus_Bath_putMdhLarge:0.44978920877462219563):0.0; ";
# Parameters concerning the size of the .eps formatted output.
# All lengths are in PostScript points (the canvas is created with units => "pt").
our $height    = 225;  # Total canvas height.
our $width     = 600;  # Total canvas width.
our $buffer    = 20;   # Margin kept free around the drawing.
our $textsize  = 6;    # Font size; line width and text offsets are derived from it.
our $tree_zoom = 1;    # Multiplier applied to every branch length of the tree.
# Output file. Must be EPS. To convert to other formats, use Imagemagick (`convert out.eps out.png`)
# or ghostscript (`gs -sDEVICE=pdfwrite -sOutputFile="out.pdf" -dNOPAUSE -dEPSCrop -f out.eps -c quit`)
our $outfile = "out.eps";
# That's all
#------------------------------------------------------------------------#
# Don't change anything below this line unless you know what you are doing
# Do some quick sanity checks on the alignment and the peptide list.
# Side effects: defines $alignment_len (taken from an arbitrary sequence, all
# others must match it) and upper-cases every sequence in %proteins and every
# entry of @peptides in place.
die ("No input alignment.\n") unless (scalar(keys %proteins));
our $alignment_len = length((values %proteins)[0]);
die ("Error: Zero-length alignment.\n") unless ($alignment_len);
# Walk a snapshot of the keys instead of each(), since the values are
# rewritten inside the loop.
foreach my $id (keys %proteins) {
    my $seq = $proteins{$id};
    die ("Sequences are not aligned.\n") if (length($seq) != $alignment_len);
    # These characters are structural in the tree string (or, for whitespace,
    # in the input list), so a name containing them could never be matched
    # against a leaf of the tree.
    die ("Name '$id' contains invalid characters.\n") if ($id =~ m/[\(\)\:\,\s]/);
    # All-digit names are reserved: the tree parser uses bare numbers to refer
    # to internal nodes.
    die ("Sequence names cannot be all numbers ($id).\n") if ($id =~ m/^\d+$/);
    $proteins{$id} = $seq = uc($seq);
    # Report the offending entry by name; the sequence itself is far too long
    # to be a useful error message.
    die ("Invalid sequence for '$id'.\n") unless ($seq =~ m/^[\-ARNDCEQGHILKMFPSTWYV]+$/);
}
if (scalar(@peptides)) {
    # $pep aliases the array element, so the upper-cased value is stored back.
    foreach my $pep (@peptides) {
        $pep = uc($pep);
        die("Invalid peptide '$pep'.\n") unless ($pep =~ m/^[\-ARNDCEQGHILKMFPSTWYV]+$/);
    }
}
else {
    print STDERR "Warning: No peptides provided in input. Proceeding...\n";
}
# Find out how often each amino acid occurs at each position.
# $consensus[$col]{$char} is the fraction of sequences that carry $char
# (the gap character '-' is counted like any other) in alignment column $col.
our @consensus = ();
for (my $col = 0; $col < $alignment_len; $col++) {
    my %count = ();
    foreach my $seq (values %proteins) {
        $count{substr($seq, $col, 1)}++;
    }
    my $total = sum(values %count);
    my %freq = map { $_ => $count{$_} / $total } keys %count;
    push @consensus, \%freq;
}
# Digest the tree data into a form we can recurse over.
# The first innermost "(...)" group of $tree is repeatedly replaced by the
# index of a new entry in @nodes until no parentheses remain.
#   $nodes[$i]   - array ref of children, each { name => ..., len => ... };
#                  an all-digit name refers to another entry of @nodes.
#   $members[$i] - number of leaves below node $i.
our @nodes = ();
our @members = ();
while ($tree =~ m/(\([^\(\)]+\))/) {
    my $clade = $1;
    my $node_id = scalar(@nodes);
    substr($tree, index($tree, $clade), length($clade), $node_id);
    my @children = ();
    my $leaf_total = 0;
    foreach my $entry (split(',', substr($clade, 1, -1))) {
        my ($name, $len) = split(':', $entry);
        push @children, { 'name' => $name, 'len' => $len };
        if ($name !~ /^(\d+)$/) {
            # A real leaf: it must be one of the aligned sequences.
            die ("Name in tree '$name' not in the alignment.\n") if (!defined($proteins{$name}));
            $leaf_total++;
        }
        else {
            # A previously collapsed group: inherit its leaf count.
            $leaf_total += $members[$name];
        }
    }
    push @nodes, \@children;
    push @members, $leaf_total;
}
# Set up variables to describe our drawing area
my $scalebar_height = 10 + ($textsize * 2);
# The last node created by the tree parser is the outermost group, so its
# member count is the number of leaves in the whole tree.
my $n_leaves = $members[-1];
# Without this check an unparsable tree ends in "Illegal division by zero".
die ("Error: No nodes could be parsed from the tree.\n") unless ($n_leaves);
my $usable_height = $height - (2 * $buffer) - $scalebar_height;
# Each leaf gets one row of the usable height.
our $rect_height = $usable_height / $n_leaves;
our $rect_halfheight = $rect_height / 2;
# The alignment panel spans from 45% of the width to the right-hand margin.
our ($rect_x1, $rect_x2) = ($width * .45, $width - $buffer);
our $rect_width = ($rect_x2 - $rect_x1) + 1;
# One y coordinate per leaf; print_tree() consumes them in the order in which
# it visits the leaves.
our @names_ypos = map($buffer + $usable_height * $_ / $n_leaves, 0..($n_leaves-1));
# Small offsets that place a leaf label next to the end of its branch.
our $text_x_adjust = ($textsize * .3);
our $text_y_adjust = -1 * ($textsize * .3);
our $linesize = $textsize / 10;
# Direct method call instead of the ambiguous indirect-object form "new Class(...)".
our $p = PostScript::Simple->new(colour => 1, xsize => $width, ysize => $height, units => "pt");
$p->setfont("Arial", $textsize);
$p->setlinewidth($linesize);
# Draw the scale bar for the genes
my $scale_ypos = $height - $buffer - $scalebar_height + 2;
# Tick spacing: start from the number of alignment columns covered by roughly
# 50 points of panel width, then round down to one significant digit
# (e.g. 123 -> 100) so that the tick labels are round numbers.
my $interval = int(($alignment_len / $rect_width) * 50);
# A very short alignment would give 0 here, which would divide by zero below
# and never advance the tick loop; fall back to one tick per column.
$interval = 1 if ($interval < 1);
my $base = 10 ** (length($interval) - 1);
$interval = int($interval / $base) * $base;
# Extend the bar to the first multiple of the interval at or past the end of
# the alignment.
my $extended_len = $interval * int($alignment_len / $interval);
$extended_len += $interval if ($extended_len < $alignment_len);
my $extended_x2 = $rect_x1 + $rect_width * ($extended_len / $alignment_len);
$p->line($rect_x1 - .5 * $linesize,$scale_ypos, $extended_x2 + .5 * $linesize,$scale_ypos);
# One tick mark plus numeric label per interval.
for (my $tick = 0; $tick <= $extended_len; $tick += $interval) {
    my $x = $rect_x1 + $rect_width * ($tick / $alignment_len);
    $p->line($x,$scale_ypos, $x,$scale_ypos+4);
    $p->text( {align => 'center'}, $x,$scale_ypos+4+$textsize*.2, $tick);
}
$p->text( {align => 'center'}, ($rect_x1+$extended_x2)/2,$scale_ypos+8+$textsize+$textsize*.2, "Position in amino acid alignment");
# Call the drawing command and save the image.
# Once every "(...)" group has been collapsed, $tree should consist of the
# root's node index followed by ":<branch length>".
if ($tree =~ m/(\d+)\:([0-9\.]+)/) {
    print_tree({'name' => $1, 'len' => $2}, 0);
    $p->output($outfile);
}
else {
    # Previously the script ended silently here without writing any output.
    die ("Error: Could not find the root of the tree. Make sure the tree string ends in \"):0.0;\".\n");
}
# print_tree($node, $dist)
# Recursively draws the subtree rooted at $node and, for every leaf, the
# matching row of the alignment panel.
#   $node - hash ref { name => ..., len => ... }; an all-digit name is an
#           index into @nodes (internal node), anything else is a key of
#           %proteins (leaf).
#   $dist - summed (zoomed) branch length between the root and the start of
#           this node's branch.
# Returns the y coordinate at which this node's branch was drawn.
# Side effects: draws on the global canvas $p and removes one entry from
# @names_ypos per leaf visited.
sub print_tree {
    my ($node, $dist) = @_;

    # Horizontal extent of this branch: one unit of zoomed branch length is
    # a quarter of the canvas width.
    my $x1 = $dist * ($width * .25) + $buffer - $linesize * .5;
    $dist += $node->{'len'} * $tree_zoom;
    my $x2 = $dist * ($width * .25) + $buffer;

    if ($node->{'name'} =~ m/^\d+$/) {
        # Internal node: draw the children first, then join them with a
        # vertical line and attach this branch at their mean height.
        my @subnodes = @{$nodes[$node->{'name'}]};
        my @ypos_arr = map { print_tree($_, $dist) } @subnodes;
        my $y_avg = avg(@ypos_arr);
        my $y_min = min(@ypos_arr);
        my $y_max = max(@ypos_arr);
        $p->setcolour(0,0,0);
        $p->line($x2,$y_min, $x2,$y_max);
        $p->line($x1,$y_avg, $x2,$y_avg);
        return $y_avg;
    }

    # Leaf: take the next free row.
    my $y_pos = shift @names_ypos;
    my $y_min = $y_pos - $rect_halfheight + $linesize * .5;
    my $y_max = $y_pos + $rect_halfheight - $linesize * .5;
    # Keep zero-length branches visible.
    $x2 = $x1 + $linesize if ($x2 - $x1 < $linesize);
    $p->setcolour(0,0,0);
    $p->line($x1,$y_pos, $x2,$y_pos);

    # Label: underscores become spaces and a leading single capital letter
    # gets a period ("M_extorquens" -> "M. extorquens").
    my $name = $node->{'name'};
    $name =~ tr/\_/\ /;
    $name =~ s/^([A-Z])\ /$1\.\ /;
    $p->text($x2+$text_x_adjust,$y_pos+$text_y_adjust, $name);

    my $protein_seq = $proteins{$node->{'name'}};

    # Grey box representing the gene, excluding leading and trailing gaps.
    # The captures are read only after a successful match so that stale
    # values from an earlier regex can never be used.
    $p->setcolour(200,200,200);
    my ($lead_gaps, $trail_gaps) = (0, 0);
    if ($protein_seq =~ m/^(\-*).*?(\-*)$/) {
        ($lead_gaps, $trail_gaps) = (length($1), length($2));
    }
    my $protein_start = $rect_x1 + $rect_width * ($lead_gaps / $alignment_len);
    my $protein_end = $rect_x2 - $rect_width * (($trail_gaps) / $alignment_len);
    $p->box( {filled => 1}, $protein_start,$y_min, $protein_end,$y_max);

    # White out gapped regions. @- and @+ give the start and end offsets of
    # the match without resorting to the $` match variable.
    $p->setcolour("white");
    while ($protein_seq =~ m/\-+/g) {
        my ($gap_start, $gap_end) = ($-[0], $+[0]);
        my $aa_x1 = $rect_x1 + $rect_width * ($gap_start / $alignment_len);
        my $aa_x2 = $rect_x1 + $rect_width * ($gap_end / $alignment_len);
        $p->box( {filled => 1}, $aa_x1,$y_min, $aa_x2,$y_max);
    }

    # Mapping of positions between gapped and ungapped: $u2g[$i] is the
    # alignment column of the $i-th residue of the ungapped sequence.
    my $protein_useq = $protein_seq;
    $protein_useq =~ tr/\-//d;
    my @u2g = grep { substr($protein_seq, $_, 1) ne '-' } 0..($alignment_len-1);

    # Heat map amino acids according to conservation if peptide was found.
    # Every occurrence (overlapping ones included) of every peptide is coloured.
    foreach my $pep (@peptides) {
        my $pep_start = -1;
        while (($pep_start = index($protein_useq, $pep, $pep_start + 1)) > -1) {
            my $pep_end = $pep_start + length($pep) - 1;
            foreach my $upos ($pep_start..$pep_end) {
                # @u2g only holds non-gap columns, so $aa is always a residue.
                my $aa_pos = $u2g[$upos];
                my $aa = substr($protein_seq, $aa_pos, 1);
                my $aa_x1 = $rect_x1 + $rect_width * ($aa_pos / $alignment_len);
                my $aa_x2 = $rect_x1 + $rect_width * (($aa_pos+1) / $alignment_len);
                # Frequency of this residue in its column: red grows and
                # green shrinks as the frequency rises.
                my $cons = $consensus[$aa_pos]->{$aa};
                $p->setcolour(255*$cons, 255-255*$cons, 0);
                $p->box( {filled => 1}, $aa_x1,$y_min, $aa_x2,$y_max);
            }
        }
    }

    # Black border around protein in alignment
    $p->setcolour(0,0,0);
    $p->box( {filled => 0}, $protein_start - .5 * $linesize,$y_min, $protein_end + .5 * $linesize,$y_max);
    return $y_pos;
}
# avg(@numbers)
# Returns the arithmetic mean of the arguments.
# Dies with a division-by-zero error when called without arguments.
sub avg {
    my @values = @_;
    my $total = 0;
    for my $value (@values) {
        $total = $total + $value;
    }
    return $total / scalar(@values);
}
# min(@numbers)
# Returns the numerically smallest argument, or undef for an empty list.
sub min {
    my ($lowest, @rest) = @_;
    for my $candidate (@rest) {
        if ($candidate < $lowest) {
            $lowest = $candidate;
        }
    }
    return $lowest;
}
# max(@numbers)
# Returns the numerically largest argument, or undef for an empty list.
sub max {
    my ($highest, @rest) = @_;
    for my $candidate (@rest) {
        if ($candidate > $highest) {
            $highest = $candidate;
        }
    }
    return $highest;
}
# sum(@numbers)
# Returns the total of the arguments; 0 for an empty list.
sub sum {
    my $total = 0;
    for my $term (@_) {
        $total = $total + $term;
    }
    return $total;
}
# unique(@items)
# Returns the arguments with duplicates removed, keeping the first occurrence
# of each item in its original position. Items are compared as strings.
# Going through "keys %hash" would return them in Perl's per-run randomised
# hash order, which made the drawing order of the peptides (and therefore the
# bytes of the output file) differ from run to run.
sub unique {
    my %seen = ();
    my @uniq = grep { !$seen{$_}++ } @_;
    return @uniq;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment