Last active
October 28, 2016 00:42
-
-
Save dansmith01/666144ea34af3dafaaca79a7b950e16b to your computer and use it in GitHub Desktop.
Generate an amino acid alignment with phylogenetic tree and detected peptides overlay.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#--------------------------------------------------------------# | |
# Author: Daniel Patrick Smith # | |
# Version: 1.0 / April 21st, 2010 # | |
# Affiliation: Giovannoni Laboratory, Oregon State University # | |
# Copyright: GNU General Public License v3 # | |
# Please Cite: Sowell et al, ISME (2009) 3, 93-105 # | |
#--------------------------------------------------------------# | |
use strict; | |
use warnings; | |
use PostScript::Simple; | |
# Change the following input data as needed: | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
# The amino acid sequence alignment, given as whitespace-separated name/sequence pairs. Names must not contain '(', ')', ':', ' ', or ','. Names must contain at least one non-numerical character.
our %proteins = qw( | |
M_extorquens_moxF ----MSRFVTSVSALAMLALAPAAL---SSGAYANDKLVELSKSDDNWVMPGKNYDSNNFSDLKQINKGNVKQLRPAWTFSTGLLNGHEGAPLVVDGKMYIHTSFPNNTFALGLDDPGTILWQDK-----PKQNPAARAVACCDLVNRGLAYWPGDGKTPALILKTQLDGNVAALNAETGETVWKVENSDIKVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVKTGEQVWRAYATGPDKDLLLASDFNI----KNPHY-----------------------------GQKGLGTGTW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFGRDADTGEAKFGYQKTPHDEWDYAGVNVMMLSEQK---DKDGKARKLLTHPDRNGIVYTLDRTDGALVSANKLDDTVNVFKSVDLKT-----GQPVRDPEYGTRM---DHLAKDICPSAMGYHNQGHDSYDPKRELFFMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNYEGLGQIKAYNAITGDYKWEKMERFAVWGGTMATAGDLVFYGTLDGYLKARDSDT-GDLLWKFKIPSGAIGYPMTYTHKGTQYVAIYYGVGGWPGVGLVFDLADPTAGLGAVGAFKKLANYTQM--GGGVVVFSLDGKGPYDDPNVGEWKSAAK- | |
M_extorquens_putMdhLarge -----MRAVHLLALGAGLAAASPAL--------ANESVLKGVANPAEQVLQTVDYANTRYSKLDQINASNVKNLQVAWTFSTGVLRGHEGSPLVVGNIMYVHTPFPNIVYALDLDQGAKIVWKYE-----PKQDPSVIPVMCCDTVNRGLAY--ADGA----ILLHQADTTLVSLDAKSGKVNWSVKNGDPSKGETNTATVLPVKDKVIVGISGGEFGVQCHVTAYDLKSGKKVWRGYSIGPDDQLIVDPEKTT----SLGKP-----------------------------IAKDSSLKTW-----EGDQWKTGGGCTWGWFSYDPKLDLMYYGSGNPSTWNPKQRPGDNKWSMTIWARNPDTGMAKWVYQMTPHDEWDFDGINEMILTDQ----KFDGKDRPLLTHFDRNGFGYTLDRATGEVLVAEKFDPVVNWATKVDLDKGSKTYGRPLVVSKYSTEQNGEDVNSKGICPAALGTKDQQPAAFSPKTGLFYVPTNHVCMDYEPFRVTYTPGQPYVGATLSMYPA----PGSH----GGMGNFIAWDNLQGKIKWSNPEQFSAWGGALATSGDVLFYGTLEGFLKAVDSKT-GKELYKFKTPSGIIGNVMTYEHKGKQHVAVLSGVGGWAGIGLAAGLTDPNAGLGAVGGYAALSSYTNL--GGQLTVFSLPNN----------------- | |
M_methylotrophus_MdhLarge --------------------------------MADADLDKQVNTAGAWPIATGGYYSQHNSPLAQINKSNVKNVKAAWSFSTGVLNGHEGAPLVIGDMMYVHSAFPNNTYALNLNDPGKIVWQHK-----PKQDASTKAVMCCDVVDRGLAY--GAGQ----IVKKQANGHLLALDAKTGKINWEVEVCDPKVGSTLTQAPFVAKDTVLMGCSGAELGVRGAVNAFDLKTGELKWRAFATGSDDSVRLAKDFNS----ANPHY-----------------------------GQFGLGTKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWGRDLDTGMAKWGYQKTPHDEWDFAGVNQMVLTDQ----PVNGKMTPLLSHIDRNGILYTLNRENGNLIVAEKVDPAVNVFKKVDLKT-----GTPVRDPEFATRM---DHKGTNICPSAMGFHNQGVDSYDPESRTLYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PNGP--TKKEMGQIRAFDLTTGKAKWTKWEKFAAWGGTLYTKGGLVWYATLDGYLKALDNKD-GKELWNFKMPSGGIGSPMTYSFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFRELQNHTQM--GGGLMVFSL-------------------- | |
M_organophilum_MdhLarge ----MSRFVTSVSALAMLALAPAAL---SSVAYANDKLVELSKSDDNWVMPGKNYDSNNYSELKQVNKSNVKQLRPAWTFSTGLLNGHEGAPLVVDGKMYVHTSFPNNTFALDLDDPGHILWQDK-----PKQNPAARAVACCDLVNRGLAYWPGDGKTPALILKTQLDRHVVALNAETGETVWKVENSDIKVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVKTGGQVWRAYATGPDKDLLLADDFNV----KNAHY-----------------------------GQKGLGTATW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFGRDADTGEAKFGYQKTPHDEWDYAGVNVMMPSEQK---DKDGKTRKLLTHPDRNGIVYTLDRTDGALVSANKLDDTVNVFKTVDLKT-----GQPVRDPEYGTRM---DHLAKDVCPSAMGYHNQGHDSYDPKRELFFMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNYEGLGQIKAYNAITGSYKWEKMERFAVWGGTLATAGDLVFYGTLDGYLKARDSDT-GDLLWKFKIPSGAIGYPMTYTHKGTQYVAIYYGVGGWPGVGLVFDLADPTAGLGAVGAFKKLANYTQQ--GGGVIVFSLDGKGPYDDPNVGEWKSASK- | |
M_nodulans_MxaF MLGKIVVRCRTGVSVAALAALLPLA---GPSALANDKLVELSKSDGNWVMPGKNYDSDNYSKLKQINAENVKNLKVSWQFSTGLLNGHEGAPLVVDGTMYVHTSFPNNTFALGLDDPGKILWQDK-----PKQNPAARSVACCDLVNRGLAYWPGDGKTPSLILKTLLDGHVVALNAQTGETVWKIENSDIRVGSTLTIAPYVVKDKVIIGSSGAELGVRGYLTAYDVRTGEQKWRAYATGPDSDLLLAKDFNI----HNAHY-----------------------------GQKGLGTSTW-----EGDAWKIGGGTNWGWYAYDPGTNLIYFGTGNPAPWNETMRPGDNKWTMTIFARDVDTGEAKFGYQKTPHDEWDYAGVNVMMLSTQK---DRSGKERKLLTHPDRNGIVYTLDRTNGDLISAHKIDDTVNVFKTVDLKS-----GLPVRDPEYGTRM---DHLAKDICPSAMGYHNQGHDSYDPERKLFYMGINHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGDRQNAEGLGQIKAYDAITGKFKWEKMERFAVWGGTLATAGNVVFYGTLDGFIKARHSDT-GELLWKAKLPSGAIGYPVTYTHKGTQYVAIYYGVGGWPGVGLVFDLQDPTAGLGAVGAFKKLANYTQM--GGGVTVFSLDGKGPYDDPNTGEYVAAN-- | |
P_denitrificans_MdhLarge -MNRNTPKARGASSLAMAVAMGLAV-LTTAPATANDQLVELAKDPANWVMTGRDYNAQNYSEMTDINKENVKQLRPAWSFSTGVLHGHEGTPLVVGDRMFIHTPFPNTTFALDLNEPGKILWQNK-----PKQNPTARTVACCDVVNRGLAYWPGDDQVKPLIFRTQLDGHIVAMDAETGETRWIMENSDIKVGSTLTIAPYVIKDLVLVGSSGAELGVRGYVTAYDVKSGEMRWRAFATGPDEELLLAEDFNA----PNPHY-----------------------------GQKNLGLETW-----EGDAWKIGGGTNWGWYAYDPEVDLFYYGSGNPAPWNETMRPGDNKWTMAIWGREATTGEAKFAYQKTPHDEWDYAGVNVMMLSEQE---DKQGQMRKLLTHPDRNGIVYTLDRTNGDLISADKMDDTVNWVKEVQLDT-----GLPVRDPEFGTRM---DHKARDICPSAMGYHNQGHDSYDPERKVFMLGINHICMDWEPFMLPYRAGQFFVGATLTMYPG----PKGDRQNALGLGQIKAYDAISGEMKWEKMERFSVWGGTMATAGGLTFYGTLDGFIKARDSDT-GDLLWKFKLPSGVIGHPMTYKHDGRQYVAIMYGVGGWPGVGLVFDLADPTAGLGSVGAFKRLQEFTQM--GGGVMVFSLDGESPYSDPNVGEYAPGEPT | |
M_capsulatus_Bath_MdhLarge -----MQICKLASGCGGSMLAMAAVLA-AQSTHANSELDRLSKDDRNWVMQTKDYSATHFSRLTEINSHNVKNLKVAWTLSTGTLHGHEGAPLVVDGIMYIHTPFPNNVYAVDLNDTRKMLWQYK-----PKQNPAARAVACCDVVNRGLAYVPAGEHGPAKIFLNQLDGHIVALNAKTGEEIWKMENSDIAMGSTLTGAPFVVKDKVLVGSAGAELGVRGYVTAYNIKDGKQEWRAYATGPDEDLLLDKDFNK----DNPHY-----------------------------GQFGLGLSTW-----EGDAWKIGGGTNWGWYAYDPKLDMIYYGSGNPAPWNETMRPGDNKWTMTIWGRDADTGRAKFGYQKTPHDEWDYAGVNYMGLSEQ----EVDGKLTPLLTHPDRNGLVYTLNRETGALVNAFKIDDTVNWVKKVDLKT-----GLPIRDPEYSTRM---DHNAKGICPSAMGYHNQGIESYDPDKKLFFMGVNHICMDWEPFMLPYRAGQFFVGATLNMYPG----PKGM------LGQVKAMNAVTGKMEWEVPEKFAVWGGTLATAGDLVFYGTLDGFIKARDTRT-GELKWQFQLPSGVIGHPITYQHNGKQYIAIYSGVGGWPGVGLVFDLKDPTAGLGAVGAFRELAHYTQM--GGSVFVFSL-------------------- | |
M_capsulatus_Bath_putMdhLarge -MKKPVKSWLIASSIASLLAVPGVS-------FANAEVEALTKDPKNFATWGGNYAGTRYSTLDQINFKNAKHLQPVWTFSTGMLRGHEGGPLVVNDVIYIHTGYPHKVYALDQATQ-SVIWEYVYAPDKGTDQSQVISVMCCDVVNRGLAY--GDGK----IFLAQGDATLVALDAKTGKIVWKVKNGDPKTGMTATNAPLVVKDKVLTGISGGEFGVRGFLAAYNIKDGSLVWKKYSMGPDDEVGLDPEHTMTWTDGKMAP-----------------------------VGKDSSLKTW-----QGDQWKIGGGTTWGWYSYDPDLNLVYYGSGNPSTWNPVQRPGDNKWSMTIWARDVDTGEAKWVYQMTPHDEWDYDGINEMMLIDQEMTAKDGSKHSKLLTHFDRNGFGYTLDRVTGELLVAEKFDKAVNWATHVDMKT-----GRPQVNPKYSTQHGGQDVDTKGICPSAMGAKNEPPVTYSPRTKLIYIPGNHTCMNYEPFEVEYTAGQPYVGATLNIFPARANVKTGEKESSNHMGSFTAWDPTTGTIAWQFDEPFSLWSGMVSTAGDIVIYGTLEGYLKVRDAKT-GEELYRFKTPSGIIGNVSTWTYNGKQYIGVLSGIGGWAGVGMAAGLEGDTEGLGAVGAYKGLSSHTKL--GGVFTVFALP------------------- | |
S_meliloti_1021_MdhLarge -------MKRLLTMLAIMSIGGGAQ-----VAFANDELQKLIDDPNQWAIQTGDYANLRYSKLDQINKDNVGKLQVAWTFSTGVLRGHEGSPLVIGDLMYVHTPFPNTVYALDLSKDGQIVWKYE-----PKQDPNVIPVMCCDTVNRGVAY--ADNK----IFLHQADTTVVALDAKTGKVIWSVKNGDATKGETNTATVMPVKDKILVGISGGEFGVRGHVTAYSMADGKVLWRGYSMGPDSDTLIDPEKTT----HLGKP-----------------------------VGKDSGLTTW-----EGDQWKIGGGTTWGWYSYDPEENLVYYGTGNPSTWNPTQRPGDNRWSMTIFARDVDTGMAKWLYQMTPHDEWDYDGVNEMILTEQ----QIDGKDRKLLTHFDRNGFGYTMDRVTGELLVAEKYDPTVNWATEVVMDPKSDKYGRPQVVAQYSTEQNGEDTNTTGVCPAALGTKDQQPAAYSPKTELFYVPTNHVCMDYEPFRVSYTAGQPYVGATLSMYPP----KDSH----GGMGNFIAWDNKEGKIKWSLPEPFSVWSGALATAGDVVFYGTLEGYLKAVDAAT-GKELYRFKTPSGVIGNVMTYAREGKQYVAVLSGVGGWAGIGLAAGLTNPTEGLGAVGGYSALSNYTAL--GGTLTVFKLPE------------------ | |
S_meliloti_MdhLarge -------MNRLLTMLAIMSIGGGAQ-----VAFANDELQKLIDDPNQWAIQTGDYANLRYSKLDQINKDNVGKLQVAWTFSTGVLRGHEGSPLVIGDLMYVHTPFPNTVYALDLSKDGQIVWKYE-----PKQDPNVIPVMCCDTVNRGVAY--ADNK----IFLHQADTTVVALDAKTGKVIWSVKNGDATKGETNTATVMPVKDKILVGISGGEFGVRGHVTAYSMADGKVLWRGYSMGPDSDTLIDPEKTT----HLGKP-----------------------------VGKDSGLTTW-----EGDQWKIGGGTTWGWYSYDPEENLVYYGTGNPSTWNPTQRPGDNRWSMTIFARDVDTGMAKWLYQMTPHDEWDYDGVNEMILTGQ----HIDGKDRKLLTHFDRNGFGYTMDRVTGELLVAEKYDPTVNWATEVVMDPKSDKYGRPQVVAQYSTEQNGEDTNTTGVCPAALGTKDQQPAAYSPKTELFYVPTNHVCMDYEPFRVSYTAGQPYVGATLSMYPP----KDSH----GGMGNFIACDNKEGKIKWSLPEPFSVWSGALATAGDVVFYGTLEGYLKAVDAAT-GKELYRFKTPSGVIGNVMTYAREGKQYVAVLSGVGGWAGIGLAAGLTNPTEGLGAVGGYSDLSNYNAL--GGTLTEFKLPE------------------ | |
B_japonicum_USDA_110_MdhLarge -----MRKVLLATYLGSAAALAVGS------ASANDELIKMSQNPKDWVMPAGDYANTRYSKLNQINAQNVGKLQVAWTFSTGVLRGHEGGPLIIGNMMYVHTPFPNKVYAIDLSNENKIVWKYE-----PKQDPNVIPVMCCDTVNRGLSY--GDGK----IILHQADTNLVALDAKTGQVAWSATNGDPSKGQTGTSAALVVKDKVLVGISGGEFGVQCHVTAYDLKSGKQVWRAFSEGPDDQIKVDPAKTT----SLGKP-----------------------------VGADSSLKTW-----QGDQWKIGGGCTWGWMSYDPALNLVYYGSGNPSTWNPKQRPGDNKWSMTIFARDADTGMAKWVYQMTPHDEWDYDGVNEMILSDQ----QINGQARKLLTHFDRNGLGYTMDRESGELLVAEKYDPKVNWTSGVDMDKNSPTYGRPKVLDAASTDKAGEDHNVKGICPAALGTKDEQPAAYSPDTQLFYVPTNHVCMDYEPFKVSYTAGQPYVGATLSMYP-----PQGE----SHMGNFIAWDGKTGKIVWSNKEQFSVWSGALATAGGVVFYGTLEGYLKAVDAKS-GKELYKFKTPSGIIGNVTTYENGGKQYVAVLSGVGGWAGIGLAAGLTDPTAGLGAVGGYAALSNYTAL--GGTLTVFSLPAN----------------- | |
HTCC2181_putMdhLarge -MEINMQLNKIKLALGFAAAATMAM---PMVASAAADQEKAMSNANNWAHPRGQHDNQAYSKLTQLNKGNVKNLKAAWTFATGVNRGHEGSPLVIGNMMYVHTAFPNNVYALDLNNDQKIVWSYF-----PKQDPSVQAVLCCDNVNRGLGF--GDGK----IFLQQNDGMLVALDAKTGAKVWDASNVDPKVGATNTNAPHVIKDKVLTGCSGAEFGVRCFMAAYNIADGSLAWKAMSTGPDSEVLIGADFNK----ENPLYSALSVYEDVNGGNKQGGSFKKIPTDQLQGGVADLGVKTWLKPQAVKDGWQHGGGSVWGWWPYDAKTNLVYYGTGNPSVWNPDVRPGDNKWSMTVFARDLDTGMARWGMQMTPHDEWDYDGINEVILFDK------GGK--TYAWHHDRNGFAYTWTAAEGTLVAAEKVHPFVNWATDVDLKS-----GVPNKLAEHSTHQ---DYNTKGTCPAALGTKDQQPAAYSPKTGLIYSPLNHVCMTYEPVESKYVAGQPWVGATLTMFAG----PDGV------MGGFAAYDPMTNKKVWYNKEKFSAWGGAMTTASDLVFYGTLDRWFKALDAKS-GKELWKFQVGSGVIGNAFTYAHKGKQHVGVLSGIGGWAGVAMNLGLTNDTDALGAAGGYKELTKYNAAPGGGGLTVFSL-------------------- | |
M_flagellatus_KT_Mfla_2314 -----MEMSKIKLALGVAFGMAMAV---PMVASAAADQEAAMKDPNNWVHPRGQHNNQGYSTLNQINKSNVKNVKAAWQFATGVNRGHEGSPVVIGNMMYVHTAFPNNVYALDLNDNQKIVWAYF-----PKQDPSVQAVLCCDNVSRGLGY--GDGK----IFLQQNDGVLVALDAKTGAKVWDVKVNDPKVGATNTNAPHVIKDKVLTGCSGAEFGVRCFLVAYNIKDGSQVWKAYSTGPDSEVLIGKDFNK----DNPHYSALSVYEDINGGNKEGGSFKALPKEKLKFPETDLGVKTWLKPQAVKDGWQHGGGSTWGWWPYDHRTNLVYYGTGNPSVWNPDVRPGDNKWSMTIFARDVDTGIAKWGYQMTPHDEWDYDGINEVILFDK------GGK--TYAWHHDRNGFAYTFDAHTGSIIAAEKVHPFVNWATHVDKAT-----GIPHKDGKYSTHQ---DYNTKGICPAALGTKDQQPAAYSPRTGLVYSPLNHVCMTYEPVESKYIAGQPWVGATLTMFAG----PDGV------MGGFAAYDPMTNKAKWYNKEKFSAWGGALATASDLVFYGTLDRWFKALDAQT-GKELWKFQVGSGVIGNAFTYSHKGKQYVGVLSGIGGWAGVAMNLGLENETDALGAAGGYKELTKYNAAPGGGALNVFSL-------------------- | |
M_flagellatus_KT_Mfla_0344 -----MKAKHLKIALGASLIGLASI-----PGFAATDLEKLMQDDNQWATQRKDYANTGYSKLSQINQSNIKNLKAAWTFATGVNRGHEGAPLVVDDTLYFTTAFPNNVYALDLNNEEKIKWSYF-----PKQDPSVQALLCCDNVTRGLAY--GDGK----IFLQQNDGQLVALDAKTGAKVWEVGVVDVKQGATTTNAPHVFKDKVITGCSGGEYGVRCYLTAYNIKDGSIAWRAYATGPDSDVLIGKDFNK----DNPHYSALSVYEDINGGNQEGGSFKALPKEKLKFPETDLGVKTWLKPQAAKNGWEHGGGGTWGWYSYDPALNLVYYATGNPSVWNPDVRPGDNKWSMTIFARDLDTGLAKWGYQLTPHDEWDYDAVNETILWDA------DGK--KLATHFDRNGFGYTLDRQTGKLLVAEKMHPFVNWATGIDLTT-----GTPIKDPKYATHQ---DVQISGICPAALGVKDQQPAAYSPKTKLFYVPLNHVCMDYEPVEVKYVAGQPWVGATLSMYPG----PDGV------MGGFMAWDGLKGKQVWYKKEKFSVWSGALATATDIVFYGTLDRWFKAVDAKS-GKELWKFRVGSGVVGNPITYTHKGKQYVALLSGIGGWAGVAMNLGLTNDNEGLGAAGGYKELKEWNAAPGGGALNVFSL-------------------- | |
M_flagellatus_KT_Mfla_1717 -----MRISPFLIVVSLNLVSAVSS--------AADPLLPLLQDDKQWVSPRKDYYNQGYSRLSQINHRNVANLKLAWSFSTGVQRGHEGAPLVVGNVMYVHTAFPNNVYALDLDHDQKILWAYF-----PKQTSDIEAILCCDSVSRGLAY--GDGK----ILLQQNNGILLALDARDGRKVWEVQVNDPRMGATNTNAPYVFRDKVLTGCSGGEFGVRCFLAAYALQDGRLQWKAYSTGPDSEVLIGPGFNA----ENPHYSALSTYEDVNGGNREGGSFRALPKERLKFPETELGVKTWLKPQAAANGWQHGGGPVWGWFSYDPKLNLVYYGTGNPSVWNPDIRPGDNKWAMTLFARDLDTGMARWGYQLTPHDEWDYDSVNELILWDQ-------GS-RKLATHFDRNGFAYTLDRQNGKLLAAEKMHPFVNWATGIDLQT-----GIPLKDGRYATHE---DQETSAICPSAIGVKNIQPAAYSPQNGWFYVPMNHLCMTYEAVEAKYVAGQPWAGASLSMYPG----PDGD------MGAFMAWDALKHKPVWYTREKYPVWSGALATAGQLAFYGTLDRWFKALDARN-GKELWRFQVGSGIVGNPMTYQHAGKQYVAVFSGIGGWAGVALNQGLTGDSDGAGTAGAFRGLSQENAAPGSGALNVFAL-------------------- | |
M_flagellatus_KT_MxaF ---MKGRVSHVGISAAVSSLLVLAT---VQGAQANQEVLNLQKNADNWALQTGNYTGQHNSTLSQINKSNVKNLKAAWSFSTGVLHGHEGAPLVIGDMMYVHSAFPNNTFAINLNDPGVIAWQHK-----PKQIASVKAVACCDIVNRGLAY--GDGK----IVKTQLDGRLVALDAKSGKIVWEIEVCDPKVGATLTQAPFVAKNSVLVGCSGAELGVRGAVNSFNLKTGELQWRAFATGPDEEVRLAKNFNS----DNPHY-----------------------------GQFGLGLKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWARDLDTGEAKWGYQKTPHDEWDFAGVNQMILSDH----KVDGKVTPLLTHIDRNGIMYTLNRDNGNLIQAAKVDPAVNVFKKVDLKT-----GTPVRDPEFSTRM---DHKSTNVCPSAMGFHNQGLDALDLDEPIVYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PSGP--TKKEMGQVRAMDIVTGKYKWTKWEKFAVWGGTLATKGGLVVYNTLDGYIKALDKDN-GKDLWKFKMPSGGIGAPMTYQFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFKELQNHTQM--GGGLMVFSL-------------------- | |
P_denitrificans_xoxF -------MKNLMNGACLALLMSGTA------ALANEQRAGRDRQAPQWAIQMGDYANTRYSTLDQINKDNVKDLRVAWTFSTGVLRGHEGSPLVIGDVMYVHTPFPNRVFALDLNDNGKILWRYE-----PQQDPNVIAVMSCDTVYRGLSY--ADGM----ILLGQADTTVVALDATSGEVKWSTKIGDPGIGETLTATVVPVKDKVLVGISGGEYGVRGRMTALNLTDGSEAWKAWSTGPDEELLVDPETTT----HLGKP-----------------------------IGADSSLNSW-----EGDQWQIGGGTIWGWFSYDPDLNLVYYGTGNPSTWNPSQRPGDNKWSMTIMARDADTGMAKWFYQMTPHDEWDYDGVNEMILTNQ----TVDGQERKLLTHFDRNGLAYTMDRETGELLVAEKYDPVVNWTTGVDMDPNSETYGRPAVVAEYSTAQNGEDENTTGVCPAALGTKDQQPAAFSPKTNLFYVPTNHVCMDYEPFRVAYTAGQPYVGATLSMYPA----PNSH----GGMGNFIAWHNTTGEIKWSVPEQFSVWSGALATAGDVVFYGTLEGYLKPVDAQT-GEELYKFKTPSGIIGNVMTYEHGGKQYVGILSGVGGWAGIGLAAGLTNPNDGLGAVGGYASLSQYTEL--GGQLTVFELPG------------------ | |
B_fungorum_GluDH -----MNLRTTVLGLAILASAALSS----FVAQADSQLDGLMKNPSNWAAQAGDYANHRYSPLKQINENNVGKLQVAWTMSTGVLRGHEGAPLVIGDTMYIHSPFPNKVIAINLKDQ-TFIWQYL-----PKQDDQVVSVMCCDTVNRGLAY--GDGK----IFLQQADTKLVALNAKTGDVVWTAQNGNPKAGETNTNAPHVFGDKVLTGISGGEFGVRGRLIAYDIKTGKPAWTAYSTGPDKDMLIDPDKTTTYADGKMVP-----------------------------VGADSSLKSW-----KGDQWKLGGGTTWGWYAWDPKLNLVYYGTGNPGTWNPTQRPGDNKWSMSIFARDLNTGQARWVYQMTPHDEWDYDGVNEMILSDL----SIDGKKVPAIVHFDRNGFGYTLNRETGQLLVAQKFDPAVNWADHVDMKS-----GKPIRNAAYSTQAAGSDHNVKGICPAALGSKDQQPAAYDPGSSLFLVPTNHVCMDYEPFDVDYVSGQPYVGATLSMYPG----PNDN----NSMGNFIAWDASKGKIVWSKPERFSVWSGVLATAGGVAFYGTLEGYIKAVRIKD-GKELWRFKTPSGIIGNVFTYEYQGKQFIGVYSGIGGWAGIGMAAGLEKSTEGLGAVGGYRELAKYTAL--GGTLFVFAIPGGNS--------------- | |
H_methylovorum_mxaF MGMKKVVSTPMLMSASCMAIAVALQVGVASSAYANDKLIELSKSNENWVMPGKNYDSNNYSESTQVNAENVKQLKHAWSFSTGELHGHEGAPLVIGDMMYVHSSFPNKTFALNLNDPGHILWQHS-----PKQDPAARSVACCDLVNRGLAYWPGDDKTPALVIKTQLDGHLVALNAKTGEEFWKVENGDIKVGQTLTQAPYVVHDLAIVGSSGAELGVRGHVTAYNVKTGEQAWRYYATGPDEEIGLADDFNS----ANPHY-----------------------------GQKGLGTATW-----EGDAWKIGGGTNWGWYAYDPQANLIYYGSGNPAPWNETMRPGDNKWTMTITARDADTGKMKFGYQKTPHDEWDFAGVNVIMLSEQT---DKEGKKRKLLTHPDRNGIVYTLDRENGDLISADKLDDTVNVFKHVDLKS-----GLPVRDPEFGTRM---DHKGTEICPSAMGYHNQGHDSYDPTKQLFFMGINHICMDWEPFMLPYRAGQFFVGATLWMYPG----PKGDRQNYLGLGQIKAYNAITNTYKWEHMERFSVWGGTLATAGNLVFYGTLDGFLKARNSDT-GELLWKHKLPSGVIGYPMTYEHKGVQYIAVMSGVGGWPGVGLVFDLQDPTAGLGAVGAFKNLQRYTQM--GGSLEVFSLDGKNPYDDVNVANWTKGCI- | |
R_sphaeroides_GluDH -------MKMLKRGLAATLLLSSAP------VYANDSVMQAIGDSTQWAIQTGDYANTRYSELDQINRENVGKLQVAWTFSTGVLRGHEGSPLVIDGIMYVHTPFPNNVYALDLNNEGRILWRYE-----PQQNPDVVGVMCCDTVNRGVAY--ADGM----IFLHQADTTIVALDAKSGEVKWSVVNGDPTKGETNTATVLPVKDKILVGISGGEFGVRGHLTAYDMQTGEQVWRAFSTGPDEEMLVDPEQTT----HLGKP-----------------------------IGPNSSLESW-----EGDQWQIGGGTTWGWYSYDPDLNLVYYGTGNPSTWNPSQRPGDNKWSMTIMARDVDTGMAKWFYQKTPHDEWDYDGVNENILVDQ----EIDGQMRKLLVNLDRNGFGYTLDRETGELLVAEKYDPAVNWATEVVMDPESDQYGRPQVVAEYSTAQNGEDTNTTGVCPAALGSKDQQPAAFSPKTGLFYVPTNHVCMDYEPYRVSYTAGQPYVGATLSMYPA----PDSH----GGMGNFIAWDATKGEIKWSLPEQFSVWSGALATAGDVVFYGTLEGHLKAIDAET-GELLYRFKTPSGVIGNVMTYELNGKQYIGILSGVGGWAGIGLAAGLTNPNEGLGAVGGYAALSDYTEL--GGQLTVFAVPD------------------ | |
Methylobacillus_sp_SK5_MdhLarge ---MKGRVTHVGISAAVSSLLVLAT---MQGAQANQDLQNLTKNADNWALQTGNYTGQHNSTLSQINKGNVKNLKAAWSFSTGVLHGHEGAPLVIGDMMYIHSAFPNNTFAVNLNDPGVIAWQHK-----PKQIASVKAVACCDIVNRGLAY--GDGK----IVKTQLDGKLVALDAKSGKVVWEIEVCDPKVGATLTQAPFIVKNTVLVGCSGAELGVRGAVNSFNLKTGELQWRAFATGPDEEVRLAKNFNS----DNPHY-----------------------------GQFGLGLKTW-----EGDAWKIGGGTNWGWYAYDPKLNLFYYGSGNPAPWNETMRPGDNKWTMTIWARDLDTGEAKWGYQKTPHDEWDFAGVNQMILSDH----KVDGKVTPLLTHIDRNGIMYTLNRDNGNLVQAAKVDPAVNVFKKVDLKT-----GTPVRDPEFSTRM---DHKSTNVCPSAMGFHNQGLDALDLDEPIVYAGLNHICMDWEPFMLPYRAGQFFVGATLAMYPG----PSGP--TKKEMGQVRAMDIVTGKYKWTKWEKFAVWGGTLATKGGLVAYNTLDGYIKALDKDN-GKELWKFKMPSGGIGAPMTYQFKGKQYIGSMYGVGGWPGVGLVFDLTDPSAGLGAVGAFKELQNHTQM--GGGLMVFSL-------------------- | |
M_petroleiphilum_PM1_Mpe_A3393 MKVSKHSGWRLMRPLGLALLAIPAV------VQANADVEKNIANSKNWAMQAGDMFNQRYSKLDQINKGNVGKMQVAWTFSTGVLRGHEGSPLVIDGTMYLHSPFPNKVFAIDLNTQ-KILWKYE-----PKQDPAVIPQMCCDTVNRGLAY--AEGK----VILQQADSNLVALDAKSGKVVWSVKNGDPKLGAVNTNAPHVFKDKVITGISGGEWGVRGFIAAYNLKDGKPAWKGYSVGPDAEMLIDPAKTTTWIDGKVAP-----------------------------VGADSSLKTW-----KGDQWKIGGGTTWGWYSYDKALNAMYYGTGNPSTWNPSQRPGDNKWSMSIWSRDVDTGKVNWVYQMTPFDEWDFDGINEMILADI----NVKGKPTKALVHFDRNGFAYTMDRTNGALLVAEKYDPKVNWATHVDMKT-----GRPQVVKQYSTAQNGPDVNTKGICPAALGSKDQQPASFDPNTKLFYVPTNHVCMDYEPFKVEYTAGQPYVGATLSMFPA----PGSH----GGMGNYITWDAGTGKIVQSKAEKFSVWSGSLNTAGGLSCYGTLEGYFKCVDAKDISKELFKFKTPSGIIGNVFTYEHKGKQYMGVFSGIGGWAGIGMAAGLEKDQDGLGAVGGYKELNQYTEL--GGSLTVFALPN------------------ | |
); | |
# The list of peptides that are to be highlighted. | |
our @peptides = unique(qw( | |
TNLVYYGTGNPSVWNPDVRPGDNK AMSNSANWAHPR VITGCSGAEFGVR LTQVNKGNVK EAAHSTHQDYNTK | |
GLGFGNGKIFLQQNDGNLVALNAK QDPSVQAVLCCDNVNR NGFAYTWVASNGTLVSAEK KIPTDQLQGGVADLGVK | |
VGATNTNAPHVIKDK VLTGCSGAEFGVR AAWTISTGVNR GQHDNQAYSK RIPNDQLQAPVADLGIK | |
WGMQMTPHDEWDYDGINEVILFDKGGK IFLQQNDGNLVALNAK VGATNTNAPHVINDK LTQLNKGNVK | |
DQQPAAYSPK VVWSYFPKQDPSVQAVLCCDNVNR DGWQHGGGSTWGWWPYDAK TGLIYSPLNHVCMTYEPVESK | |
AAWTLSTGVNR WGMQMTPHDEWDYDGINEVILFDK RIPNDQLQGGVADLGLK DGWQHGGGSVWGWWPYDAK | |
VGATNTNAPHVIK LTQINKGNVK VWSTLNTDPK ELTKYNAAPGGGALTVFAL TWLKPQAVK VVWSYFPK | |
)); | |
# The phylogenetic tree as a Newick-style parenthesized string with branch lengths. Names must be identical to those in the alignment.
# Make sure the string ends in "):0.0;", as in the example below.
our $tree = "((((M_capsulatus_Bath_MdhLarge:0.25935233153341924472,((H_methylovorum_mxaF:0.20878168596712515237,(M_nodulans_MxaF:0.12190741832194466887,(M_extorquens_moxF:0.02274678088090198447,M_organophilum_MdhLarge:0.02307171873930398062):0.06011530645090353009):0.08366283963156732895):0.06995643960127288785,P_denitrificans_MdhLarge:0.20442045141664449970):0.12242880488028372310):0.14514292177919754789,((Methylobacillus_sp_SK5_MdhLarge:0.01600304631180017589,M_flagellatus_KT_MxaF:0.01526122172147443765):0.10767993129317506018,M_methylotrophus_MdhLarge:0.14280191029943056780):0.19923880678630695451):0.42551609073153434659,((M_flagellatus_KT_Mfla_1717:0.35231344928419872087,M_flagellatus_KT_Mfla_0344:0.14695574411891779154):0.06730097283858139934,(M_flagellatus_KT_Mfla_2314:0.06839777440088301574,HTCC2181_putMdhLarge:0.14290213930651829433):0.15771954238829591355):0.22366507158804760369):0.19229690543073024722,((M_petroleiphilum_PM1_Mpe_A3393:0.27375376371657816899,B_fungorum_GluDH:0.32568960184059214669):0.07065861624495217708,((M_extorquens_putMdhLarge:0.20052498854951039675,B_japonicum_USDA_110_MdhLarge:0.20384942278865467169):0.06929269248036029616,((R_sphaeroides_GluDH:0.13212536758338755405,P_denitrificans_xoxF:0.18460378179484654426):0.08680998014121153739,(S_meliloti_1021_MdhLarge:0.00000087024666238173,S_meliloti_MdhLarge:0.01447501190371387053):0.11910797865841461773):0.04195425485701579499):0.13380076964851062016):0.02532900152403316091,M_capsulatus_Bath_putMdhLarge:0.44978920877462219563):0.0; "; | |
# Parameters concerning the size of the .eps formatted output. | |
our ($height, $width, $buffer, $textsize, $tree_zoom) = (225, 600, 20, 6, 1); | |
# Output file. Must be EPS. To convert to other formats, use ImageMagick (`convert out.eps out.png`)
# or Ghostscript (`gs -sDEVICE=pdfwrite -sOutputFile="out.pdf" -dNOPAUSE -dEPSCrop -f out.eps -c quit`)
our $outfile = "out.eps"; | |
# That's all | |
#------------------------------------------------------------------------# | |
# Don't change anything below this line unless you know what you are doing | |
# Do some quick sanity checks on the alignment and the peptide list.
# Every failure aborts with a message that names the offending entry.
die ("No input alignment.\n") unless (scalar(keys %proteins));
our $alignment_len = length((values %proteins)[0]);
die ("Error: Zero-length alignment.\n") unless ($alignment_len);

# Walk the names in sorted order (rather than with each()) so that the first
# error reported is the same on every run.
foreach my $id (sort keys %proteins) {
    die ("Sequences are not aligned.\n") if (length($proteins{$id}) != $alignment_len);
    # '(', ')', ':' and ',' are structural characters in the tree string, and
    # whitespace is documented as forbidden in names.
    die ("Name '$id' contains invalid characters.\n") if ($id =~ m/[\(\)\:\,\s]/);
    # Purely numeric names are reserved for internal node indices.
    die ("Sequence names cannot be all numbers ($id).\n") if ($id =~ m/^\d+$/);
    # Normalise to upper case; the rest of the script compares residues as-is.
    my $seq = $proteins{$id} = uc($proteins{$id});
    # Report the sequence's name, not the (very long) sequence itself.
    die ("Invalid sequence for '$id'.\n") unless ($seq =~ m/^[\-ARNDCEQGHILKMFPSTWYV]+$/);
}

if (scalar(@peptides)) {
    # $pep aliases the array element, so the upper-casing is stored in place.
    foreach my $pep (@peptides) {
        $pep = uc($pep);
        die("Invalid peptide '$pep'.\n") unless ($pep =~ m/^[\-ARNDCEQGHILKMFPSTWYV]+$/);
    }
} else {
    print STDERR "Warning: No peptides provided in input. Proceeding...\n";
}
# Build the per-column residue frequency table: $consensus[$col]{$residue}
# is the fraction of aligned sequences that carry $residue (gap characters
# included) at alignment column $col.
our @consensus = ();
for (my $col = 0; $col < $alignment_len; $col++) {
    # Tally the characters seen in this column.
    my %count_of = ();
    foreach my $aligned_seq (values %proteins) {
        $count_of{substr($aligned_seq, $col, 1)}++;
    }
    # Convert the tallies into fractions of the column total.
    my $column_total = sum(values %count_of);
    my %fraction_of = map { $_ => $count_of{$_} / $column_total } keys %count_of;
    push @consensus, \%fraction_of;
}
# Digest the tree data into a form we can recurse over.
# Repeatedly find an innermost "(child:len,child:len,...)" group, store its
# children in @nodes, and replace the group inside $tree with the index of
# that new @nodes entry.  A purely numeric child name therefore refers to an
# earlier @nodes entry; any other name is a leaf and must be a key of
# %proteins.  $members[$i] is the number of leaves found below $nodes[$i].
our @nodes = ();
our @members = ();
while ($tree =~ m/(\([^\(\)]+\))/) {
    my $group = $1;
    # Splice at the match offset instead of searching for the text again.
    substr($tree, $-[1], length($group), scalar(@nodes));
    my @children = ();
    my $member_count = 0;
    foreach my $entry (split(',', substr($group, 1, -1))) {
        my ($name, $len) = split(':', $entry);
        # Tolerate whitespace around names and lengths (e.g. a tree string
        # that was wrapped over several lines).
        $name = '' unless (defined($name));
        $name =~ s/^\s+|\s+$//g;
        $len = '' unless (defined($len));
        $len =~ s/^\s+|\s+$//g;
        # A child without a branch length is drawn with length zero.
        $len = 0 if ($len eq '');
        push @children, { 'name' => $name, 'len' => $len };
        if ($name =~ /^\d+$/) {
            # Reference to an already-digested group.
            if (!defined($members[$name])) {
                die ("Numeric name '$name' in tree does not refer to a known node.\n");
            }
            $member_count += $members[$name];
        } else {
            $member_count++;
            if (!exists($proteins{$name})) {
                die ("Name in tree '$name' not in the alignment.\n");
            }
        }
    }
    push @nodes, [@children];
    push @members, $member_count;
}
# Set up variables to describe our drawing area.
# Without at least one digested tree group there is nothing to lay out, and
# the leaf count below would be undefined (division by zero).
die ("Error: No nodes could be parsed from the tree.\n") unless (scalar(@members));
my $scalebar_height = 10 + ($textsize * 2);
# The last group digested is the outermost one, so its leaf count is the
# number of rows to draw.
my $n_leaves = $members[-1];
my $usable_height = $height - (2 * $buffer) - $scalebar_height;
our $rect_height = $usable_height / $n_leaves;
our $rect_halfheight = $rect_height / 2;
# The alignment panel spans from 45% of the page width to the right margin.
our ($rect_x1, $rect_x2) = ($width * .45, $width - $buffer);
our $rect_width = ($rect_x2 - $rect_x1) + 1;
# One y coordinate per leaf; print_tree() consumes these in drawing order.
our @names_ypos = map($buffer + $usable_height * $_ / $n_leaves, 0..($n_leaves-1));
our $text_x_adjust = ($textsize * .3);
our $text_y_adjust = -1 * ($textsize * .3);
our $linesize = $textsize / 10;
# Direct method call instead of indirect object syntax ("new Class ...").
our $p = PostScript::Simple->new(colour => 1, xsize => $width, ysize => $height, units => "pt");
$p->setfont("Arial", $textsize);
$p->setlinewidth($linesize);
# Draw the scale bar for the alignment panel
my $scale_ypos = $height - $buffer - $scalebar_height + 2;
# Aim for one tick about every 50 points of panel width, expressed in
# alignment columns ...
my $interval = int(($alignment_len / $rect_width) * 50);
# ... but never less than one column: a zero interval (very short alignment)
# would otherwise cause a division by zero below.
$interval = 1 if ($interval < 1);
# Round the spacing down to one significant digit (e.g. 109 -> 100) so the
# tick labels are round numbers.
my $base = 10 ** (length($interval) - 1);
$interval = int($interval / $base) * $base;
# Extend the bar to the first multiple of $interval at or past the end of
# the alignment.
my $extended_len = $interval * int($alignment_len / $interval);
$extended_len += $interval if ($extended_len < $alignment_len);
my $extended_x2 = $rect_x1 + $rect_width * ($extended_len / $alignment_len);
$p->line($rect_x1 - .5 * $linesize,$scale_ypos, $extended_x2 + .5 * $linesize,$scale_ypos);
# Tick marks and their numeric labels
for (my $i = 0; $i <= $extended_len; $i += $interval) {
    my $x = $rect_x1 + $rect_width * ($i / $alignment_len);
    $p->line($x,$scale_ypos, $x,$scale_ypos+4);
    $p->text( {align => 'center'}, $x,$scale_ypos+4+$textsize*.2, $i);
}
# Axis title, centred on the bar
$p->text( {align => 'center'}, ($rect_x1+$extended_x2)/2,$scale_ypos+8+$textsize+$textsize*.2, "Position in amino acid alignment");
# Call the drawing command and save the image.
# Once every parenthesized group has been digested, $tree should have
# collapsed to "<root node index>:<branch length>;".
if ($tree =~ m/(\d+)\:([0-9\.]+)/) {
    print_tree({'name' => $1, 'len' => $2}, 0);
    $p->output($outfile);
} else {
    # Fail loudly instead of exiting without having written any output.
    die ("Error: Could not find the root of the tree. Make sure the tree string ends in \"):0.0;\".\n");
}
# Recursively draw the subtree rooted at $node and return the y coordinate
# at which the parent should attach its connecting line.
#
#   $node - hash ref with 'name' (an index into @nodes for an internal node,
#           or a key of %proteins for a leaf) and 'len' (branch length).
#   $dist - accumulated, zoom-scaled branch length from the root to the
#           start of this node's branch.
#
# Internal nodes draw their children first, then a vertical connector and
# their own horizontal branch.  Leaves consume the next entry of
# @names_ypos, draw the branch and label, and paint the alignment row:
# grey box, white gaps, coloured peptide hits, black border (in that order).
# All drawing goes to the file-level PostScript object $p.
sub print_tree {
    my ($node, $dist) = @_;

    # One unit of tree distance spans a quarter of the page width.
    my $x1 = $dist * ($width * .25) + $buffer - $linesize * .5;
    $dist += $node->{'len'} * $tree_zoom;
    my $x2 = $dist * ($width * .25) + $buffer;

    if ($node->{'name'} =~ m/^\d+$/) {
        # Internal node: recurse, then join the children.
        my @subnodes = @{$nodes[$node->{'name'}]};
        my @ypos_arr = map(print_tree($_, $dist), @subnodes);
        my $y_avg = avg(@ypos_arr);
        my $y_min = min(@ypos_arr);
        my $y_max = max(@ypos_arr);
        $p->setcolour(0,0,0);
        $p->line($x2,$y_min, $x2,$y_max);
        $p->line($x1,$y_avg, $x2,$y_avg);
        return $y_avg;
    } else {
        # Leaf: takes the next free row.
        my $y_pos = shift @names_ypos;
        my $y_min = $y_pos - $rect_halfheight + $linesize * .5;
        my $y_max = $y_pos + $rect_halfheight - $linesize * .5;

        # Keep very short branches visible.
        $x2 = $x1 + $linesize if ($x2 - $x1 < $linesize);
        $p->setcolour(0,0,0);
        $p->line($x1,$y_pos, $x2,$y_pos);

        # Label: underscores become spaces, and a leading single capital
        # letter is turned into an abbreviation ("M extorquens" -> "M. extorquens").
        my $name = $node->{'name'};
        $name =~ tr/\_/\ /;
        $name =~ s/^([A-Z])\ /$1\.\ /;
        $p->text($x2+$text_x_adjust,$y_pos+$text_y_adjust, $name);

        my $protein_seq = $proteins{$node->{'name'}};

        # Grey box representing the gene, trimmed by leading/trailing gaps
        $p->setcolour(200,200,200);
        my ($lead_gap, undef, $trail_gap) = $protein_seq =~ m/^(\-*)(.*?)(\-*)$/;
        my $protein_start = $rect_x1 + $rect_width * (length($lead_gap) / $alignment_len);
        my $protein_end = $rect_x2 - $rect_width * (length($trail_gap) / $alignment_len);
        $p->box( {filled => 1}, $protein_start,$y_min, $protein_end,$y_max);

        # White out gapped regions.  @- / @+ give the match offsets directly,
        # avoiding the prematch variable and its program-wide regex penalty.
        $p->setcolour("white");
        while ($protein_seq =~ m/(\-+)/g) {
            my ($gap_start, $gap_end) = ($-[0], $+[0]);
            my $aa_x1 = $rect_x1 + $rect_width * ($gap_start / $alignment_len);
            my $aa_x2 = $rect_x1 + $rect_width * ($gap_end / $alignment_len);
            $p->box( {filled => 1}, $aa_x1,$y_min, $aa_x2,$y_max);
        }

        # Mapping of positions between gapped and ungapped:
        # $u2g[$i] is the alignment column of the $i-th residue.
        my $protein_useq = $protein_seq;
        $protein_useq =~ tr/\-//d;
        my @u2g = grep { substr($protein_seq, $_, 1) ne '-' } 0..($alignment_len-1);

        # Heat map amino acids according to conservation if peptide was found
        foreach my $pep (@peptides) {
            my $pep_start = -1;
            # Visit every occurrence, including overlapping ones.
            while (($pep_start = index($protein_useq, $pep, $pep_start + 1)) > -1) {
                my $pep_end = $pep_start + length($pep) - 1;
                foreach my $upos ($pep_start..$pep_end) {
                    # @u2g only lists non-gap columns, so $aa is a residue.
                    my $aa_pos = $u2g[$upos];
                    my $aa = substr($protein_seq, $aa_pos, 1);
                    my $aa_x1 = $rect_x1 + $rect_width * ($aa_pos / $alignment_len);
                    my $aa_x2 = $rect_x1 + $rect_width * (($aa_pos+1) / $alignment_len);
                    # Fraction of sequences sharing this residue in this
                    # column: 0 -> green, 1 -> red.
                    my $cons = $consensus[$aa_pos]->{$aa};
                    $p->setcolour(255*$cons, 255-255*$cons, 0);
                    $p->box( {filled => 1}, $aa_x1,$y_min, $aa_x2,$y_max);
                }
            }
        }

        # Black border around protein in alignment
        $p->setcolour(0,0,0);
        $p->box( {filled => 0}, $protein_start - .5 * $linesize,$y_min, $protein_end + .5 * $linesize,$y_max);
        return $y_pos;
    }
}
# Return the arithmetic mean of the numbers passed in.
# An empty argument list yields undef (empty list in list context) instead
# of dying with "Illegal division by zero".
sub avg {
    return unless (scalar(@_));
    my $sum = 0;
    $sum += $_ foreach (@_);
    return $sum / scalar(@_);
}
# Return the numerically smallest of the arguments (undef if there are none).
sub min {
    my ($smallest, @rest) = @_;
    for my $candidate (@rest) {
        next unless ($candidate < $smallest);
        $smallest = $candidate;
    }
    return $smallest;
}
# Return the numerically largest of the arguments (undef if there are none).
sub max {
    my ($largest, @rest) = @_;
    for my $candidate (@rest) {
        next unless ($candidate > $largest);
        $largest = $candidate;
    }
    return $largest;
}
# Return the total of the numbers passed in; zero for an empty list.
sub sum {
    my @addends = @_;
    my $total = 0;
    while (@addends) {
        $total += shift(@addends);
    }
    return $total;
}
# Return the distinct values of the argument list, keeping the first
# occurrence of each and preserving input order.  (Taking keys() of a hash
# would give the same set in an order that varies from run to run, which
# makes the generated output non-reproducible.)
sub unique {
    my %seen = ();
    return grep { !$seen{$_}++ } @_;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment