Last active
December 14, 2015 06:19
-
-
Save ArtPoon/1838da20c383e540076c to your computer and use it in GitHub Desktop.
A HyPhy batch language (C analogue) script for mapping amino acid substitutions from a likelihood function to a tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
acids = {{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}}; | |
function importLikelihoodFunction (dummy) | |
{ | |
SetDialogPrompt ("Choose a file containing an exported likelihood function:"); | |
fscanf(PROMPT_FOR_FILE, "Raw", importLF); | |
ExecuteCommands(importLF); | |
howManyLFs = Rows("LikelihoodFunction"); | |
if (howManyLFs>1) { | |
ChoiceList (likelihoodFnChoice,"Choose a Likelihood Function",1,NO_SKIP,LikelihoodFunction); | |
if (likelihoodFnChoice<0) { | |
return 0; // cancelled | |
} | |
} | |
else | |
{ | |
if (howManyLFs == 1) { | |
// only one available - use it | |
likelihoodFnChoice = 0; | |
} | |
else { | |
// no LFs available; die | |
fprintf (stdout, "ERROR: no likelihood functions available for ancestral state reconstruction!\n"); | |
return 0; | |
} | |
} | |
GetString (lfid,LikelihoodFunction,likelihoodFnChoice); | |
// extract identifiers from likelihood function | |
ExecuteCommands("GetString (recep, "+lfid+", -1)"); | |
dataid = (recep["Datafilters"])[0][0]; //identifier for data filter | |
fprintf(stdout, "Retrieved data partition ", dataid, "\n"); | |
ExecuteCommands ("DataSetFilter filteredData = CreateFilter ("+dataid+",1);"); | |
treeid = (recep["Trees"])[0][0]; //identifier for tree | |
fprintf(stdout, "Retrieved tree ", treeid, "\n"); | |
return 1; | |
} | |
/* ____________________________________________________________________ */ | |
/* Assembles a key as two column vectors mapping sequence names to */ | |
/* branch IDs. */ | |
function setupMapToTree (sampling_option) | |
{ | |
// reconstruct ancestors from imported LF | |
if (sampling_option == 1) { | |
ExecuteCommands("DataSet ancestralSeqs = SampleAncestors ("+lfid+");"); | |
} else { | |
ExecuteCommands("DataSet ancestralSeqs = ReconstructAncestors ("+lfid+");"); | |
} | |
// generate data set filter of amino acid sequences from reconstructed ancestors | |
DataSetFilter filteredDataA = CreateFilter(ancestralSeqs, 1); | |
// get AA frequencies from data | |
ExecuteCommands ("HarvestFrequencies (observedCEFV, "+dataid+",1,1,0);"); | |
HarvestFrequencies (observedCEFV, filteredData, 1, 1, 0); | |
stateCharCount = 20; | |
ExecuteCommands ("branchNames = BranchName ("+treeid+",-1);"); | |
// -1 means retrieve for all branchces in tree | |
h = Columns (branchNames); // total number of branches in tree | |
seqToBranchMap = {h, 2}; // re-using container! | |
// maps sequence names to branch order in column 1 | |
// and the other way around in column 2 | |
for (k=0; k<filteredData.species; k=k+1) // for every extant sequence (tip) | |
{ | |
GetString (seqName, filteredData, k); | |
seqToBranchMap[k][0] = -1; | |
for (v=0; v<h; v=v+1) | |
{ | |
if (branchNames[v] % seqName) { // string comparison | |
seqToBranchMap[k][0] = v; // map tip name ordering to data filter | |
seqToBranchMap[v][1] = k; // and vice versa | |
break; | |
} | |
} | |
} | |
seqToBranchMap[filteredData.species][0] = h-1; | |
seqToBranchMap[h-1][1] = filteredData.species; | |
for (k=1; k<filteredDataA.species; k=k+1) { | |
// for every internal branch | |
GetString (seqName, filteredDataA, k); | |
seqToBranchMap[filteredData.species+k][0] = -1; | |
for (v=0; v<h; v=v+1) { | |
if (branchNames[v] % seqName) { | |
seqToBranchMap[k+filteredData.species][0] = v; | |
seqToBranchMap[v][1] = k+filteredData.species; | |
break; | |
} | |
} | |
} | |
GetDataInfo (dupInfo, filteredData); | |
GetDataInfo (dupInfoA, filteredDataA); | |
matrixTrick = {1,stateCharCount}; | |
matrixTrick2 = {1,stateCharCount}; | |
for (h=Columns(matrixTrick)-1; h>=0; h=h-1) { | |
matrixTrick [h] = h; | |
matrixTrick2 [h] = 1; | |
} | |
aminoInfo = {filteredData.species, filteredData.unique_sites}; // tip data | |
aminoInfo2 = {filteredDataA.species, filteredDataA.unique_sites}; // ancestor data | |
GetDataInfo (dupInfo, filteredData); | |
GetDataInfo (dupInfoA, filteredDataA); | |
matrixTrick = {1,stateCharCount}; | |
matrixTrick2 = {1,stateCharCount}; | |
for (h=Columns(matrixTrick)-1; h>=0; h=h-1) { | |
matrixTrick [h] = h; | |
matrixTrick2 [h] = 1; | |
} | |
for (v=0; v<filteredData.unique_sites;v=v+1) { | |
for (h=0; h<filteredData.species;h=h+1) { | |
GetDataInfo (siteInfo, filteredData, h, v); | |
_SITE_ES_COUNT = matrixTrick2 * siteInfo; | |
if (_SITE_ES_COUNT[0] == 1) { | |
siteInfo = matrixTrick * siteInfo; | |
aminoInfo[h][v] = siteInfo[0]; | |
} | |
else { | |
aminoInfo[h][v] = -1; | |
} | |
} | |
} | |
for (v=0; v<filteredDataA.unique_sites;v=v+1) { | |
for (h=0; h<filteredDataA.species;h=h+1) { | |
GetDataInfo (siteInfo, filteredDataA, h, v); | |
siteInfo = matrixTrick * siteInfo; | |
aminoInfo2[h][v] = siteInfo[0]; | |
} | |
} | |
ExecuteCommands ("flatTreeRep = Abs ("+treeid+");"); | |
GetInformation (seqStrings, filteredData); | |
return 0; | |
} | |
/* ____________________________________________________________________ */ | |
/* Generate a tab-separated table where each row corresponds to a */ | |
/* branch in the tree, and each column corresponds to a codon position */ | |
/* in the aligned sequences. Populate with 0's and 1's, i.e. binary */ | |
/* variables, for presence or absence of a nonsynonymous substitution */ | |
/* at that position in each branch. */ | |
function reconstructAncestors (rep) | |
{ | |
if (option > 0) { | |
writeToFile = outfile+rep; | |
} else { | |
writeToFile = outfile; | |
} | |
fprintf (stdout, "Writing to file ", writeToFile, "\n"); | |
fprintf (writeToFile, CLEAR_FILE, KEEP_OPEN); | |
k = filteredData.species + 1; // root ancestral sequence | |
for (h = 1; h < filteredDataA.species; h = h + 1) { | |
// for every internal branch | |
aa_count = 0; | |
p1 = seqToBranchMap[k][0]; // get position of root in tree ordering | |
pid = flatTreeRep[p1]; | |
p2 = seqToBranchMap[pid][1] - filteredData.species; | |
bn = branchNames [p1]; | |
for (site = 0; site < filteredDataA.sites; site = site + 1) { | |
if (output_option == 0 && site > 0) { | |
fprintf (writeToFile, ","); | |
} | |
// for every site in sequence | |
c1 = dupInfoA[site]; | |
aa1 = aminoInfo2[h] [c1]; // derived codon state on branch | |
aa2 = aminoInfo2[p2][c1]; // ancestral " " | |
// handle gaps | |
if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) { | |
if (output_option == 0) { | |
// CSV formatting | |
fprintf (writeToFile, "0"); | |
} | |
continue; | |
} | |
if (aa1 != aa2) { | |
if (output_option == 0) { | |
// CSV matrix format | |
fprintf (writeToFile, "1"); | |
} | |
else { | |
// list format | |
fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n"); | |
aa_count = aa_count + 1; | |
} | |
} else { | |
if (output_option == 0) { | |
fprintf (writeToFile, "0"); | |
} | |
} | |
} | |
// end loop over sites | |
if (output_option == 0) { | |
fprintf (writeToFile, "\n"); // end of line | |
} | |
k = k + 1; // update parent sequence | |
} | |
// now do the leaves | |
for (h = 0; h < filteredData.species; h = h + 1) { | |
aa_count = 0; | |
p1 = seqToBranchMap[h][0]; | |
pid = flatTreeRep[p1]; | |
p2 = seqToBranchMap[pid][1]-filteredData.species; | |
bn = branchNames [p1]; | |
for (site = 0; site < filteredDataA.sites; site = site + 1) { | |
if (output_option == 0 && site > 0) { | |
fprintf (writeToFile, ","); | |
} | |
c1 = dupInfoA[site]; | |
c2 = dupInfo [site]; | |
aa2 = aminoInfo2[p2][c1]; | |
aa1 = aminoInfo [h] [c2]; | |
if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) { | |
if (output_option == 0) { | |
fprintf (writeToFile, "0"); // gap | |
} | |
continue; | |
} | |
if (aa1 != aa2) { | |
if (output_option == 0) { | |
fprintf (writeToFile, "1"); | |
} else { | |
fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n"); | |
aa_count = aa_count + 1; | |
} | |
} else { | |
if (output_option == 0) { | |
fprintf (writeToFile, "0"); | |
} | |
} | |
} | |
if (output_option == 0) { | |
fprintf (writeToFile, "\n"); // end line | |
} | |
} | |
// START 20070926SLKP: CLOSE_FILE is needed to flush all to disk | |
fprintf (writeToFile, CLOSE_FILE); | |
return 0; // unused | |
} | |
/* ==================================================================================== */ | |
/* MAIN LOOP */ | |
/* ==================================================================================== */ | |
stateCharCount = 20; /* there are 64 possible codons, including termination codons */ | |
/*START 20070926SLKP: handle the case of failing to select an LF */ | |
retcode = importLikelihoodFunction(0); | |
if (retcode == 1) { | |
ChoiceList (option, "Ancestor reconstruction option: ", 1, SKIP_NONE, | |
"Maximum likelihood", "Map most likely reconstruction to internal nodes.", | |
"Sampling", "Map reconstructions at nodes in propoprtion to likelihood."); | |
ChoiceList (output_option, "Output option: ", 1, SKIP_NONE, | |
"Binary matrix", "Write comma-separated (CSV) binary matrices to file, where 1 indicates a site-specific substitution event.", | |
"List", "List amino acid substitutions per branch."); | |
SetDialogPrompt ("Provde a filename to write output(s) to: "); | |
fprintf (PROMPT_FOR_FILE, CLEAR_FILE); | |
outfile = LAST_FILE_PATH; | |
if (option == 1) { | |
fprintf (stdout, "Enter number of replicates to sample: "); | |
fscanf (stdin, "Number", max_reps); | |
for (this_rep = 0; this_rep < max_reps; this_rep = this_rep+1) { | |
setupMapToTree(1); | |
reconstructAncestors(this_rep); | |
} | |
} else { | |
setupMapToTree(0); | |
reconstructAncestors(0); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment