Skip to content

Instantly share code, notes, and snippets.

@ArtPoon
Last active December 14, 2015 06:19
Show Gist options
  • Save ArtPoon/1838da20c383e540076c to your computer and use it in GitHub Desktop.
Save ArtPoon/1838da20c383e540076c to your computer and use it in GitHub Desktop.
A HyPhy batch language (C analogue) script for mapping amino acid substitutions from a likelihood function to a tree
acids = {{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}};
function importLikelihoodFunction (dummy)
{
SetDialogPrompt ("Choose a file containing an exported likelihood function:");
fscanf(PROMPT_FOR_FILE, "Raw", importLF);
ExecuteCommands(importLF);
howManyLFs = Rows("LikelihoodFunction");
if (howManyLFs>1) {
ChoiceList (likelihoodFnChoice,"Choose a Likelihood Function",1,NO_SKIP,LikelihoodFunction);
if (likelihoodFnChoice<0) {
return 0; // cancelled
}
}
else
{
if (howManyLFs == 1) {
// only one available - use it
likelihoodFnChoice = 0;
}
else {
// no LFs available; die
fprintf (stdout, "ERROR: no likelihood functions available for ancestral state reconstruction!\n");
return 0;
}
}
GetString (lfid,LikelihoodFunction,likelihoodFnChoice);
// extract identifiers from likelihood function
ExecuteCommands("GetString (recep, "+lfid+", -1)");
dataid = (recep["Datafilters"])[0][0]; //identifier for data filter
fprintf(stdout, "Retrieved data partition ", dataid, "\n");
ExecuteCommands ("DataSetFilter filteredData = CreateFilter ("+dataid+",1);");
treeid = (recep["Trees"])[0][0]; //identifier for tree
fprintf(stdout, "Retrieved tree ", treeid, "\n");
return 1;
}
/* ____________________________________________________________________ */
/* Assembles a key as two column vectors mapping sequence names to */
/* branch IDs. */
function setupMapToTree (sampling_option)
{
// reconstruct ancestors from imported LF
if (sampling_option == 1) {
ExecuteCommands("DataSet ancestralSeqs = SampleAncestors ("+lfid+");");
} else {
ExecuteCommands("DataSet ancestralSeqs = ReconstructAncestors ("+lfid+");");
}
// generate data set filter of amino acid sequences from reconstructed ancestors
DataSetFilter filteredDataA = CreateFilter(ancestralSeqs, 1);
// get AA frequencies from data
ExecuteCommands ("HarvestFrequencies (observedCEFV, "+dataid+",1,1,0);");
HarvestFrequencies (observedCEFV, filteredData, 1, 1, 0);
stateCharCount = 20;
ExecuteCommands ("branchNames = BranchName ("+treeid+",-1);");
// -1 means retrieve for all branchces in tree
h = Columns (branchNames); // total number of branches in tree
seqToBranchMap = {h, 2}; // re-using container!
// maps sequence names to branch order in column 1
// and the other way around in column 2
for (k=0; k<filteredData.species; k=k+1) // for every extant sequence (tip)
{
GetString (seqName, filteredData, k);
seqToBranchMap[k][0] = -1;
for (v=0; v<h; v=v+1)
{
if (branchNames[v] % seqName) { // string comparison
seqToBranchMap[k][0] = v; // map tip name ordering to data filter
seqToBranchMap[v][1] = k; // and vice versa
break;
}
}
}
seqToBranchMap[filteredData.species][0] = h-1;
seqToBranchMap[h-1][1] = filteredData.species;
for (k=1; k<filteredDataA.species; k=k+1) {
// for every internal branch
GetString (seqName, filteredDataA, k);
seqToBranchMap[filteredData.species+k][0] = -1;
for (v=0; v<h; v=v+1) {
if (branchNames[v] % seqName) {
seqToBranchMap[k+filteredData.species][0] = v;
seqToBranchMap[v][1] = k+filteredData.species;
break;
}
}
}
GetDataInfo (dupInfo, filteredData);
GetDataInfo (dupInfoA, filteredDataA);
matrixTrick = {1,stateCharCount};
matrixTrick2 = {1,stateCharCount};
for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
matrixTrick [h] = h;
matrixTrick2 [h] = 1;
}
aminoInfo = {filteredData.species, filteredData.unique_sites}; // tip data
aminoInfo2 = {filteredDataA.species, filteredDataA.unique_sites}; // ancestor data
GetDataInfo (dupInfo, filteredData);
GetDataInfo (dupInfoA, filteredDataA);
matrixTrick = {1,stateCharCount};
matrixTrick2 = {1,stateCharCount};
for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
matrixTrick [h] = h;
matrixTrick2 [h] = 1;
}
for (v=0; v<filteredData.unique_sites;v=v+1) {
for (h=0; h<filteredData.species;h=h+1) {
GetDataInfo (siteInfo, filteredData, h, v);
_SITE_ES_COUNT = matrixTrick2 * siteInfo;
if (_SITE_ES_COUNT[0] == 1) {
siteInfo = matrixTrick * siteInfo;
aminoInfo[h][v] = siteInfo[0];
}
else {
aminoInfo[h][v] = -1;
}
}
}
for (v=0; v<filteredDataA.unique_sites;v=v+1) {
for (h=0; h<filteredDataA.species;h=h+1) {
GetDataInfo (siteInfo, filteredDataA, h, v);
siteInfo = matrixTrick * siteInfo;
aminoInfo2[h][v] = siteInfo[0];
}
}
ExecuteCommands ("flatTreeRep = Abs ("+treeid+");");
GetInformation (seqStrings, filteredData);
return 0;
}
/* ____________________________________________________________________ */
/* Generate a tab-separated table where each row corresponds to a */
/* branch in the tree, and each column corresponds to a codon position */
/* in the aligned sequences. Populate with 0's and 1's, i.e. binary */
/* variables, for presence or absence of a nonsynonymous substitution */
/* at that position in each branch. */
function reconstructAncestors (rep)
{
if (option > 0) {
writeToFile = outfile+rep;
} else {
writeToFile = outfile;
}
fprintf (stdout, "Writing to file ", writeToFile, "\n");
fprintf (writeToFile, CLEAR_FILE, KEEP_OPEN);
k = filteredData.species + 1; // root ancestral sequence
for (h = 1; h < filteredDataA.species; h = h + 1) {
// for every internal branch
aa_count = 0;
p1 = seqToBranchMap[k][0]; // get position of root in tree ordering
pid = flatTreeRep[p1];
p2 = seqToBranchMap[pid][1] - filteredData.species;
bn = branchNames [p1];
for (site = 0; site < filteredDataA.sites; site = site + 1) {
if (output_option == 0 && site > 0) {
fprintf (writeToFile, ",");
}
// for every site in sequence
c1 = dupInfoA[site];
aa1 = aminoInfo2[h] [c1]; // derived codon state on branch
aa2 = aminoInfo2[p2][c1]; // ancestral " "
// handle gaps
if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) {
if (output_option == 0) {
// CSV formatting
fprintf (writeToFile, "0");
}
continue;
}
if (aa1 != aa2) {
if (output_option == 0) {
// CSV matrix format
fprintf (writeToFile, "1");
}
else {
// list format
fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
aa_count = aa_count + 1;
}
} else {
if (output_option == 0) {
fprintf (writeToFile, "0");
}
}
}
// end loop over sites
if (output_option == 0) {
fprintf (writeToFile, "\n"); // end of line
}
k = k + 1; // update parent sequence
}
// now do the leaves
for (h = 0; h < filteredData.species; h = h + 1) {
aa_count = 0;
p1 = seqToBranchMap[h][0];
pid = flatTreeRep[p1];
p2 = seqToBranchMap[pid][1]-filteredData.species;
bn = branchNames [p1];
for (site = 0; site < filteredDataA.sites; site = site + 1) {
if (output_option == 0 && site > 0) {
fprintf (writeToFile, ",");
}
c1 = dupInfoA[site];
c2 = dupInfo [site];
aa2 = aminoInfo2[p2][c1];
aa1 = aminoInfo [h] [c2];
if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) {
if (output_option == 0) {
fprintf (writeToFile, "0"); // gap
}
continue;
}
if (aa1 != aa2) {
if (output_option == 0) {
fprintf (writeToFile, "1");
} else {
fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
aa_count = aa_count + 1;
}
} else {
if (output_option == 0) {
fprintf (writeToFile, "0");
}
}
}
if (output_option == 0) {
fprintf (writeToFile, "\n"); // end line
}
}
// START 20070926SLKP: CLOSE_FILE is needed to flush all to disk
fprintf (writeToFile, CLOSE_FILE);
return 0; // unused
}
/* ==================================================================================== */
/* MAIN LOOP */
/* ==================================================================================== */
stateCharCount = 20; /* there are 64 possible codons, including termination codons */
/*START 20070926SLKP: handle the case of failing to select an LF */
retcode = importLikelihoodFunction(0);
if (retcode == 1) {
ChoiceList (option, "Ancestor reconstruction option: ", 1, SKIP_NONE,
"Maximum likelihood", "Map most likely reconstruction to internal nodes.",
"Sampling", "Map reconstructions at nodes in propoprtion to likelihood.");
ChoiceList (output_option, "Output option: ", 1, SKIP_NONE,
"Binary matrix", "Write comma-separated (CSV) binary matrices to file, where 1 indicates a site-specific substitution event.",
"List", "List amino acid substitutions per branch.");
SetDialogPrompt ("Provde a filename to write output(s) to: ");
fprintf (PROMPT_FOR_FILE, CLEAR_FILE);
outfile = LAST_FILE_PATH;
if (option == 1) {
fprintf (stdout, "Enter number of replicates to sample: ");
fscanf (stdin, "Number", max_reps);
for (this_rep = 0; this_rep < max_reps; this_rep = this_rep+1) {
setupMapToTree(1);
reconstructAncestors(this_rep);
}
} else {
setupMapToTree(0);
reconstructAncestors(0);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment