ArtPoon/MapAAMutationsToTree.bf

## MapAAMutationsToTree.bf
acids = {{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}};

function importLikelihoodFunction (dummy)
{
	SetDialogPrompt ("Choose a file containing an exported likelihood function:");
	fscanf(PROMPT_FOR_FILE, "Raw", importLF);

	ExecuteCommands(importLF);

	howManyLFs = Rows("LikelihoodFunction");
	if (howManyLFs>1) {
		ChoiceList  (likelihoodFnChoice,"Choose a Likelihood Function",1,NO_SKIP,LikelihoodFunction);
		if (likelihoodFnChoice<0) {
			return 0;  // cancelled
		}
	}
	else
	{
		if (howManyLFs == 1) {
    		// only one available - use it
			likelihoodFnChoice = 0;
		}
		else {
    		// no LFs available; die
			fprintf (stdout, "ERROR: no likelihood functions available for ancestral state reconstruction!\n");
			return 0;
		}
	}

	GetString (lfid,LikelihoodFunction,likelihoodFnChoice);

	// extract identifiers from likelihood function
	ExecuteCommands("GetString (recep, "+lfid+", -1)");

	dataid = (recep["Datafilters"])[0][0];		//identifier for data filter
	fprintf(stdout, "Retrieved data partition ", dataid, "\n");

	ExecuteCommands ("DataSetFilter filteredData = CreateFilter ("+dataid+",1);");

	treeid = (recep["Trees"])[0][0];				//identifier for tree
	fprintf(stdout, "Retrieved tree ", treeid, "\n");

	return 1;
}


/* ____________________________________________________________________	*/
/*  Assembles a key as two column vectors mapping sequence names to 	*/
/*	branch IDs.															*/
function setupMapToTree (sampling_option)
{
	// reconstruct ancestors from imported LF
	if (sampling_option == 1) {
		ExecuteCommands("DataSet ancestralSeqs = SampleAncestors ("+lfid+");");
	} else {
		ExecuteCommands("DataSet ancestralSeqs  = ReconstructAncestors ("+lfid+");");
	}

	// generate data set filter of amino acid sequences from reconstructed ancestors
	DataSetFilter	filteredDataA  = CreateFilter(ancestralSeqs, 1);

	// get AA frequencies from data
	ExecuteCommands ("HarvestFrequencies (observedCEFV, "+dataid+",1,1,0);");
	HarvestFrequencies (observedCEFV, filteredData, 1, 1, 0);

	stateCharCount = 20;

	ExecuteCommands ("branchNames = BranchName ("+treeid+",-1);");
				// -1 means retrieve for all branchces in tree
	h = Columns (branchNames);	// total number of branches in tree
	seqToBranchMap 	= {h, 2};		// re-using container!

	// maps sequence names to branch order in column 1
	//   and the other way around in column 2
	for (k=0; k<filteredData.species; k=k+1)	// for every extant sequence (tip)
	{
		GetString (seqName, filteredData, k);
		seqToBranchMap[k][0] = -1;
		for (v=0; v<h; v=v+1)
		{
			if (branchNames[v] % seqName) {	// string comparison
				seqToBranchMap[k][0] = v;	 // map tip name ordering to data filter
				seqToBranchMap[v][1] = k;	 // and vice versa
				break;
			}
		}
	}

	seqToBranchMap[filteredData.species][0] = h-1;
	seqToBranchMap[h-1][1] = filteredData.species;

	for (k=1; k<filteredDataA.species; k=k+1) {
    	// for every internal branch
		GetString (seqName, filteredDataA, k);
		seqToBranchMap[filteredData.species+k][0] = -1;
		for (v=0; v<h; v=v+1) {
			if (branchNames[v] % seqName) {
				seqToBranchMap[k+filteredData.species][0] = v;
				seqToBranchMap[v][1] = k+filteredData.species;
				break;
			}
		}
	}

	GetDataInfo    (dupInfo, filteredData);
	GetDataInfo	   (dupInfoA, filteredDataA);
	matrixTrick  = {1,stateCharCount};
	matrixTrick2 = {1,stateCharCount};

	for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
		matrixTrick  [h] = h;
		matrixTrick2 [h] = 1;
	}

	aminoInfo  = {filteredData.species, filteredData.unique_sites};		// tip data
	aminoInfo2 = {filteredDataA.species, filteredDataA.unique_sites};	// ancestor data

	GetDataInfo    (dupInfo, filteredData);
	GetDataInfo	   (dupInfoA, filteredDataA);

	matrixTrick  = {1,stateCharCount};
	matrixTrick2 = {1,stateCharCount};

	for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
		matrixTrick  [h] = h;
		matrixTrick2 [h] = 1;
	}

	for (v=0; v<filteredData.unique_sites;v=v+1) {
		for (h=0; h<filteredData.species;h=h+1) {
			GetDataInfo (siteInfo, filteredData, h, v);
			_SITE_ES_COUNT = matrixTrick2 * siteInfo;
			if (_SITE_ES_COUNT[0] == 1) {
				siteInfo = matrixTrick * siteInfo;
				aminoInfo[h][v] = siteInfo[0];
			}
			else {
				aminoInfo[h][v] = -1;
			}
		}
	}

	for (v=0; v<filteredDataA.unique_sites;v=v+1) {
		for (h=0; h<filteredDataA.species;h=h+1) {
			GetDataInfo (siteInfo, filteredDataA, h, v);
			siteInfo = matrixTrick * siteInfo;
			aminoInfo2[h][v] = siteInfo[0];
		}
	}

	ExecuteCommands ("flatTreeRep = Abs ("+treeid+");");
	GetInformation (seqStrings, filteredData);

	return 0;
}


/* ____________________________________________________________________	*/
/*	Generate a tab-separated table where each row corresponds to a 		*/
/*	branch in the tree, and	each column corresponds to a codon position */
/*	in the aligned sequences.  Populate with 0's and 1's, i.e. binary 	*/
/*	variables, for presence or absence of a nonsynonymous substitution	*/
/*	at that position in each branch.									*/
function reconstructAncestors (rep)
{
    if (option > 0) {
    	writeToFile = outfile+rep;
    } else {
        writeToFile = outfile;
    }
	fprintf (stdout, "Writing to file ", writeToFile, "\n");
	fprintf (writeToFile, CLEAR_FILE, KEEP_OPEN);

	k = filteredData.species + 1;		// root ancestral sequence

	for (h = 1; h < filteredDataA.species; h = h + 1) {
    	// for every internal branch
		aa_count = 0;

		p1 = seqToBranchMap[k][0];	// get position of root in tree ordering
		pid = flatTreeRep[p1];
		p2 = seqToBranchMap[pid][1] - filteredData.species;

		bn = branchNames [p1];


		for (site = 0; site < filteredDataA.sites; site = site + 1) {
			if (output_option == 0 && site > 0) {
				fprintf (writeToFile, ",");
			}

		    // for every site in sequence
			c1 = dupInfoA[site];
			aa1 = aminoInfo2[h] [c1];	// derived codon state on branch
			aa2 = aminoInfo2[p2][c1];	// ancestral	"		"

			// handle gaps
			if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) {
				if (output_option == 0) {
				    // CSV formatting
					fprintf (writeToFile, "0");
				}
				continue;
			}

			if (aa1 != aa2)	{
				if (output_option == 0)	{
    				// CSV matrix format
					fprintf (writeToFile, "1");
				}
				else {
				    // list format
					fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
					aa_count = aa_count + 1;
				}
			} else {
				if (output_option == 0) {
					fprintf (writeToFile, "0");
				}
			}
		}
		// end loop over sites

		if (output_option == 0) {
			fprintf (writeToFile, "\n");	// end of line
		}

		k = k + 1;		// update parent sequence
	}

	// now do the leaves
	for (h = 0; h < filteredData.species; h = h + 1) {
		aa_count = 0;

		p1 = seqToBranchMap[h][0];
		pid = flatTreeRep[p1];
		p2 = seqToBranchMap[pid][1]-filteredData.species;

		bn = branchNames [p1];

		for (site = 0; site < filteredDataA.sites; site = site + 1) {
			if (output_option == 0 && site > 0) {
				fprintf (writeToFile, ",");
			}

			c1 = dupInfoA[site];
			c2 = dupInfo [site];
			aa2 = aminoInfo2[p2][c1];
			aa1 = aminoInfo [h] [c2];

			if (aa1 >= stateCharCount || aa2 >= stateCharCount || aa1 < 0 || aa2 < 0) {
				if (output_option == 0) {
					fprintf (writeToFile, "0"); // gap
				}
				continue;
			}

            if (aa1 != aa2)	{
                if (output_option == 0) {
                    fprintf (writeToFile, "1");
                } else {
                    fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
                    aa_count = aa_count + 1;
                }
            } else {
                if (output_option == 0) {
                    fprintf (writeToFile, "0");
                }
            }
		}

		if (output_option == 0) {
			fprintf (writeToFile, "\n");	// end line
		}
	}
	// START 20070926SLKP: CLOSE_FILE is needed to flush all to disk
	fprintf (writeToFile, CLOSE_FILE);

	return 0;		// unused
}


/* ====================================================================================	*/
/*		MAIN LOOP																		*/
/* ==================================================================================== */

stateCharCount = 20;	/* there are 64 possible codons, including termination codons */


/*START 20070926SLKP: handle the case of failing to select an LF */
retcode = importLikelihoodFunction(0);
if (retcode == 1) {
    ChoiceList (option, "Ancestor reconstruction option: ", 1, SKIP_NONE,
                    "Maximum likelihood", "Map most likely reconstruction to internal nodes.",
                    "Sampling", "Map reconstructions at nodes in propoprtion to likelihood.");

    ChoiceList (output_option, "Output option: ", 1, SKIP_NONE,
                    "Binary matrix", "Write comma-separated (CSV) binary matrices to file, where 1 indicates a site-specific substitution event.",
                    "List", "List amino acid substitutions per branch.");

    SetDialogPrompt ("Provde a filename to write output(s) to: ");
    fprintf (PROMPT_FOR_FILE, CLEAR_FILE);
    outfile = LAST_FILE_PATH;

    if (option == 1) {
        fprintf (stdout, "Enter number of replicates to sample: ");
        fscanf (stdin, "Number", max_reps);

        for (this_rep = 0; this_rep < max_reps; this_rep = this_rep+1) {
            setupMapToTree(1);
            reconstructAncestors(this_rep);
        }
    } else {
        setupMapToTree(0);
        reconstructAncestors(0);
    }
}
	acids = {{"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}};

	function importLikelihoodFunction (dummy)
	{
	SetDialogPrompt ("Choose a file containing an exported likelihood function:");
	fscanf(PROMPT_FOR_FILE, "Raw", importLF);

	ExecuteCommands(importLF);

	howManyLFs = Rows("LikelihoodFunction");
	if (howManyLFs>1) {
	ChoiceList (likelihoodFnChoice,"Choose a Likelihood Function",1,NO_SKIP,LikelihoodFunction);
	if (likelihoodFnChoice<0) {
	return 0; // cancelled
	}
	}
	else
	{
	if (howManyLFs == 1) {
	// only one available - use it
	likelihoodFnChoice = 0;
	}
	else {
	// no LFs available; die
	fprintf (stdout, "ERROR: no likelihood functions available for ancestral state reconstruction!\n");
	return 0;
	}
	}

	GetString (lfid,LikelihoodFunction,likelihoodFnChoice);

	// extract identifiers from likelihood function
	ExecuteCommands("GetString (recep, "+lfid+", -1)");

	dataid = (recep["Datafilters"])[0][0]; //identifier for data filter
	fprintf(stdout, "Retrieved data partition ", dataid, "\n");

	ExecuteCommands ("DataSetFilter filteredData = CreateFilter ("+dataid+",1);");

	treeid = (recep["Trees"])[0][0]; //identifier for tree
	fprintf(stdout, "Retrieved tree ", treeid, "\n");

	return 1;
	}


	/* ____________________________________________________________________ */
	/* Assembles a key as two column vectors mapping sequence names to */
	/* branch IDs. */
	function setupMapToTree (sampling_option)
	{
	// reconstruct ancestors from imported LF
	if (sampling_option == 1) {
	ExecuteCommands("DataSet ancestralSeqs = SampleAncestors ("+lfid+");");
	} else {
	ExecuteCommands("DataSet ancestralSeqs = ReconstructAncestors ("+lfid+");");
	}

	// generate data set filter of amino acid sequences from reconstructed ancestors
	DataSetFilter filteredDataA = CreateFilter(ancestralSeqs, 1);

	// get AA frequencies from data
	ExecuteCommands ("HarvestFrequencies (observedCEFV, "+dataid+",1,1,0);");
	HarvestFrequencies (observedCEFV, filteredData, 1, 1, 0);

	stateCharCount = 20;

	ExecuteCommands ("branchNames = BranchName ("+treeid+",-1);");
	// -1 means retrieve for all branchces in tree
	h = Columns (branchNames); // total number of branches in tree
	seqToBranchMap = {h, 2}; // re-using container!

	// maps sequence names to branch order in column 1
	// and the other way around in column 2
	for (k=0; k<filteredData.species; k=k+1) // for every extant sequence (tip)
	{
	GetString (seqName, filteredData, k);
	seqToBranchMap[k][0] = -1;
	for (v=0; v<h; v=v+1)
	{
	if (branchNames[v] % seqName) { // string comparison
	seqToBranchMap[k][0] = v; // map tip name ordering to data filter
	seqToBranchMap[v][1] = k; // and vice versa
	break;
	}
	}
	}

	seqToBranchMap[filteredData.species][0] = h-1;
	seqToBranchMap[h-1][1] = filteredData.species;

	for (k=1; k<filteredDataA.species; k=k+1) {
	// for every internal branch
	GetString (seqName, filteredDataA, k);
	seqToBranchMap[filteredData.species+k][0] = -1;
	for (v=0; v<h; v=v+1) {
	if (branchNames[v] % seqName) {
	seqToBranchMap[k+filteredData.species][0] = v;
	seqToBranchMap[v][1] = k+filteredData.species;
	break;
	}
	}
	}

	GetDataInfo (dupInfo, filteredData);
	GetDataInfo (dupInfoA, filteredDataA);
	matrixTrick = {1,stateCharCount};
	matrixTrick2 = {1,stateCharCount};

	for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
	matrixTrick [h] = h;
	matrixTrick2 [h] = 1;
	}

	aminoInfo = {filteredData.species, filteredData.unique_sites}; // tip data
	aminoInfo2 = {filteredDataA.species, filteredDataA.unique_sites}; // ancestor data

	GetDataInfo (dupInfo, filteredData);
	GetDataInfo (dupInfoA, filteredDataA);

	matrixTrick = {1,stateCharCount};
	matrixTrick2 = {1,stateCharCount};

	for (h=Columns(matrixTrick)-1; h>=0; h=h-1) {
	matrixTrick [h] = h;
	matrixTrick2 [h] = 1;
	}

	for (v=0; v<filteredData.unique_sites;v=v+1) {
	for (h=0; h<filteredData.species;h=h+1) {
	GetDataInfo (siteInfo, filteredData, h, v);
	_SITE_ES_COUNT = matrixTrick2 * siteInfo;
	if (_SITE_ES_COUNT[0] == 1) {
	siteInfo = matrixTrick * siteInfo;
	aminoInfo[h][v] = siteInfo[0];
	}
	else {
	aminoInfo[h][v] = -1;
	}
	}
	}

	for (v=0; v<filteredDataA.unique_sites;v=v+1) {
	for (h=0; h<filteredDataA.species;h=h+1) {
	GetDataInfo (siteInfo, filteredDataA, h, v);
	siteInfo = matrixTrick * siteInfo;
	aminoInfo2[h][v] = siteInfo[0];
	}
	}

	ExecuteCommands ("flatTreeRep = Abs ("+treeid+");");
	GetInformation (seqStrings, filteredData);

	return 0;
	}



	/* ____________________________________________________________________ */
	/* Generate a tab-separated table where each row corresponds to a */
	/* branch in the tree, and each column corresponds to a codon position */
	/* in the aligned sequences. Populate with 0's and 1's, i.e. binary */
	/* variables, for presence or absence of a nonsynonymous substitution */
	/* at that position in each branch. */
	function reconstructAncestors (rep)
	{
	if (option > 0) {
	writeToFile = outfile+rep;
	} else {
	writeToFile = outfile;
	}
	fprintf (stdout, "Writing to file ", writeToFile, "\n");
	fprintf (writeToFile, CLEAR_FILE, KEEP_OPEN);

	k = filteredData.species + 1; // root ancestral sequence

	for (h = 1; h < filteredDataA.species; h = h + 1) {
	// for every internal branch
	aa_count = 0;

	p1 = seqToBranchMap[k][0]; // get position of root in tree ordering
	pid = flatTreeRep[p1];
	p2 = seqToBranchMap[pid][1] - filteredData.species;

	bn = branchNames [p1];


	for (site = 0; site < filteredDataA.sites; site = site + 1) {
	if (output_option == 0 && site > 0) {
	fprintf (writeToFile, ",");
	}

	// for every site in sequence
	c1 = dupInfoA[site];
	aa1 = aminoInfo2[h] [c1]; // derived codon state on branch
	aa2 = aminoInfo2[p2][c1]; // ancestral " "

	// handle gaps
	if (aa1 >= stateCharCount \|\| aa2 >= stateCharCount \|\| aa1 < 0 \|\| aa2 < 0) {
	if (output_option == 0) {
	// CSV formatting
	fprintf (writeToFile, "0");
	}
	continue;
	}

	if (aa1 != aa2) {
	if (output_option == 0) {
	// CSV matrix format
	fprintf (writeToFile, "1");
	}
	else {
	// list format
	fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
	aa_count = aa_count + 1;
	}
	} else {
	if (output_option == 0) {
	fprintf (writeToFile, "0");
	}
	}
	}
	// end loop over sites

	if (output_option == 0) {
	fprintf (writeToFile, "\n"); // end of line
	}

	k = k + 1; // update parent sequence
	}

	// now do the leaves
	for (h = 0; h < filteredData.species; h = h + 1) {
	aa_count = 0;

	p1 = seqToBranchMap[h][0];
	pid = flatTreeRep[p1];
	p2 = seqToBranchMap[pid][1]-filteredData.species;

	bn = branchNames [p1];

	for (site = 0; site < filteredDataA.sites; site = site + 1) {
	if (output_option == 0 && site > 0) {
	fprintf (writeToFile, ",");
	}

	c1 = dupInfoA[site];
	c2 = dupInfo [site];
	aa2 = aminoInfo2[p2][c1];
	aa1 = aminoInfo [h] [c2];

	if (aa1 >= stateCharCount \|\| aa2 >= stateCharCount \|\| aa1 < 0 \|\| aa2 < 0) {
	if (output_option == 0) {
	fprintf (writeToFile, "0"); // gap
	}
	continue;
	}

	if (aa1 != aa2) {
	if (output_option == 0) {
	fprintf (writeToFile, "1");
	} else {
	fprintf (writeToFile, rep, "\t", bn, "\t", site+1, "\t", acids[aa2], "-->", acids[aa1], "\n");
	aa_count = aa_count + 1;
	}
	} else {
	if (output_option == 0) {
	fprintf (writeToFile, "0");
	}
	}
	}

	if (output_option == 0) {
	fprintf (writeToFile, "\n"); // end line
	}
	}
	// START 20070926SLKP: CLOSE_FILE is needed to flush all to disk
	fprintf (writeToFile, CLOSE_FILE);

	return 0; // unused
	}


	/* ==================================================================================== */
	/* MAIN LOOP */
	/* ==================================================================================== */

	stateCharCount = 20; /* there are 64 possible codons, including termination codons */


	/START 20070926SLKP: handle the case of failing to select an LF /
	retcode = importLikelihoodFunction(0);
	if (retcode == 1) {
	ChoiceList (option, "Ancestor reconstruction option: ", 1, SKIP_NONE,
	"Maximum likelihood", "Map most likely reconstruction to internal nodes.",
	"Sampling", "Map reconstructions at nodes in propoprtion to likelihood.");

	ChoiceList (output_option, "Output option: ", 1, SKIP_NONE,
	"Binary matrix", "Write comma-separated (CSV) binary matrices to file, where 1 indicates a site-specific substitution event.",
	"List", "List amino acid substitutions per branch.");

	SetDialogPrompt ("Provde a filename to write output(s) to: ");
	fprintf (PROMPT_FOR_FILE, CLEAR_FILE);
	outfile = LAST_FILE_PATH;

	if (option == 1) {
	fprintf (stdout, "Enter number of replicates to sample: ");
	fscanf (stdin, "Number", max_reps);

	for (this_rep = 0; this_rep < max_reps; this_rep = this_rep+1) {
	setupMapToTree(1);
	reconstructAncestors(this_rep);
	}
	} else {
	setupMapToTree(0);
	reconstructAncestors(0);
	}
	}