Skip to content

Instantly share code, notes, and snippets.

@jhawthorn
Last active September 13, 2019 04:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jhawthorn/4752893 to your computer and use it in GitHub Desktop.
Save jhawthorn/4752893 to your computer and use it in GitHub Desktop.
/*
* gotoh.c
* Compute a sequence alignment with an affine gap cost
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <float.h>
#define max(a, b) (((a) > (b)) ? (a) : (b))
typedef double score_t;
#define SCORE_MIN (-DBL_MAX)
struct scores{ score_t match, mismatch, gap_start, gap_extend; };
struct gotoh_result{
const char *s1, *s2;
int M, N;
struct scores scores;
score_t **H, **V, **SIM;
score_t final;
};
/* allocate a MxN matrix of score_t */
score_t **mat_alloc(int m, int n){
int i;
score_t **mat = calloc(m, sizeof(score_t *) + n * sizeof(score_t));
score_t *cells = (score_t *)&mat[m];
for(i = 0; i < m; i++)
mat[i] = &cells[i * n];
return mat;
}
/* print one of the internal matrices */
void mat_print(struct gotoh_result *result, score_t **mat){
int i, j;
printf(" ");
for(j = 0; j < result->N; j++)
printf(" %5c", result->s2[j]);
for(i = 0; i < result->M; i++){
printf("\n %5c", i > 0 ? result->s1[i-1] : ' ');
for(j = 0; j < result->N; j++){
score_t s = mat[i][j];
if(s == SCORE_MIN)
printf(" -inf");
else
printf(" %5g", mat[i][j]);
}
}
printf("\n\n");
}
/* print internal H,V,SIM matrices */
void gotoh_print_mat(struct gotoh_result *result){
printf("H:\n"); mat_print(result, result->H);
printf("V:\n"); mat_print(result, result->V);
printf("SIM:\n"); mat_print(result, result->SIM);
}
/* Dynamic programming portion:
* Gotoh Algorithm
*
* H[][] stores the best score from the left
* H[][] stores the best score from above
* SIM[][] stores the best overall score
*/
void gotoh_dp(struct gotoh_result *result){
score_t **H = result->H;
score_t **V = result->V;
score_t **SIM = result->SIM;
struct scores *scores = &result->scores;
int i, j;
/* initialize first row and column of SIM as gaps */
SIM[0][0] = 0;
SIM[0][1] = SIM[1][0] = scores->gap_start;
for(i = 2; i < result->M; i++)
SIM[i][0] = (SIM[i-1][0] + scores->gap_extend);
for(j = 2; j < result->N; j++)
SIM[0][j] = (SIM[0][j-1] + scores->gap_extend);
/* initialize first row and column of H, V to SCORE_MIN (-inf) */
for(i = 0; i < result->M; i++)
H[i][0] = V[i][0] = SCORE_MIN;
for(j = 0; j < result->N; j++)
H[0][j] = V[0][j] = SCORE_MIN;
for(i = 1; i < result->M; i++){
for(j = 1; j < result->N; j++){
H[i][j] = max( H[i][j-1] + scores->gap_extend,
SIM[i][j-1] + scores->gap_start);
V[i][j] = max( V[i-1][j] + scores->gap_extend,
SIM[i-1][j] + scores->gap_start);
score_t h_or_v = max(H[i][j], V[i][j]);
if(result->s1[i-1] == result->s2[j-1]){
/* match */
SIM[i][j] = max(SIM[i-1][j-1] + scores->match, h_or_v);
}else{
/* mismatch */
SIM[i][j] = max(SIM[i-1][j-1] + scores->mismatch, h_or_v);
}
}
}
}
/* print the alignment */
void gotoh_traceback(struct gotoh_result *result){
unsigned int i = result->M - 1;
unsigned int j = result->N - 1;
size_t maxsize = result->M + result->N;
char *m1 = calloc(maxsize + 1, sizeof(char));
char *m2 = calloc(maxsize + 1, sizeof(char));
/* fill string with - to show gaps */
memset(m1, '-', maxsize);
memset(m2, '-', maxsize);
int k = maxsize;
while(i > 0 && j > 0){
k--;
if(j > 0 && result->SIM[i][j] == result->H[i][j]){
m2[k] = result->s2[--j]; /* gap in s1 */
}else if(i > 0 && result->SIM[i][j] == result->V[i][j]){
m1[k] = result->s1[--i]; /* gap in s2 */
}else{
/* match or mismatch */
m1[k] = result->s1[--i];
m2[k] = result->s2[--j];
}
}
/* end gap */
while(i > 0)
m1[--k] = result->s1[--i];
while(j > 0)
m2[--k] = result->s2[--j];
printf("%s\n", &m1[k]);
printf("%s\n", &m2[k]);
free(m1);
free(m2);
}
struct gotoh_result *gotoh(const char *seq1, const char *seq2, const struct scores scores){
struct gotoh_result *result = malloc(sizeof(struct gotoh_result));
result->s1 = seq1;
result->s2 = seq2;
result->scores = scores;
result->N = strlen(seq2) + 1; /* extra first row */
result->M = strlen(seq1) + 1; /* extra and column */
result->H = mat_alloc(result->M, result->N);
result->V = mat_alloc(result->M, result->N);
result->SIM = mat_alloc(result->M, result->N);
gotoh_dp(result);
result->final = result->SIM[result->M-1][result->N-1];
return result;
}
int main(int argc, char *argv[]){
if(argc != 7){
fprintf(stderr, "USAGE: %s SEQUENCE SEQUENCE MATCH MISMATCH GAPSTART GAPEXTEND\n", argv[0]);
return 1;
}else{
struct scores s = { atof(argv[3]), atof(argv[4]), atof(argv[5]), atof(argv[6]) };
struct gotoh_result *result;
result = gotoh(argv[1], argv[2], s);
gotoh_print_mat(result);
printf("score: %g\n", result->final);
gotoh_traceback(result);
return 0;
}
}
@jhawthorn
Copy link
Author

Copyright (c) 2013 John Hawthorn

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment