Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active October 30, 2018 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lh3/8cc5817d56f0fb0be54139338be778b6 to your computer and use it in GitHub Desktop.
Save lh3/8cc5817d56f0fb0be54139338be778b6 to your computer and use it in GitHub Desktop.
Position-specific gap open penalty
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>
#include "ksw2.h"
unsigned char seq_nt4_table[256] = {
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
};
static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b)
{
int i, j;
a = a < 0? -a : a;
b = b > 0? -b : b;
for (i = 0; i < m - 1; ++i) {
for (j = 0; j < m - 1; ++j)
mat[i * m + j] = i == j? a : b;
mat[i * m + m - 1] = 0;
}
for (j = 0; j < m; ++j)
mat[(m - 1) * m + j] = 0;
}
int main(int argc, char *argv[])
{
int8_t a = 6, b = 18, q = 30, e = 5;
int c, w = -1;
while ((c = getopt(argc, argv, "w:A:B:O:E:")) >= 0) {
if (c == 'w') w = atoi(optarg);
else if (c == 'A') a = atoi(optarg);
else if (c == 'B') b = atoi(optarg);
else if (c == 'O') q = atoi(optarg);
else if (c == 'E') e = atoi(optarg);
}
if (argc - optind < 2) {
fprintf(stderr, "Usage: ksw2-test [options] <DNA-target> <DNA-query>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -A INT match score [%d]\n", a);
fprintf(stderr, " -B INT mismatch penalty [%d]\n", b);
fprintf(stderr, " -O INT gap open penalty [%d]\n", q);
fprintf(stderr, " -E INT gap extension penalty [%d]\n", e);
return 1;
} else {
int tlen, qlen, i, j, k, l, score, m_cigar = 0, n_cigar = 0, blen;
uint32_t *cigar = 0;
uint8_t *tseq, *qseq;
int8_t mat[25], *pso = 0;
char *out[3];
// prepare for the alignment
ksw_gen_simple_mat(5, mat, a, -b);
tlen = strlen(argv[optind]);
qlen = strlen(argv[optind+1]);
if (optind + 2 < argc) {
j = strlen(argv[optind+2]);
assert(j == tlen);
}
tseq = (uint8_t*)calloc(tlen * 2 + qlen, 1);
qseq = tseq + tlen;
pso = (int8_t*)qseq + qlen;
for (j = 0; j < tlen; ++j) tseq[j] = seq_nt4_table[(uint8_t)argv[optind][j]];
for (j = 0; j < qlen; ++j) qseq[j] = seq_nt4_table[(uint8_t)argv[optind + 1][j]];
if (optind + 2 < argc) {
for (j = 0; j < tlen; ++j) {
int c = (int)argv[optind + 2][j] - '0';
assert(c >= 0 && c <= 127);
pso[j] = c;
}
}
if (w < 0) w = tlen > qlen? tlen : qlen;
// perform alignment
score = ksw_ggd(0, qlen, qseq, tlen, tseq, 5, mat, q, e, w, pso, &m_cigar, &n_cigar, &cigar);
// print alignment
for (k = blen = 0; k < n_cigar; ++k) blen += cigar[k] >> 4;
out[0] = (char*)calloc((blen + 1) * 3, 1);
out[1] = out[0] + blen + 1;
out[2] = out[1] + blen + 1;
for (k = i = j = l = 0; k < n_cigar; ++k) {
int t, op = cigar[k] & 0xf, len = cigar[k] >> 4;
if (op == 0) {
for (t = 0; t < len; ++t) {
out[0][l] = argv[optind][i + t];
out[2][l] = argv[optind+1][j + t];
out[1][l++] = tseq[i + t] == qseq[j + t] && tseq[i + t] < 4? '|' : ' ';
}
i += len, j += len;
} else if (op == 1) {
for (t = 0; t < len; ++t)
out[0][l] = '-', out[2][l] = argv[optind+1][j + t], out[1][l++] = ' ';
j += len;
} else {
for (t = 0; t < len; ++t)
out[0][l] = argv[optind][i + t], out[2][l] = '-', out[1][l++] = ' ';
i += len;
}
}
printf("Score: %d\n%s\n%s\n%s\n", score, out[0], out[1], out[2]);
free(tseq);
free(out[0]);
}
return 0;
}
#ifndef KSW2_H_
#define KSW2_H_
#include <stdint.h>
#include <stdlib.h>
#define KSW_NEG_INF -0x40000000
#ifdef __cplusplus
extern "C" {
#endif
/**
* Global alignment
*
* (first 10 parameters identical to ksw_extz_sse())
* @param m_cigar (modified) max CIGAR length; feed 0 if cigar==0
* @param n_cigar (out) number of CIGAR elements
* @param cigar (out) BAM-encoded CIGAR; caller need to deallocate with kfree(km, )
*
* @return score of the alignment
*/
int ksw_ggd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w,
int8_t *pso, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
#ifdef __cplusplus
}
#endif
/************************************
*** Private macros and functions ***
************************************/
static inline uint32_t *ksw_push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len)
{
if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
if (*n_cigar == *m_cigar) {
*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
cigar = (uint32_t*)realloc(cigar, (*m_cigar) << 2);
}
cigar[(*n_cigar)++] = len<<4 | op;
} else cigar[(*n_cigar)-1] += len<<4;
return cigar;
}
// In the backtrack matrix, value p[] has the following structure:
// bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
// bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
// bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
static inline void ksw_backtrack(int is_rot, int is_rev, int min_intron_len, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
{ // p[] - lower 3 bits: which type gets the max; bit
int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
uint32_t *cigar = *cigar_, tmp;
while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
int force_state = -1;
if (is_rot) {
r = i + j;
if (i < off[r]) force_state = 2;
if (off_end && i > off_end[r]) force_state = 1;
tmp = force_state < 0? p[(size_t)r * n_col + i - off[r]] : 0;
} else {
if (j < off[i]) force_state = 2;
if (off_end && j > off_end[i]) force_state = 1;
tmp = force_state < 0? p[(size_t)i * n_col + j - off[i]] : 0;
}
if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it.
else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
if (force_state >= 0) state = force_state;
if (state == 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
else cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
}
if (i >= 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 3 : 2, i + 1); // first deletion
if (j >= 0) cigar = ksw_push_cigar(&n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
if (!is_rev)
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
*m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar;
}
#endif
#include <stdio.h> // for debugging only
#include <assert.h>
#include "ksw2.h"
typedef struct { int32_t h, e; } eh_t;
int ksw_ggd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w,
int8_t *pso, int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
{
eh_t *eh;
int8_t *qp; // query profile
int32_t i, j, k, gapoe = gapo + gape, score, n_col, *off = 0;
uint8_t *z = 0; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be more complex
assert(m_cigar_ && n_cigar_ && cigar_);
// allocate memory
if (w < 0) w = tlen > qlen? tlen : qlen;
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
qp = (int8_t*)malloc(qlen * m);
eh = (eh_t*)calloc(qlen + 1, 8);
if (m_cigar_ && n_cigar_ && cigar_) {
*n_cigar_ = 0;
z = (uint8_t*)malloc((size_t)n_col * tlen);
off = (int32_t*)calloc(tlen, 4);
}
// generate the query profile
for (k = i = 0; k < m; ++k) {
const int8_t *p = &mat[k * m];
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
}
// fill the first row
eh[0].h = 0, eh[0].e = -gapoe - gapoe;
for (j = 1; j <= qlen && j <= w; ++j)
eh[j].h = -(gapoe + gape * (j - 1)), eh[j].e = -(gapoe + gapoe + gape * j);
for (; j <= qlen; ++j) eh[j].h = eh[j].e = KSW_NEG_INF; // everything is -inf outside the band
// DP loop
for (i = 0; i < tlen; ++i) { // target sequence is in the outer loop
int32_t f = KSW_NEG_INF, h1, st, en, os = pso? -pso[i] : 0, gapoe2 = gapoe + os;
int8_t *q = &qp[target[i] * qlen];
uint8_t *zi = &z[(long)i * n_col];
st = i > w? i - w : 0;
en = i + w + 1 < qlen? i + w + 1 : qlen;
h1 = st > 0? KSW_NEG_INF : -(gapoe + gape * i);
f = st > 0? KSW_NEG_INF : -(gapoe + gapoe + os + gape * i);
off[i] = st;
for (j = st; j < en; ++j) {
// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
// Cells are computed in the following order:
// H(i,j) = max{H(i-1,j-1) + S(i,j), E(i,j), F(i,j)}
// E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
// F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
eh_t *p = &eh[j];
int32_t h = p->h, e = p->e;
uint8_t d; // direction
p->h = h1;
h += q[j];
d = h >= e? 0 : 1;
h = h >= e? h : e;
d = h >= f? d : 2;
h = h >= f? h : f;
h1 = h;
h -= gapoe2;
e -= gape;
d |= e > h? 0x08 : 0;
e = e > h? e : h;
p->e = e;
f -= gape;
d |= f > h? 0x10 : 0; // if we want to halve the memory, use one bit only, instead of two
f = f > h? f : h;
zi[j - st] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
}
eh[en].h = h1, eh[en].e = KSW_NEG_INF;
}
// backtrack
score = eh[qlen].h;
free(qp); free(eh);
if (m_cigar_ && n_cigar_ && cigar_) {
ksw_backtrack(0, 0, 0, z, off, 0, n_col, tlen-1, qlen-1, m_cigar_, n_cigar_, cigar_);
free(z); free(off);
}
return score;
}
gg:ksw2_ggd.c cli.c ksw2.h
$(CC) -Wall -g -O2 -o $@ ksw2_ggd.c cli.c
clean:
rm -fr *.o *.dSYM gg
@lh3
Copy link
Author

lh3 commented Oct 30, 2018

How to run:

git clone https://gist.github.com/8cc5817d56f0fb0be54139338be778b6.git gg
cd gg && make
./gg ACGTATATAGGA ACGTATAGGA 000032100000

Output:

Score: 23
ACGTATATAGGA
|||||  |||||
ACGTA--TAGGA

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment