/* An example of global sequence alignment by dynamic programming. * * The program is ANSI C and should compile on any machine that has * a C compiler, with a command line like: * gcc -o global global.c * To run the program: * ./global * * To change the sequences or the scoring system, you have to edit the * appropriate #define's at the top of the program, and recompile. * This is meant to be a simple working example of the mechanics of * dynamic programming sequence alignment. The details of reading * FASTA files and such are left as an exercise to the reader... * * original: SRE, Wed Jun 2 2004 * bugfix in traceback condition: SRE, Sun Feb 24 2008 */ /* Define the problem here: two sequences SEQX and SEQY, * and the simple scoring system (MATCH, MISMATCH, and * the linear gap score INDEL). */ #define SEQX "TTCATA" #define SEQY "TGCTCGTA" #define MATCH 5 #define MISMATCH -2 #define INDEL -6 #include #include #include int main(void) { char *x,*y; /* the two sequences, x_1..M and y_1..N */ int M,N; /* lengths of sequences x,y */ int i,j; /* residue coords in x and y */ int **S; /* dynamic programming matrix */ char *ax,*ay; /* result: the two aligned sequences */ int K; /* length of pairwise alignment */ int k; /* index in pairwise alignment, 0..K-1 */ int sc; /* tmp variable used to find best path */ char tmp; /* tmp variable used to swap chars */ /***************************************************************** * Some initial bookkeeping. *****************************************************************/ /* Initialize the sequences x,y, using SEQX and SEQY. * * In C, arrays are indexed 0..N-1, but we want our sequences to be * indexed 1..N, and we need row/column 0 in the DP matrix for * initialization conditions. That's why we do some +1 fiddling * here, so our sequences are x_1..x_M and y_1..y_N. */ M = strlen(SEQX); N = strlen(SEQY); x = malloc(sizeof(char) * (M+2)); /* +2: leading blank, and trailing \0 */ y = malloc(sizeof(char) * (N+2)); strcpy(x+1, SEQX); /* x_1..x_M now defined; x_0 undefined */ strcpy(y+1, SEQY); /* y_1..y_N now defined; y_0 undefined */ /* Allocate the dynamic programming matrix, 0..M rows by 0..N columns. */ S = malloc(sizeof(int *) * (M+1)); for (i = 0; i <= M; i++) S[i] = malloc(sizeof(int) * (N+1)); /***************************************************************** * Recursion: the heart of the DP algorithm. *****************************************************************/ /* Initialization. */ S[0][0] = 0; for (i = 1; i <= M; i++) S[i][0] = i * INDEL; for (j = 1; j <= N; j++) S[0][j] = j * INDEL; /* The dynamic programming, global alignment (Needleman/Wunsch) recursion. */ for (i = 1; i <= M; i++) for (j = 1; j <= N; j++) { /* case #1: i,j are aligned */ if (x[i] == y[j]) S[i][j] = S[i-1][j-1] + MATCH; else S[i][j] = S[i-1][j-1] + MISMATCH; sc = S[i-1][j] + INDEL; /* case #2: i aligned to - */ if (sc > S[i][j]) S[i][j] = sc; sc = S[i][j-1] + INDEL; /* case #3: j aligned to - */ if (sc > S[i][j]) S[i][j] = sc; } /* The result (optimal alignment score) is now in S[M][N]. */ /***************************************************************** * Traceback ***************************************************************** * * Traceback is a little fiddly, maybe a little obvious, and * there's more than one way to do it, so it often gets glossed * over in intros to DP. * * There are two general strategies that people use for * tracebacks. You can keep a "shadow matrix" of traceback * pointers in the recursion, so that you remember which * case won in every cell S(i,j). Alternatively, you can * recapitulate the calculations to figure out which case * won. * * Recapitulation is usually a little simpler, for small examples; * but shadow matrices are probably more common in real-world * implementations. Recapitulation is used here. * * Because we trace backwards, and we don't know the length of * the alignment 'til we're done, we have a bookkeeping problem * with creating the aligned sequences. There's more than one * way to solve it. A simple (but brutish) way is to allocate * for the maximum length alignment (M+N; happens if x,y are * completely unaligned to each other), make the alignment backwards, * then reverse the aligned strings. */ /* Allocate for our aligned strings (which will be 0..K-1) */ ax = malloc(sizeof(char) * (M+N+1)); ay = malloc(sizeof(char) * (M+N+1)); /* Start at M,N; work backwards */ i = M; j = N; k = 0; while (i > 0 || j > 0) { /* Case 1: best was x_i aligned to x_j? */ if (i > 0 && j > 0) { /* watch out for boundaries */ sc = S[i-1][j-1]; if (x[i] == y[j]) sc += MATCH; else sc += MISMATCH; if (sc == S[i][j]) { /* residue i aligns to j: */ ax[k] = x[i]; ay[k] = y[j]; k++; /* record the alignment */ i--; j--; /* move to next cell in trace */ continue; } } /* Case 2: best was x_i aligned to -? */ if (i > 0) { /* watch out for boundaries */ if (S[i-1][j] + INDEL == S[i][j]) { ax[k] = x[i]; ay[k] = '-'; k++; /* record the alignment */ i--; /* move to next cell in trace */ continue; } } /* Case 3: best was - aligned to y_j? */ if (j > 0) { /* watch out for boundaries */ if (S[i][j-1] + INDEL == S[i][j]) { ax[k] = '-'; ay[k] = y[j]; k++; /* record the alignment */ j--; /* move to next cell in trace */ continue; } } } /* Done with the traceback. * Now ax[0..k-1] and ay[0..k-1] are aligned strings, but backwards; * and k is our alignment length. * Reverse the strings and we're done. */ K = k; /* K = remember the alignment length */ for (k = 0; k < K/2; k++) { /* a sly, efficient in-place reversal by swapping pairs of chars: */ tmp = ax[K-k-1]; ax[K-k-1] = ax[k]; ax[k] = tmp; tmp = ay[K-k-1]; ay[K-k-1] = ay[k]; ay[k] = tmp; } /***************************************************************** * Output ***************************************************************** */ printf("Sequence X: %s\n", x+1); printf("Sequence Y: %s\n", y+1); printf("Scoring system: %d for match; %d for mismatch; %d for gap\n", MATCH, MISMATCH, INDEL); printf("\n"); printf("Dynamic programming matrix:\n"); for (j = -1; j <= N; j++) printf(" %c ", j > 0 ? y[j] : ' '); printf("\n"); for (i = 0; i <= M; i++) { printf(" %c ", i > 0 ? x[i] : ' '); for (j = 0; j <= N; j++) printf("%4d ", S[i][j]); printf("\n"); } printf("\n"); printf("Alignment:\n"); printf("X: %s\n", ax); printf("Y: %s\n", ay); printf("Optimum alignment score: %d\n\n", S[M][N]); /* Free the memory we allocated; then exit. */ free(x); free(y); free(ax); free(ay); for (i = 0; i <= M; i++) free(S[i]); free(S); exit(0); }