/* Fast, but crude, pairwise structural Alignments of RNA sequences Possible structures of each RNA are encoded in a linear "probability profile", by computing for each base the probability of being unpaired, or paired upstream or downstream. These profiles can be aligned using standard string alignment. The is an extension of the old method in ProfileDist.c with the following changes: - use sequence as well as structure profile for scoring - use similarity alignment instead of distance (maybe add local alinment) - use affine gap costs C Ivo L Hofacker, Vienna RNA Package */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "ViennaRNA/dist_vars.h" #include "ViennaRNA/fold_vars.h" #include "ViennaRNA/part_func.h" #include "ViennaRNA/utils.h" #include "ViennaRNA/profiledist.h" #include "ViennaRNA/ProfileAln.h" #define EQUAL(x,y) (fabs((x)-(y)) <= fabs(x)*2*FLT_EPSILON) PRIVATE int *alignment[2]; PRIVATE void sprint_aligned_bppm(const float *T1, const char *seq1, const float *T2, const char *seq2); PRIVATE double PrfEditScore(const float *p1, const float *p2, char c1, char c2); PRIVATE double average(double x, double y); PRIVATE double open=-1.5, ext=-0.666; /* defaults from clustalw */ PRIVATE double seqw=0.5; PRIVATE int free_ends=1; /* whether to use free end gaps */ /*---------------------------------------------------------------------------*/ PRIVATE float **newmat(int l1, int l2) { float **a; int i; a = (float **) vrna_alloc((l1+1)*sizeof(float *)); for (i=0; i<=l1; i++) a[i] = (float *) vrna_alloc((l2+1)*sizeof(float)); return a; } PUBLIC float profile_aln(const float *T1, const char *seq1, const float *T2, const char *seq2) { /* align the 2 probability profiles T1, T2 */ /* This is like a Needleman-Wunsch alignment, with affine gap-costs ala Gotoh. The score looks at both seq and pair profile */ float **S, **E, **F, tot_score; int i, j, length1, length2; length1 = strlen(seq1); length2 = strlen(seq2); S = newmat(length1, length2); E = newmat(length1, length2); F = newmat(length1, length2); E[0][0] = F[0][0] = open - ext; S[0][0] = 0; for (i=1; i<=length1; i++) F[i][0] = -9999; /* impossible */ for (j=1; j<=length2; j++) E[0][j] = -9999; /* impossible */ if (!free_ends) { for (i=1; i<=length1; i++) S[i][0] = E[i][0] = E[i-1][0] +ext; for (j=1; j<=length2; j++) S[0][j] = F[0][j] = F[0][j-1] +ext; } for (i=1; i<=length1; i++) { for (j=1; j<=length2; j++) { float M; E[i][j] = MAX2(E[i-1][j]+ext, S[i-1][j]+open); F[i][j] = MAX2(F[i][j-1]+ext, S[i][j-1]+open); M = S[i-1][j-1] + PrfEditScore(T1+3*i,T2+3*j, seq1[i-1], seq2[j-1]); S[i][j] = MAX3(M, E[i][j], F[i][j]); } } if (edit_backtrack) { double score=0; char state = 'S'; int pos, i,j; alignment[0] = (int *) vrna_alloc((length1+length2+1)*sizeof(int)); alignment[1] = (int *) vrna_alloc((length1+length2+1)*sizeof(int)); pos = length1+length2; i = length1; j = length2; tot_score = S[length1][length2]; if (free_ends) { /* find starting point for backtracking, search for highest entry in last row or column */ int imax=0; for (i=1; i<=length1; i++) { if (S[i][length2]>score) { score=S[i][length2]; imax=i; } } for (j=1; j<=length2; j++) { if (S[length1][j]>score) { score=S[length1][j]; imax=-j; } } if (imax<0) { for (j=length2; j> -imax; j--) { alignment[0][pos] = 0; alignment[1][pos--] = j; } i=length1; } else { for (i=length1; i>imax; i--) { alignment[0][pos] = i; alignment[1][pos--] = 0; } j=length2; } tot_score=score; } while (i>0 && j>0) { switch (state) { case 'E': score = E[i][j]; alignment[0][pos] = i; alignment[1][pos--] = 0; if (EQUAL(score, S[i-1][j] + open)) state = 'S'; i--; break; case 'F': score = F[i][j]; alignment[0][pos] = 0; alignment[1][pos--] = j; if (EQUAL(score, S[i][j-1] + open)) state = 'S'; j--; break; case 'S': score = S[i][j]; if (EQUAL(score, E[i][j])) state = 'E'; else if (EQUAL(score, F[i][j])) state = 'F'; else if (EQUAL(score, S[i-1][j-1] + PrfEditScore(T1+3*i,T2+3*j, seq1[i-1], seq2[j-1]))) { alignment[0][pos] = i; alignment[1][pos--] = j; i--; j--; } else vrna_message_error("backtrack of alignment failed"); break; } } for (; j>0; j--) { alignment[0][pos] = 0; alignment[1][pos--] = j; } for (; i>0; i--) { alignment[0][pos] = i; alignment[1][pos--] = 0; } for(i=pos+1; i<=length1+length2; i++){ alignment[0][i-pos] = alignment[0][i]; alignment[1][i-pos] = alignment[1][i]; } alignment[0][0] = length1+length2-pos; /* length of alignment */ sprint_aligned_bppm(T1,seq1, T2,seq2); free(alignment[0]); free(alignment[1]); } for (i=0; i<=length1; i++) { free(S[i]); free(E[i]); free(F[i]); } free(S); free(E); free(F); return tot_score; } /*---------------------------------------------------------------------------*/ PRIVATE inline double average(double x, double y) { /* As in Bonhoeffer et al (1993) 'RNA Multi Structure Landscapes', Eur. Biophys. J. 22: 13-24 we have chosen the geometric mean. */ return (float) sqrt(x*y); } PRIVATE double PrfEditScore(const float *p1, const float *p2, char c1, char c2) { double score; int k; for(score=0.,k=0; k<3; k++) score += average(p1[k],p2[k]); score *= (1- seqw); if (c1==c2) score += seqw; else if (((c1=='A') && (c2=='G')) || ((c1=='G') && (c2=='A')) || ((c1=='C') && (c2=='U')) || ((c1=='U') && (c2=='C'))) score += 0.5*seqw; else score -= 0.9*seqw; return score; } /*---------------------------------------------------------------------------*/ PRIVATE void sprint_aligned_bppm(const float *T1, const char *seq1, const float *T2, const char *seq2) { int i, length; length = alignment[0][0]; for (i=0; i<4; i++) { if (aligned_line[i] != NULL) free(aligned_line[i]); aligned_line[i] = (char *) vrna_alloc((length+1)*sizeof(char)); } for(i=1; i<=length; i++){ if (alignment[0][i]==0) aligned_line[0][i-1] = aligned_line[2][i-1] = '_'; else { aligned_line[0][i-1] = vrna_bpp_symbol(T1+alignment[0][i]*3); aligned_line[2][i-1] = seq1[alignment[0][i]-1]; } if (alignment[1][i]==0) aligned_line[1][i-1] = aligned_line[3][i-1] = '_'; else { aligned_line[1][i-1] = vrna_bpp_symbol(T2+alignment[1][i]*3); aligned_line[3][i-1] = seq2[alignment[1][i]-1]; } } } PUBLIC int set_paln_params(double gap_open, double gap_ext, double seq_weight, int freeends) { open = (gap_open>0) ? -gap_open : gap_open; ext = (gap_ext>0) ? -gap_ext : gap_ext; if (open > ext) vrna_message_warning( "Gap extension penalty is smaller than " "gap open. Do you realy want this?"); seqw = seq_weight; if (seqw<0) { seqw = 0; vrna_message_warning("Sequence weight set to 0 (must be in [0..1])"); } else if (seqw>1) { seqw = 1; vrna_message_warning("Sequence weight set to 1 (must be in [0..1])"); } free_ends = (freeends) ? 1 : 0; return 0; } /*---------------------------------------------------------------------------*/