/* minimum free energy consensus RNA secondary structure prediction with maximum distance base pairs c Ivo Hofacker, Stephan Bernhart Vienna RNA package */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "ViennaRNA/utils.h" #include "ViennaRNA/energy_par.h" #include "ViennaRNA/fold_vars.h" #include "ViennaRNA/pair_mat.h" #include "ViennaRNA/params.h" #include "ViennaRNA/ribo.h" #include "ViennaRNA/alifold.h" #include "ViennaRNA/fold.h" #include "ViennaRNA/loop_energies.h" #ifdef _OPENMP #include #endif #define PAREN #define STACK_BULGE1 1 /* stacking energies for bulges of size 1 */ #define NEW_NINIO 1 /* new asymetry penalty */ #define MAXSECTORS 500 /* dimension for a backtrack array */ #define LOCALITY 0. /* locality parameter for base-pairs */ #define UNIT 100 #define MINPSCORE -2 * UNIT #define NONE -10000 /* score for forbidden pairs */ /* ################################# # GLOBAL VARIABLES # ################################# */ /* ################################# # PRIVATE VARIABLES # ################################# */ PRIVATE vrna_param_t *P = NULL; PRIVATE int **c = NULL; /* energy array, given that i-j pair */ PRIVATE int *cc = NULL; /* linear array for calculating canonical structures */ PRIVATE int *cc1 = NULL; /* " " */ PRIVATE int *f3 = NULL; /* energy of 5' end */ PRIVATE int **fML = NULL; /* multi-loop auxiliary energy array */ PRIVATE int *Fmi = NULL; /* holds row i of fML (avoids jumps in memory) */ PRIVATE int *DMLi = NULL; /* DMLi[j] holds MIN(fML[i,k]+fML[k+1,j]) */ PRIVATE int *DMLi1 = NULL; /* MIN(fML[i+1,k]+fML[k+1,j]) */ PRIVATE int *DMLi2 = NULL; /* MIN(fML[i+2,k]+fML[k+1,j]) */ PRIVATE int **pscore = NULL; /* precomputed array of pair types */ PRIVATE unsigned int length; PRIVATE short **S = NULL; PRIVATE short **S5 = NULL; /*S5[s][i] holds next base 5' of i in sequence s*/ PRIVATE short **S3 = NULL; /*Sl[s][i] holds next base 3' of i in sequence s*/ PRIVATE char **Ss = NULL; PRIVATE unsigned short **a2s = NULL; PRIVATE float **dm = NULL; PRIVATE int olddm[7][7]= {{0,0,0,0,0,0,0}, /* hamming distance between pairs PRIVATE needed??*/ {0,0,2,2,1,2,2} /* CG */, {0,2,0,1,2,2,2} /* GC */, {0,2,1,0,2,1,2} /* GU */, {0,1,2,2,0,2,1} /* UG */, {0,2,2,1,2,0,2} /* AU */, {0,2,2,2,1,2,0} /* UA */}; PRIVATE int energyout; PRIVATE int energyprev; #ifdef _OPENMP /* NOTE: all variables are assumed to be uninitialized if they are declared as threadprivate */ #pragma omp threadprivate(P, c, cc, cc1, f3, fML, Fmi, DMLi, DMLi1, DMLi2, pscore, length, S, dm, S5, S3, Ss, a2s, energyout, energyprev) #endif /* ################################# # PRIVATE FUNCTION DECLARATIONS # ################################# */ PRIVATE void initialize_aliLfold(int length, int maxdist); PRIVATE void free_aliL_arrays(int maxdist); PRIVATE void get_arrays(unsigned int size, int maxdist); PRIVATE short *encode_seq(const char *sequence, short *s5, short *s3, char *ss, unsigned short *as); PRIVATE void make_pscores(const char ** AS, const char *structure,int maxdist, int start); PRIVATE int fill_arrays(const char **strings, int maxdist, char *structure); PRIVATE char *backtrack(const char **strings, int start, int maxdist); /* ################################# # BEGIN OF FUNCTION DEFINITIONS # ################################# */ PRIVATE void initialize_aliLfold(int length, int maxdist){ if (length<1) vrna_message_error("initialize_fold: argument must be greater 0"); get_arrays((unsigned) length, maxdist); make_pair_matrix(); if(P) free(P); vrna_md_t md; set_model_details(&md); P = vrna_params(&md); } /*--------------------------------------------------------------------------*/ PRIVATE void get_arrays(unsigned int size, int maxdist) { int i; c = (int **)vrna_alloc(sizeof(int *)*(size+1)); fML = (int **)vrna_alloc(sizeof(int *)*(size+1)); pscore = (int **)vrna_alloc(sizeof(int *)*(size+1)); f3 = (int *) vrna_alloc(sizeof(int)*(size+2)); /* has to be one longer */ cc = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); cc1 = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); Fmi = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); DMLi = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); DMLi1 = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); DMLi2 = (int *) vrna_alloc(sizeof(int)*(maxdist+5)); for (i=size; i>(int)size-maxdist-5 && i>=0; i--) { c[i] = (int *) vrna_alloc(sizeof(int) *(maxdist+5)); fML[i] = (int *) vrna_alloc(sizeof(int) *(maxdist+5)); pscore[i] = (int *) vrna_alloc(sizeof(int )*(maxdist+5)); } } /*--------------------------------------------------------------------------*/ PRIVATE void free_aliL_arrays(int maxdist) { int i; for(i=0; ilength) maxdist = length; initialize_aliLfold(length, maxdist); for (s=0; strings[s]!=NULL; s++); n_seq = s; S = (short **) vrna_alloc(n_seq*sizeof(short *)); S5 = (short **) vrna_alloc(n_seq*sizeof(short *)); S3 = (short **) vrna_alloc(n_seq*sizeof(short *)); a2s = (unsigned short **) vrna_alloc(n_seq*sizeof(unsigned short *)); Ss = (char **) vrna_alloc(n_seq*sizeof(char *)); for (s=0; s=(int)length-(int)maxdist-4 && i>0; i--) make_pscores((const char **) strings,structure,maxdist,i); energy = fill_arrays(strings, maxdist, structure); free_aliL_arrays(maxdist); return (float) energy/100.; } PRIVATE int fill_arrays(const char **strings, int maxdist, char *structure) { /* fill "c", "fML" and "f3" arrays and return optimal energy */ int i, j, k, length, energy; int decomp, new_fML,MLenergy ; int *type, type_2, tt, s, n_seq, lastf, lastf2, thisj, lastj; lastf = lastf2 = INF; /* int bonus=0;*/ length = (int) strlen(strings[0]); for (s=0; strings[s]!=NULL; s++); n_seq = s; type = (int *) vrna_alloc(n_seq*sizeof(int)); for (j=0; jlength-maxdist-3; j--) { for (i=(length-maxdist-2>0)?length-maxdist-2:1 ; i= 1; i--) { /* i,j in [1..length] */ for (j = i+1; j<=length && j<=i+TURN; j++) { c[i][j-i]=fML[i][j-i]=INF; } for (j = i+TURN+1; j <= length && j <= i+maxdist; j++) { int p, q, psc; /* bonus = 0;*/ for (s=0; s=cv_fact*MINPSCORE) { /* we have a pair 2 consider */ int new_c=0, stackEnergy=INF; /* hairpin ----------------------------------------------*/ for (new_c=s=0; sMLclosing; new_c = MIN2(new_c, MLenergy); new_c = MIN2(new_c, cc1[j-1-(i+1)]+stackEnergy); cc[j-i] = new_c - psc; /* add covariance bonnus/penalty */ if (noLonelyPairs) c[i][j-i] = cc1[j-1-(i+1)]+stackEnergy-psc; else c[i][j-i] = cc[j-i]; } /* end >> if (pair) << */ else c[i][j-i] = INF; /* done with c[i,j], now compute fML[i,j] */ /* free ends ? -----------------------------------------*/ new_fML = fML[i+1][j-i-1]+n_seq*P->MLbase; new_fML = MIN2(fML[i][j-1-i]+n_seq*P->MLbase, new_fML); energy = c[i][j-i]/*+P->MLintern[type]*/; if(dangles){ for (s=0; s 1) ? S5[s][i] : -1, (j < length) ? S3[s][j] : -1, P); } } else{ for (s=0; s1) ? S5[s][i] : -1, S3[s][j], P); } } else{ for(s = 0; s < n_seq; s++){ tt = pair[S[s][i]][S[s][j]]; if(tt==0) tt=7; energy += E_ExtLoop(tt, -1, -1, P); } } if (energy/(j-i+1) < thisf){ thisf = energy/(j-i+1); thisj = j; } energy += f3[j+1]; if(f3[i] > energy){ f3[i] = energy; } } } if(length <= i+maxdist){ j = length; if(c[i][j-i]1) ? S5[s][i] : -1, -1, P); } } else{ for (s=0; s1) ? S5[s][i+1] : -1, S3[s][lastj-i], P); } } else{ for (s=0; s i+1+strlen(ss)) || (do_backtrack==2)){ char *outstr = (char *)vrna_alloc(sizeof(char) * (strlen(prev)+1)); strncpy(outstr, strings[0]+prev_i-1, strlen(prev)); outstr[strlen(prev)] = '\0'; if (csv==1) printf("%s , %6.2f, %4d, %4d\n",prev, energyprev/(100.*n_seq), prev_i,prev_i + (int)strlen(prev)-1); /* if(do_backtrack==1)*/ else { printf("%s (%6.2f) %4d - %4d\n",prev, energyprev/(100.*n_seq), prev_i,prev_i + (int)strlen(prev)-1); } free(outstr); } free(prev); prev = ss; energyprev = energyout; prev_i = i+1; do_backtrack = 0; backtrack_type='F'; } } lastf2 = lastf; lastf = thisf; lastj = thisj; if (i==1) { char *outstr = NULL; if (prev) { outstr = (char *)vrna_alloc(sizeof(char) *(strlen(prev) + 1)); strncpy(outstr, strings[0]+prev_i-1, strlen(prev)); outstr[strlen(prev)] = '\0'; if(csv==1) printf("%s ,%6.2f, %4d, %4d\n", prev, (energyprev)/(100.*n_seq), prev_i,prev_i + (int)strlen(prev)-1); else{ printf("%s (%6.2f) %4d - %4d\n", prev, (energyprev)/(100.*n_seq), prev_i,prev_i + (int)strlen(prev)-1); } } if ((f3[prev_i] != f3[1]) || !prev){ ss = backtrack(strings, i , maxdist); if(outstr) free(outstr); outstr = (char *)vrna_alloc(sizeof(char) * (strlen(ss) + 1)); strncpy(outstr, strings[0], strlen(ss)); outstr[strlen(ss)] = '\0'; printf("%s \n", outstr); if(csv==1) printf("%s ,%6.2f ,%4d ,%4d\n", ss, (f3[1]-f3[1 + strlen(ss)-1])/(100.*n_seq), 1, (int)strlen(ss)-1); else{ printf("%s (%6.2f) %4d - %4d\n", ss, (f3[1]-f3[1 + strlen(ss)-1])/(100.*n_seq), 1, (int)strlen(ss)-1); } free(ss); } if(prev) free(prev); if(outstr) free(outstr); } } { int ii, *FF; /* rotate the auxilliary arrays */ FF = DMLi2; DMLi2 = DMLi1; DMLi1 = DMLi; DMLi = FF; FF = cc1; cc1=cc; cc=FF; for (j=0; j< maxdist+5; j++) {cc[j]=Fmi[j]=DMLi[j]=INF; } if (i<=length-maxdist-4) { c[i-1] = c[i+maxdist+4]; c[i+maxdist+4] = NULL; fML[i-1] = fML[i+maxdist+4]; fML[i+maxdist+4]=NULL; pscore[i-1] = pscore[i+maxdist+4]; pscore[i+maxdist+4] = NULL; if(i > 1) make_pscores((const char**) strings, structure, maxdist, i-1); for(ii=0; ii0) { int ml, fij, cij, traced, i1, j1, mm, p, q, jj=0; int canonical = 1; /* (i,j) closes a canonical structure */ i = sector[s].i; j = sector[s].j; ml = sector[s--].ml; /* ml is a flag indicating if backtracking is to occur in the fML- (1) or in the f-array (0) */ if (ml==2) { structure[i-start] = '('; structure[j-start] = ')'; goto repeat1; } if (j < i+TURN+1) continue; /* no more pairs in this interval */ fij = (ml)? fML[i][j-i] : f3[i]; if (ml == 0) { /* backtrack in f3 */ if (fij == f3[i+1]) { sector[++s].i = i+1; sector[s].j = j; sector[s].ml = ml; continue; } /* i is paired. Find pairing partner */ for (k=i+TURN+1,traced=0; k<=j; k++) { int cc; jj = k+1; cc = c[i][k-(i)]; if (cc1) ? S5[ss][i] : -1, (kMLbase == fij) { /* 3' end is unpaired */ sector[++s].i = i; sector[s].j = j-1; sector[s].ml = ml; continue; } if (fML[i+1][j-(i+1)]+n_seq*P->MLbase == fij) { /* 5' end is unpaired */ sector[++s].i = i+1; sector[s].j = j; sector[s].ml = ml; continue; } cij = c[i][j-i] ; if(dangles){ for (ss=0; ss1) ? S5[ss][i] : -1, (jj-2-TURN) vrna_message_error("backtrack failed in fML"); continue; } repeat1: /*----- begin of "repeat:" -----*/ if (canonical) cij = c[i][j-i]; for (ss=0; ssstack[type[ss]][type_2]; } cij += pscore[i][j-i]; structure[i+1-start] = '('; structure[j-1-start] = ')'; i++; j--; canonical=0; goto repeat1; } canonical = 1; cij += pscore[i][j-i]; { int cc=0; for (ss=0; ss= minq; q--) { if (c[p][q-p]>=INF) continue; for (ss=energy=0; ssMLclosing; if(dangles){ for (ss=0; ss0 && structure[i-1] == '.'; i--) structure[i] = '\0'; } return structure; } /*---------------------------------------------------------------------------*/ PRIVATE short *encode_seq(const char *sequence, short *s5, short *s3, char *ss, unsigned short *as){ unsigned int i,l; short *S; unsigned short p; l = strlen(sequence); S = (short *) vrna_alloc(sizeof(short)*(l+2)); S[0] = (short) l; s5[0]=s5[1]=0; /* make numerical encoding of sequence */ for (i=1; i<=l; i++) { short ctemp = (short)encode_char(toupper(sequence[i-1])); S[i] = ctemp ; } if (oldAliEn) { /*use alignment sequences in all energy evaluations*/ ss[0]=sequence[0]; for (i=1; i0; i--) { char c5; c5=sequence[i-1]; if ((c5=='-')||(c5=='_')||(c5=='~')||(c5=='.')) continue; s5[1] = S[i]; break; } for (i=1; i<=l; i++) { char c3; c3 = sequence[i-1]; if ((c3=='-')||(c3=='_')||(c3=='~')||(c3=='.')) continue; s3[l] = S[i]; break; } } else s5[1]=s3[l]=0; for (i=1,p=0; i<=l; i++) { char c5; c5=sequence[i-1]; if ((c5=='-')||(c5=='_')||(c5=='~')||(c5=='.')) s5[i+1]=s5[i]; else { /* no gap */ ss[p++]=sequence[i-1]; /*start at 0!!*/ s5[i+1]=S[i]; } as[i]=p; } for (i=l; i>=1; i--) { char c3; c3=sequence[i-1]; if ((c3=='-')||(c3=='_')||(c3=='~')||(c3=='.')) s3[i-1]=s3[i]; else s3[i-1]=S[i]; } } return S; } PRIVATE double cov_score(const char **AS, int i, int j) { int n_seq,k,l,s; double score; int pfreq[8]={0,0,0,0,0,0,0,0}; for (n_seq=0; AS[n_seq]!=NULL; n_seq++); for (s=0; sn_seq) return NONE; else for (k=1,score=0.; k<=6; k++) /* ignore pairtype 7 (gap-gap) */ for (l=k; l<=6; l++) /* scores for replacements between pairtypes */ /* consistent or compensatory mutations score 1 or 2 */ score += pfreq[k]*pfreq[l]*dm[k][l]; /* counter examples score -1, gap-gap scores -0.25 */ return cv_fact * ((UNIT*score)/n_seq - nc_fact*UNIT*(pfreq[0] + pfreq[7]*0.25)); } PRIVATE void make_pscores(const char ** AS, const char *structure, int maxd, int i) { /* calculate co-variance bonus for each pair depending on */ /* compensatory/consistent mutations and incompatible seqs */ /* should be 0 for conserved pairs, >0 for good pairs */ int n,j,l; n=S[0][0]; /* length of seqs */ /*first allocate space:*/ pscore[i]=(int *)vrna_alloc((maxd+5)*sizeof(int)); /* pscore[start]-=start;*/ /*fill pscore[start], too close*/ for (j=i+1; (j1) && (j': for (l=i+TURN+1; l<=i+maxd; l++) pscore[i][l-i] = NONE; break; } if (!psij) for (l=i+1; l<=i+maxd; l++) { /*no '(' constraint on i*/ switch (structure[l-1]) { case '(': pscore[i][l-i] = NONE; break; case '<': pscore[i][l-i] = NONE; break; case 'x': pscore[i][l-i] = NONE; break; case ')': pscore[i][l-i] = NONE; break; } } if (hx!=0) { vrna_message_error("%s\nunbalanced brackets in constraint string", structure); } free(stack); } }