#! /usr/bin/env python # meaRNAbor.py # P.Clote # Computes for each k the MEA(k) secondary structure # using McCaskill base pairing probabilities. #---------------------------------------------------------- # FUNCTIONS DEFINED # cleanInput(seq) # readInput(filename) # meaScore(secStr,alpha,beta,P,Q) # getDotPlot(rna) # case1(L0,i,j) # case2(L0,i,j) # case3(L0,i,j,r) # initializeMatrix(rna,S0,Kmax,P,Q,M) # computeEnergyMatrix(rna,S0,Kmax,M,P,Q,alpha,beta) # printMatrix(M,n) # computeMEA_RNAbor(rna,S0,Kmax,P,Q,alpha,beta) # backtrack(i,j,k,M,paren,leftBracket,rightBracket) # main(rna,S0,alpha,beta,Kmax) #---------------------------------------------------------- import sys,os,tempfile,string from misc import THRESHOLD,basePair,basePairDistance, list2string,basePairList import vienna DEBUG = 0 INF = sys.maxint THETA = 3 NUCL = ['A','C','G','U'] RNAfold = '/usr/src/ViennaRNA-1.8.3/Progs/RNAfold' #RNAfold = /usr/local/bin/RNAfold' def cleanInput(seq): seq = seq.upper().replace('T','U') s = "" for ch in seq: if ch in NUCL: s += ch return s def readInput(filename): file = open(filename) line = file.readline() if line.strip()[0]=='>': FASTA = line.strip()[1:] line = file.readline() else: FASTA = "rnaSequence" rna = cleanInput(line.strip()) secStr = file.readline().strip() line = file.readline() if line: words = line.split() alpha = float(words[0]) beta = float(words[1]) if len(words)>2: Kmax = int(words[2]) else: Kmax = len(rna) else: alpha = 1 beta = 1 Kmax = len(rna) return rna,secStr,alpha,beta,Kmax def meaScore(secStr,alpha,beta,P,Q): n = len(secStr) positions = range(1,n+1) L = basePairList('$'+secStr) val = 0 for (i,j) in L: val += 2*alpha*P[i][j] positions.remove(i) positions.remove(j) for i in positions: val += beta*Q[i] return val def getDotPlot(rna): n = len(rna) P = {} #dictionary of base pair probabilities: for i=i and y==j: num+=1 if DEBUG: X = []; Y = [] for (x,y) in L0: if i<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case1,i,j,num,diff" print i,j,num,diff assert num==diff return num def case2(L0,i,j): #d( S0[i+1,j-1]+{(i,j)}, S0[i,j] ) num = 0 if (i,j) not in L0: num+=1 for (x,y) in L0: if x==i and yi and y==j: num+=1 if DEBUG: X = [(i,j)]; Y = [] for (x,y) in L0: if i+1<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case2,i,j,num,diff" print i,j,num,diff assert num==diff return num def case3(L0,i,j,r): #d( S0[i,r-1]+S0[r+1,j-1]+{(r,j)], S0[i,j] ) assert(i < r <= j-THRESHOLD-1) num = 0 if (r,j) not in L0: num+=1 for (x,y) in L0: if i<=x<=r-1 and r+1<=y<=j-1: num+=1 if x==r and r+1<=y<=j-1: num+=1 if i<=x and x!=r and y in [r,j]: num+=1 if DEBUG: X = [(r,j)]; Y = [] for (x,y) in L0: if i<=x and y<=r-1: X.append((x,y)) if r+1<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case3,i,j,r,num,diff" print i,j,r,num,diff assert num==diff return num def initializeMatrix(rna,S0,Kmax,P,Q,M): #M is dictionary of form M[(i,j,k)] = MEA score of rna[i],...,rna[j] #with respect to structures having k base pair difference with S0 #This function initializes energy to be 0 for k>0 #WARNING: S0 is unchanged, but secStrList0 is list of (i,j) with 1..i,j..n #Indices of RNA go from 1,...,n due to dummy character '$' secStrList0 = basePairList('$'+S0) n = len(rna) rna = '$'+rna #preinitialize for i in range(1,n+1): for j in range(1,n+1): for k in range(0,Kmax+1): if i<=j: if k==0: M[(i,j,k)] = 0 else: #k>0 M[(i,j,k)] = -INF else: #i>j M[(i,j,k)] = (0,0,0) #now set to correct values for d in range(0,n): #d = 0,...,n-1 for i in range(1,n-d): j = i+d #j = d+1,...,n if j<=i+THETA: #i<=j<=i+THETA sum = 0 for r in range(i,j+1): sum += beta*Q[r] M[(i,j,0)] = sum # M[(j,i,0)] already set to (0,0,0) elif i+THETA+1==j: val = 0 for r in range(i+1,j): val += beta*Q[r] #val=beta*(Q[i+1]+...+Q[j-1]) if (i,j) in secStrList0: M[(i,j,0)] = 2 *alpha *P[i][j] + val M[(j,i,0)] = (i,0,0) M[(i,j,1)] = val + beta*(Q[i]+Q[j]) # M[(j,i,1)] already set to (0,0,0) else: # (i,j) not in secStrList0 M[(i,j,0)] = val + beta*(Q[i]+Q[j]) # M[(j,i,0)] already set to (0,0,0) if basePair(rna[i],rna[j]): M[(i,j,1)] = 2 * alpha * P[i][j] + val M[(j,i,1)] = (i,0,0) return def computeEnergyMatrix(rna,S0,Kmax,M,P,Q,alpha,beta): #------------------------------- # P is dictionary of base pairing probability, P[i][j] defined for i THETA+1 max = -INF; index = (0,0,0) #Set M[(i,j,k)] to maximum of following cases # Case 1: j unpaired in S[i,j], where S in NOT S0, but arbitrary str b0 = case1(secStrList0,i,j) #$b0=1 if j paired in secStrList0[i,j], else 0 if k-b0>=0: val = M[(i,j-1,k-b0)]+beta * Q[j] if val>max: max = val index = (0,k-b0,0) # backtracking: j unpaired, k-b0 bpdistance on remainder # Case 2: (i,j) in S, where S is NOT S0, but arbitrary str if basePair(rna[i],rna[j]): #check if i,j can pair b1 = case2(secStrList0,i,j) #b1 = bpDist(S0[i+1,j-1] +{(i,j)},S0[i,j]) if k-b1>=0: val = M[(i+1,j-1,k-b1)]+ 2 * alpha * P[i][j] if val>max: max = val index = (i,0,k-b1) #backtracking: (i,j) in S # Case 3: (r,j) in S, some i=0: val = M[(i,r-1,k0)]+M[(r+1,j-1,k1)]+2*alpha*P[r][j] if val>max: max = val index = (r,k0,k1) #backtracking: (r,j) in S, k0 bpDist in left, k1 bpDist in right M[(i,j,k)] = max M[(j,i,k)] = index return def printMatrix(M,n): print "Matrix values" for k in range(0,Kmax+1): print "k=%d" % k for i in range(1,n+1): for j in range(1,n+1): z = M[(i,j,k)] if i<=j: sys.stdout.write("%.2f " % z) else: sys.stdout.write("%.2f " % z[0]) print sys.stdout.write("\n") def computeMEA_RNAbor(rna,S0,Kmax,P,Q,alpha,beta): #------------------------------- # P is dictionary of base pairing probability, P[i][j] defined for i0: if DEBUG: print "BACKTRACK %d" % k backtrack(1,n,k,M,paren,leftBracket,rightBracket) parenString = list2string(paren) paren.remove('$') secStr = list2string(paren) SecStrs.append(secStr) return SecStrs def backtrack(i,j,k,M,paren,leftBracket,rightBracket): #backtrack on [i,j] for k-nbor of S0 restricted to [i,j] if j-i>THRESHOLD and M[(i,j,k)]>0: if DEBUG: print "i,j,k,M[(i,j,k)],M[(j,i,k)]" print i,j,k,M[(i,j,k)],M[(j,i,k)] (r,k0,k1) = M[(j,i,k)] if r > 0: #j base-pairs with r in [i,j] paren[r] = leftBracket #paren has dummy char at position 0 paren[j] = rightBracket #so indices of paren and energy go from 1,...,n if DEBUG: print "+++1) backtrack(r+1,j-1,k1,paren)" print r+1,j-1,k1 print "".join(paren[1:]) backtrack(r+1,j-1,k1,M,paren,leftBracket,rightBracket) if DEBUG: print "+++2) backtrack(i,r-1,k0,paren)" print i,r-1,k0 print "".join(paren[1:]) backtrack(i,r-1,k0,M,paren,leftBracket,rightBracket) else: #r is 0, so j not paired in [i,j] backtrack(i,j-1,k0,M,paren,leftBracket,rightBracket) return def main(rna,S0,alpha,beta,Kmax): P,Q,n = getDotPlot(rna) SecStrs = computeMEA_RNAbor(rna,S0,Kmax,P,Q,alpha,beta) numSecStrs = len(SecStrs) print rna print S0 # print "k\tsecStr\tmea\tenergy" print "Distance\tMEA\tStructure\tEnergy" for k in range(numSecStrs): secStr = SecStrs[k] dist = basePairDistance(S0,secStr) mea = meaScore(secStr,alpha,beta,P,Q) energy = vienna.evalViennaEnergy(rna,secStr) # print "%d\t%s\t%f\t%f" % (k,secStr,mea,energy) print "%d\t%f\t%s\t%f" % (k,mea,secStr,energy) if k!=dist: print k,dist assert(k==dist) if __name__ == '__main__': if len(sys.argv) < 2: text = """Usage: %s filename file consists of optional FASTA comment, RNA, sec str, each on separate line. Additionally, weights alpha,beta,Kmax can be given on a last line, separated by tabs. For example: >EMBL1 GGGGCCCCC (.......) 2 1 10 -- Optionally, the last line can not be given, in which case, alpha=1=beta and Kmax=len(rna). For example: >EMBL1 GGGGCCCCC (.......) -- Optionally, in the last line, only alpha and beta can be given, in which case Kmax = len(rna) -- RNA is upper or lowercase sequence of A,C,G,U (or T) 0 <= alpha,beta are weights of P[i][j] resp Q[i] default values are alpha = 2, beta = 1""" text = text % sys.argv[0] print text sys.exit(1) rna,secStr,alpha,beta,Kmax = readInput(sys.argv[1]) main(rna,secStr,alpha,beta,Kmax)