#! /usr/bin/env python # meaRNAborPartitionFunction.py # P.Clote # Computes for each k the MEA(k) secondary structure # using McCaskill base pairing probabilities. #---------------------------------------------------------- # FUNCTIONS DEFINED # cleanInput(seq) # readInput(filename) # getDotPlot(rna) # case1(L0,i,j) # case2(L0,i,j) # case3(L0,i,j,r) # initializePartitionFunctionMatrix(rna,S0,Kmax,RT,P,Q,Z) # computePartitionFunctionMatrix(rna,S0,Kmax,RT,Z,P,Q,alpha,beta) # computeBoltzmannMEAProb(rna,S0,Kmax,RT,P,Q,alpha,beta) # main(rna,S0,alpha,beta,Kmax,RT) #---------------------------------------------------------- import sys,os,tempfile,string from math import exp,log from misc import THRESHOLD,basePair,basePairDistance, list2string,basePairList import vienna DEBUG = 0 INF = sys.maxint THETA = 3 NUCL = ['A','C','G','U'] RNAfold = '/usr/src/ViennaRNA-1.8.3/Progs/RNAfold' #RNAfold = /usr/local/bin/RNAfold' def cleanInput(seq): seq = seq.upper().replace('T','U') s = "" for ch in seq: if ch in NUCL: s += ch return s def readInput(filename): file = open(filename) line = file.readline() if line.strip()[0]=='>': FASTA = line.strip()[1:] line = file.readline() else: FASTA = "rnaSequence" rna = cleanInput(line.strip()) secStr = file.readline().strip() line = file.readline() if line: words = line.split() alpha = float(words[0]) beta = float(words[1]) if len(words)>2: Kmax = int(words[2]) else: Kmax = len(rna) if len(words)>3: RT = float(words[3]) else: RT = len(rna) else: alpha = 1 beta = 1 Kmax = len(rna) RT = len(rna) return rna,secStr,alpha,beta,Kmax,RT def getDotPlot(rna): n = len(rna) P = {} #dictionary of base pair probabilities: for i=i and y==j: num+=1 if DEBUG: X = []; Y = [] for (x,y) in L0: if i<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case1,i,j,num,diff" print i,j,num,diff assert num==diff return num def case2(L0,i,j): #d( S0[i+1,j-1]+{(i,j)}, S0[i,j] ) num = 0 if (i,j) not in L0: num+=1 for (x,y) in L0: if x==i and yi and y==j: num+=1 if DEBUG: X = [(i,j)]; Y = [] for (x,y) in L0: if i+1<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case2,i,j,num,diff" print i,j,num,diff assert num==diff return num def case3(L0,i,j,r): #d( S0[i,r-1]+S0[r+1,j-1]+{(r,j)], S0[i,j] ) assert(i < r <= j-THRESHOLD-1) num = 0 if (r,j) not in L0: num+=1 for (x,y) in L0: if i<=x<=r-1 and r+1<=y<=j-1: num+=1 if x==r and r+1<=y<=j-1: num+=1 if i<=x and x!=r and y in [r,j]: num+=1 if DEBUG: X = [(r,j)]; Y = [] for (x,y) in L0: if i<=x and y<=r-1: X.append((x,y)) if r+1<=x and y<=j-1: X.append((x,y)) if i<=x and y<=j : Y.append((x,y)) diff = 0 for (x,y) in X: if (x,y) not in Y: diff += 1 for (x,y) in Y: if (x,y) not in X: diff += 1 print "case3,i,j,r,num,diff" print i,j,r,num,diff assert num==diff return num def initializePartitionFunctionMatrix(rna,S0,Kmax,RT,P,Q,Z): #Z is dictionary of form Z[(i,j,k)] = MEA score of rna[i],...,rna[j] #with respect to structures having k base pair difference with S0 #This function initializes energy to be 0 for k>0 #WARNING: S0 is unchanged, but secStrList0 is list of (i,j) with 1..i,j..n #Indices of RNA go from 1,...,n due to dummy character '$' secStrList0 = basePairList('$'+S0) n = len(rna) rna = '$'+rna #preinitialize for i in range(1,n+1): for j in range(1,n+1): for k in range(0,Kmax+1): if i<=j: if k==0: Z[(i,j,k)] = 1 else: #k>0 Z[(i,j,k)] = 0 else: #i>j if k==0: Z[(i,j,k)] = 1 else: #k>0 Z[(i,j,k)] = 0 #now set to correct values for d in range(0,n): #d = 0,...,n-1 for i in range(1,n-d): j = i+d #j = d+1,...,n if j<=i+THETA: #i<=j<=i+THETA prod = 1 for r in range(i,j+1): prod *= exp(beta*Q[r]/RT) Z[(i,j,0)] = prod elif i+THETA+1==j: prod = 1; num = 0 for r in range(i+1,j): prod *= exp(beta*Q[r]/RT) #prod=exp(beta*(Q[i+1]+...+Q[j-1])/RT) if (i,j) in secStrList0: Z[(i,j,0)] = prod*exp(2*alpha*P[i][j]/RT) Z[(i,j,1)] = val + beta*(Q[i]+Q[j]) Z[(j,i,0)] = 1.0 Z[(j,i,1)] = 1.0 else: # (i,j) not in secStrList0 Z[(i,j,0)] = prod*exp(beta*(Q[i]+Q[j])/RT) Z[(j,i,0)] = 1.0 if basePair(rna[i],rna[j]): Z[(i,j,1)] = prod*exp(2 * alpha * P[i][j]/RT) Z[(j,i,1)] = 1.0 return def computePartitionFunctionMatrix(rna,S0,Kmax,RT,Z,P,Q,alpha,beta): #------------------------------- # P is dictionary of base pairing probability, P[i][j] defined for i THETA+1 sum = 0; num = 0.0 #Set Z[(i,j,k)] to sum of following cases # Case 1: j unpaired in S[i,j], where S in NOT S0, but arbitrary str b0 = case1(secStrList0,i,j) #$b0=1 if j paired in secStrList0[i,j], else 0 if k-b0>=0: sum += Z[(i,j-1,k-b0)] * exp(beta * Q[j]/RT) num += Z[(j-1,i,k-b0)] # Case 2: (i,j) in S, where S is NOT S0, but arbitrary str if basePair(rna[i],rna[j]): #check if i,j can pair b1 = case2(secStrList0,i,j) #b1 = bpDist(S0[i+1,j-1] +{(i,j)},S0[i,j]) if k-b1>=0: sum += Z[(i+1,j-1,k-b1)] * exp(2*alpha*P[i][j]/RT) num += Z[(j-1,i+1,k-b1)] # Case 3: (r,j) in S, some i=0: sum += Z[(i,r-1,k0)]*Z[(r+1,j-1,k1)]*exp(2*alpha*P[r][j]/RT) num += Z[(r-1,i,k0)]*Z[(j-1,r+1,k1)] Z[(i,j,k)] = sum Z[(j,i,k)] = num for k in range(Kmax+1): Zsum += Z[(1,n,k)] Nsum += Z[(n,1,k)] return Zsum,Nsum def computeBoltzmannMEAProb(rna,S0,Kmax,RT,P,Q,alpha,beta): #------------------------------- # P is dictionary of base pairing probability, P[i][j] defined for i0: BoltzProb[k] = Z[(1,n,k)]/Zsum UnifProb[k] = Z[(n,1,k)]/Nsum return BoltzProb,UnifProb,Z def main(rna,S0,alpha,beta,Kmax,RT): P,Q,n = getDotPlot(rna) BoltzProb,UnifProb,Z = computeBoltzmannMEAProb(rna,S0,Kmax,RT,P,Q,alpha,beta) keys = BoltzProb.keys() keys.sort() print rna print S0 print "Distance\todds" for k in keys: # print "%d\t%f" % (k,UnifProb[k]*Z[(1,n,k)]) # print "%d\t%f" % (k,BoltzProb[k]/UnifProb[k]) # print "%d\t%f\t%f" % (k,UnifProb[k],BoltzProb[k]) print "%d\t%f\t%f\t%f" % (k,UnifProb[k],BoltzProb[k],Z[(1,n,k)]) if __name__ == '__main__': if len(sys.argv) < 2: text = """Usage: %s filename file consists of optional FASTA comment, RNA, sec str, each on separate line. Additionally, weights alpha,beta,Kmax,RT can be given on the 3rd line, separated by tabs. For example: >EMBL1 GGGGCCCCC (.......) 2 1 10 0.6 -- Optionally, the last line can not be given, in which case, alpha=1=beta, Kmax=len(rna), and RT = len(rna). For example: >EMBL1 GGGGCCCCC (.......) -- Optionally, in the last line, only alpha, beta, Kmax can be given, in which case RT = len(rna) -- Optionally, in the last line, only alpha and beta can be given, in which case Kmax = len(rna) and RT = len(rna) -- RNA is upper or lowercase sequence of A,C,G,U (or T) 0 <= alpha,beta are weights of P[i][j] resp Q[i] default values are alpha = 2, beta = 1""" text = text % sys.argv[0] print text sys.exit(1) rna,secStr,alpha,beta,Kmax,RT = readInput(sys.argv[1]) main(rna,secStr,alpha,beta,Kmax,RT)