#! /usr/bin/python # Copyright (C) 2005 Peter Clote # All Rights Reserved. # # Permission to use, copy, modify, and distribute this # software for NON-COMMERCIAL purposes # and without fee is hereby granted provided that this # copyright notice appears in all copies. If you use, modify # or extend this code, please cite the reference # "Symmetric time warping, Boltzmann pair probabilities and # functional genomics", by P. Clote and J. Straubhaar, # submitted to Journal of Mathematical Biology. # # THE AUTHOR MAKES NO REPRESENTATIONS OR # WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE # IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A # PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHOR # SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED # BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING # THIS SOFTWARE OR ITS DERIVATIVES. # # # import sys,getPAMmatrix SEQUENCE = 1 #1 means nucleotide sequence, 0 means gene expression data #T = .5 T = .5 k = 1 ALPHA = -14 BETA = -2 GAP = -2 INF = sys.maxint-1 LINELEN = 60 EPSILON = 0.01 DEBUG = 0 VERBOSE = 0 def printList(L): for x in L: print x def distance(x,y): if SEQUENCE: d = getPAMmatrix.getPAMmatrix('distance.txt') return d[(x,y)] else: return abs(float(x)-float(y)) def isNumber(s): try: x = float(s) ok = 1 except: x = 0 ok = 0 return ok def getSequence(filename): file = open(filename,"r") line = file.readline() text = file.read() text = text.replace("\n","") if line[0] == '>': return text else: text = line[:-1]+text return text def getNumericalSequence(filename): file = open(filename,"r") line = file.readline() # L = line.split("\t") L = line.split() L = L[1:] #remove gene name L = map(float,L) #convert from string to float return L def printMatrix(M): keys = M.keys() keys.sort() (x,y) = keys[0] for key in keys: (u,v) = key if u==x: # sys.stdout.write("%.4f\t" % M[key]) sys.stdout.write("%f\t" % M[key]) else: x=u # sys.stdout.write("\n%.4f\t" % M[key]) sys.stdout.write("\n%f\t" % M[key]) print "\n" def monus(a,b): if a>b: return a-b else: return 0 def lengthOfSequencesInAlignment(A): #A is alignment of form: # I-LR- # LY-RR #return lengths of words aligned, (3,4) in this example sumA = 0; sumB = 0 for x,y in A: if x!='-': sumA += 1 if y!='-': sumB += 1 return (sumA,sumB) def convertDmatrixToEmatrix(D): #since values of D[key] are pairs of form (score,backarrowList) #we create matrix E containing only scores E = {} for key in D.keys(): first,second = D[key] E[key] = first return E def getSubsequencesFromAlignment(A): #A is alignment of form: # I-LR- # LY-RR #return words aligned, ('ILR','LYRR') in this example a = ""; b = "" for x,y in A: if x!='-': a += x if y!='-': b += y return (a,b) def getStarredSubsequencesFromAlignment(A): #A is alignment of form: # I-LR- # LY-RR #return words aligned, ('I-LR-','LY-RR') in this example aStar = ""; bStar = "" for x,y in A: aStar += x; bStar += y return (aStar,bStar) def printAlignmentWithQuality(A,quality,start=(0,0)): #A is alignment of form: # I-LR- # LY-RR #quality is string of qualities of Boltzmann probabilities #return lengths of words aligned, (3,4) in this example assert len(A)==len(quality) startA,startB = start #offset positions aStar,bStar = getStarredSubsequencesFromAlignment(A) length = len(A) startPos = 1; endPos = min(LINELEN,length) stop = 0 #boolean flag to stop loop while not stop: print "%d\t%s\t%d" %(startPos+startA,aStar[startPos-1:endPos],endPos+startA) print "\t%s\t" %quality[startPos-1:endPos] print "%d\t%s\t%d" %(startPos+startB,bStar[startPos-1:endPos],endPos+startB) print numCharPrinted = endPos-startPos+1 startPos += numCharPrinted endPos = min(endPos+LINELEN,length) if startPos > length: stop = 1 def printSequenceAndAlignment(A,a,b,Probs,LINELEN,start=(0,0)): startA,startB = start; maxStart = max(startA,startB) c,d= getSubsequencesFromAlignment(A) if VERBOSE: print "\nFirst sequence" print a print "\nOptimal subword of first sequence" for i in range(startA): sys.stdout.write(' ') print c print "\nSecond sequence" print b print "\nOptimal subword of second sequence" for i in range(startB): sys.stdout.write(' ') print d quality = "" for i in range(len(Probs)): if Probs[i]>0.75: quality += "+" elif Probs[i]>0.5: quality += "*" elif Probs[i]>0.25: quality += "-" else: quality += " " if VERBOSE: text = """ Explanation: (+: 75\%-100\%), (*: 50\%-75\%), (-: 25\%-50\%), ( : 0\% -25\%)""" print text print "\n-------------------------------------" print "Alignment" printAlignmentWithQuality(A,quality,start) if VERBOSE: aStar,bStar = getStarredSubsequencesFromAlignment(A) print "\n-------------------------------------" print "Table of probabilities in alignment" for i in range(len(A)): print "%s\t%s\t%s" % (aStar[i],bStar[i],Probs[i])