#! /usr/bin/python # Copyright (C) 2005 Peter Clote # All Rights Reserved. # # Permission to use, copy, modify, and distribute this # software for NON-COMMERCIAL purposes # and without fee is hereby granted provided that this # copyright notice appears in all copies. If you use, modify # or extend this code, please cite the reference # "Symmetric time warping, Boltzmann pair probabilities and # functional genomics", by P. Clote and J. Straubhaar, # submitted to Journal of Mathematical Biology. # # THE AUTHOR MAKES NO REPRESENTATIONS OR # WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE # IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A # PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHOR # SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED # BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING # THIS SOFTWARE OR ITS DERIVATIVES. # # # # NOTE: Most recent version, handles sequences and microarray data # timeWarpExpData.py # P.Clote # Timewarp algorithm for expression data. # This implementation differs from standard Kruskal-Liberman # description in our treatment of analogue of gap in sequence alignment # Our treatment ensures that computing alignment from left to right # is identical to computing alignment from right to left, which is # indispensible in computing Boltzmann alignment. # Such treatment is not reasonable for voice recognition because of # the directionality of time in spoken words (ie drawing out syllables). # However out treatment is MORE reasonable than that of Kruskal-Liberman # for microarray data. # For debugging purposes, this program time warps nucleotide sequences. # To distinguish between nucleotide sequence input and expression data, # use the boolean constant SEQUENCE import string,sys,random,math import tempfile,os import getPAMmatrix from parameters import GAP,LINELEN,INF,EPSILON,k,T,DEBUG,VERBOSE from parameters import getNumericalSequence,printMatrix,getSequence from parameters import getSubsequencesFromAlignment, printSequenceAndAlignment from parameters import SEQUENCE TAU = 10.0 #even sample intervals for yeast time series sequences TAUover2 = float(TAU)/2.0 MU = 40.0/3.0 #even sample intervals for human time series sequences #NOTE: Yeast time series is 0,10min,...,160min for 2 cell cycles # Human time series is 0,2hrs,...,24hrs for 2 cell cycles #Thus with 12 human time points per 16 yeast time points, expansion factor #of 4/3 for human time series taken MUover2 = float(MU)/2.0 TAUMUover2 = float(TAU+MU)/2.0 DEBUG = 0 def printList(L): for x in L: print x def distance(x,y): if SEQUENCE: d = getPAMmatrix.getPAMmatrix('distance.txt') return d[(x,y)] else: return abs(float(x)-float(y)) def isNumber(s): try: x = float(s) ok = 1 except: x = 0 ok = 0 return ok def printPathGraph(L): infilename = tempfile.mktemp() outfilename = tempfile.mktemp() infile = open(infilename,"w") outfile = open(outfilename,"w") #WARNING: infile is write-only and used to write contents of list L #Then infile is passed to gnuplot to produce an eps output graph in outfile for (x,y) in L: infile.write(str(x)+"\t"+str(y)+"\n") infile.flush(); infile.close() #I hope that it stays present long enough to call gnuplot outfile.write("set terminal postscript eps\n") graphFileName = "pathGraph.eps" cmd = 'set output "' + graphFileName +'"\n' cmd += 'set pointsize 3 ' + '\n' cmd += 'set noautoscale ' + '\n' #unscaled axis distances outfile.write(cmd) outfile.write('plot "'+infilename+ '" with linespoints\n') outfile.flush() cmd = "gnuplot " + outfilename os.system(cmd) cmd = "ps2gif " + "pathGraph.eps" os.system(cmd) cmd = "rm -f p2f*.pbm" os.system(cmd) def pathMatrix(a,b): #WARNING: In contrast to sequence alignment, in time warping #we have D[(i,j)] = time warp distance between a[0],...,a[i] #and b[0],...,b[j] for all 0<=i0 or j>0: print i,j,backArrowLists[(i,j)] return (D,backArrowLists) def backwardsPathMatrix(a,b): #WARNING: In contrast to sequence alignment, in time warping #we have E[(i,j)] = time warp distance between a[i],...,a[n-1] #and b[j],...,b[m-1] for all 0>i>n and 0>j>m a = [ a[0] ] + a + [ a[-1] ] b = [ b[0] ] + b + [ b[-1] ] #pad a,b with first,last number for base case treatment n = len(a); m = len(b) E = {}; forwardArrowLists = {} E[(n-1,m-1)] = 0 for j in range(m-2,0,-1): E[(n-1,j)] = E[(n-1,j+1)]+MUover2*distance(a[n-1],b[j]) forwardArrowLists[(n-1,j)] = [(n-1,j+1)] for i in range(n-2,0,-1): E[(i,m-1)] = E[(i+1,m-1)]+TAUover2*distance(a[i],b[m-1]) forwardArrowLists[(i,m-1)] = [(i+1,m-1)] for i in range(n-2,0,-1): for j in range(m-2,0,-1): #determine minimum min = INF x = E[(i+1,j+1)] + TAUMUover2*distance(a[i],b[j]) if x < min: min = x x = E[(i+1,j)]+TAUover2*(distance(a[i],b[j])+distance(a[i],b[j-1]))/2 if x < min: min = x x = E[(i,j+1)]+MUover2*(distance(a[i],b[j])+distance(a[i-1],b[j]))/2 if x < min: min = x E[(i,j)] = min #now determine back arrows forwardArrowLists[(i,j)] = [] x = E[(i+1,j+1)] + TAUMUover2*distance(a[i],b[j]) if x == min: forwardArrowLists[(i,j)].append( (i+1,j+1) ) x = E[(i+1,j)] + TAUover2*distance(a[i],b[j]) if x == min: forwardArrowLists[(i,j)].append( (i+1,j) ) x = E[(i,j+1)] + MUover2*distance(a[i],b[j]) if x == min: forwardArrowLists[(i,j)].append( (i,j+1) ) if DEBUG: print "Forward arrow lists" for i in range(n-1,0,-1): for j in range(m-1,0,-1): if i0 or y>0: #print backArrowLists[(x,y)] #x0,y0 = random.choice( backArrowLists[(x,y)] ) x0,y0 = backArrowLists[(x,y)][0] #pick first direction, which preferentially is diagonal if #that appears in the list if x0+1==x and y0+1==y: A.append( (a[x],b[y]) ) B.append( (a[x],b[y]) ) C.append( (1,1) ) x = x0; y = y0 elif x0==x and y0+1==y: A.append( ('-',b[y]) ) B.append( (a[x],b[y]) ) C.append( (0,1) ) y = y0 elif x0+1==x and y0==y: A.append( (a[x],'-') ) B.append( (a[x],b[y]) ) C.append( (1,0) ) x = x0 A.reverse() B.reverse() C.reverse() path = [] xCoord = 0; yCoord = 0 #coordinates of points in path graph path.append( (xCoord,yCoord) ) #start at origin for (deltaX,deltaY) in C: xCoord += deltaX; yCoord += deltaY path.append( (xCoord,yCoord) ) return (A,B,path) def printAlignment( A ): print "\n-------------------------------------\n" aStar = ""; bStar = "" for (x,y) in A: if SEQUENCE: aStar = ""; bStar = "" for (x,y) in A: aStar += x; bStar += y else: if isNumber(x): chX = str(x)+"\t" else: chX = "-"+"\t" if isNumber(y): chY = str(y)+"\t" else: chY = "-"+"\t" aStar += chX; bStar += chY print aStar print bStar def printDirectionsInPathMatrix( X ): print "\n-------------------------------------\n" for (ch,n) in X: for i in range(n): sys.stdout.write(' ') print ch print if __name__ == '__main__': if len(sys.argv) < 4: print "Usage: %s -s/-f a b " % sys.argv[0] print """ 1) a is expression array entry sequence (-s option) or FASTA file of expression array entry (-f option) 2) b is expression array entry sequence (-s option) or FASTA file of expression array entry (-f option) """ sys.exit(1) flag = sys.argv[1] if flag == '-s': if SEQUENCE: a = list(sys.argv[2]) b = list(sys.argv[3]) else: print "Cannot interactively input gene expression data" sys.exit(1) else: if SEQUENCE: a = list(getSequence(sys.argv[2])) b = list(getSequence(sys.argv[3])) else: a = getNumericalSequence(sys.argv[2]) b = getNumericalSequence(sys.argv[3]) D,arrows = pathMatrix(a,b) A,B,path = align(a,b,D,arrows) E,arrows = backwardsPathMatrix(a,b) printAlignment( A ) printAlignment( B ) printPathGraph(path) print D[(len(a),len(b))] print E[(1,1)] print a print b assert ( abs(D[(len(a),len(b))] - E[(1,1)]) < 0.001 ) if DEBUG: print "\n--------------------------------------" print "Matrix D\n" printMatrix(D) print "\n--------------------------------------" print "Matrix E\n" printMatrix(E)