#! /usr/bin/python # Copyright (C) 2005 Peter Clote # All Rights Reserved. # # Permission to use, copy, modify, and distribute this # software for NON-COMMERCIAL purposes # and without fee is hereby granted provided that this # copyright notice appears in all copies. If you use, modify # or extend this code, please cite the reference # "Symmetric time warping, Boltzmann pair probabilities and # functional genomics", by P. Clote and J. Straubhaar, # submitted to Journal of Mathematical Biology. # # THE AUTHOR MAKES NO REPRESENTATIONS OR # WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE # IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A # PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHOR # SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED # BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING # THIS SOFTWARE OR ITS DERIVATIVES. # # # # P.Clote # driver.py # This program computes only the time warping distance # between Cho's yeast and Cho's human gene expression data, # in ALL against ALL time warping. With original data, this # took about 1 week, hence should be written in C. import string,sys,random,math import timeWarpExpData from timeWarpExpData import pathMatrix def printDict(D): keys = D.keys() keys.sort() for key in keys: print key for dist,gene in D[key]: print "%s\t%s" % (gene,dist) def process(yeastFile,humanFile,N=10): #(1) Open files and read contents into dictionaries outputA = open("outputA","w") #time warp distances outputB = open("outputB","w") #all yeast human gene distances outputC = open("outputC","w") #for each yeast gene, top 10 human genes yeast = open(yeastFileName,"r") human = open(humanFileName,"r") yeastDict = {} humanDict = {} lineYeast = yeast.readline() while lineYeast: Lyeast = lineYeast.split() yeastGene = Lyeast[0] Lyeast = Lyeast[1:] #remove gene name Lyeast = map(float,Lyeast) #convert from string to float yeastDict[yeastGene] = Lyeast lineYeast = yeast.readline() yeast.close() lineHuman = human.readline() while lineHuman: Lhuman = lineHuman.split() humanGene = Lhuman[0] Lhuman = Lhuman[1:] #remove gene name Lhuman = map(float,Lhuman) #convert from string to float humanDict[humanGene] = Lhuman lineHuman = human.readline() human.close() #print yeastDict #print humanDict #(2) Now process # D = {} keysYeast = yeastDict.keys() keysHuman = humanDict.keys() keysYeast.sort() keysHuman.sort() for yeastGene in keysYeast: L = [] #list of (timeWarpDistance,human gene) Lyeast = yeastDict[yeastGene] for humanGene in keysHuman: Lhuman = humanDict[humanGene] E,backArrowLists = pathMatrix(Lyeast,Lhuman) dist = E[ (len(Lyeast),len(Lhuman)) ] L.append( (dist,humanGene) ) #print dist outputA.write(str(dist)+"\n") L.sort() for dist,gene in L: outputB.write( "%s\t%s\t%s\n" % (yeastGene,gene,dist) ) L = L[:maxNumOrthologs] # D[yeastGene] = L outputC.write(yeastGene+"\n") for dist,gene in L: outputC.write( "%s\t%s\n" % (gene,dist) ) outputA.flush() outputB.flush() outputC.flush() outputA.close() outputB.close() outputC.close() # return D if __name__ == '__main__': if len(sys.argv) < 3: print "Usage: %s yeastNormalized humanNormalized " % sys.argv[0] print """ 1) yeastNormalized is file of normalized expression data for yeast (gene name followed by tab separated floats) 2) humanNormalized is file of normalized expression data for human (gene name followed by tab separated floats) """ sys.exit(1) yeastFileName = sys.argv[1] humanFileName = sys.argv[2] yeast = open(yeastFileName,"r") human = open(humanFileName,"r") maxNumOrthologs = 10 process(yeast,human,maxNumOrthologs) # D = process(yeast,human,maxNumOrthologs) #D[yeastGene] = list of maxNumOrthologs many human genes whose #time warping distance to the yeast gene is least # printDict(D)