#! /usr/bin/python


# Copyright (C) 2005 Peter Clote
#  All Rights Reserved.
#  
#  Permission to use, copy, modify, and distribute this
#  software for NON-COMMERCIAL purposes
#  and without fee is hereby granted provided that this
#  copyright notice appears in all copies. If you use, modify
#  or extend this code, please cite the reference
#    "Symmetric time warping, Boltzmann pair probabilities and
#    functional genomics", by P. Clote and J. Straubhaar,
#    submitted to Journal of Mathematical Biology.
#                                                                                  
#  THE AUTHOR MAKES NO REPRESENTATIONS OR
#  WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#  PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHOR
#  SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED
#  BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
#  THIS SOFTWARE OR ITS DERIVATIVES.
#                                                                                  
#  
#  
# P.Clote
# driver.py
# This program computes only the time warping distance
# between Cho's yeast and Cho's human gene expression data,
# in ALL against ALL time warping. With original data, this
# took about 1 week, hence should be written in C.

import string,sys,random,math
import timeWarpExpData 
from timeWarpExpData import pathMatrix


def printDict(D):
  keys = D.keys()
  keys.sort()
  for key in keys:
    print key
    for dist,gene in D[key]:
      print "%s\t%s" % (gene,dist)

def process(yeastFile,humanFile,N=10):
  #(1) Open files and read contents into dictionaries
  outputA = open("outputA","w") #time warp distances
  outputB = open("outputB","w") #all yeast human gene distances
  outputC = open("outputC","w") #for each yeast gene, top 10 human genes
  yeast = open(yeastFileName,"r")
  human = open(humanFileName,"r")
  yeastDict = {}
  humanDict = {}
  lineYeast = yeast.readline()
  while lineYeast:
    Lyeast    = lineYeast.split()
    yeastGene = Lyeast[0]
    Lyeast    = Lyeast[1:]         #remove gene name
    Lyeast    = map(float,Lyeast)  #convert from string to float
    yeastDict[yeastGene] = Lyeast
    lineYeast = yeast.readline()
  yeast.close()
  lineHuman = human.readline()
  while lineHuman:
    Lhuman    = lineHuman.split()
    humanGene = Lhuman[0]
    Lhuman    = Lhuman[1:]         #remove gene name
    Lhuman    = map(float,Lhuman)  #convert from string to float
    humanDict[humanGene] = Lhuman
    lineHuman = human.readline()
  human.close()
  #print yeastDict
  #print humanDict
  #(2) Now process 
#  D = {}
  keysYeast = yeastDict.keys()
  keysHuman = humanDict.keys()
  keysYeast.sort()
  keysHuman.sort()
  for yeastGene in keysYeast:
    L         = [] #list of (timeWarpDistance,human gene)
    Lyeast = yeastDict[yeastGene]
    for humanGene in keysHuman:
      Lhuman = humanDict[humanGene]
      E,backArrowLists = pathMatrix(Lyeast,Lhuman)
      dist      = E[ (len(Lyeast),len(Lhuman)) ]      
      L.append( (dist,humanGene) )
      #print dist
      outputA.write(str(dist)+"\n")
    L.sort()
    for dist,gene in L:
      outputB.write( "%s\t%s\t%s\n" % (yeastGene,gene,dist) )
    L = L[:maxNumOrthologs] 
#    D[yeastGene] = L 
    outputC.write(yeastGene+"\n")
    for dist,gene in L:
      outputC.write( "%s\t%s\n" % (gene,dist) )
    outputA.flush()
    outputB.flush()
    outputC.flush()
  outputA.close()
  outputB.close()
  outputC.close()
#  return D 


if __name__ == '__main__':
  if len(sys.argv) < 3:
    print "Usage: %s yeastNormalized humanNormalized " % sys.argv[0]
    print """
          1) yeastNormalized is file of normalized expression data
             for yeast (gene name followed by tab separated floats)
          2) humanNormalized is file of normalized expression data
             for human (gene name followed by tab separated floats)
          """
    sys.exit(1)
  yeastFileName = sys.argv[1]
  humanFileName = sys.argv[2]
  yeast = open(yeastFileName,"r")
  human = open(humanFileName,"r")
  maxNumOrthologs = 10  
  process(yeast,human,maxNumOrthologs)
#  D = process(yeast,human,maxNumOrthologs)
   #D[yeastGene] = list of maxNumOrthologs many human genes whose
   #time warping distance to the yeast gene is least 
#  printDict(D)