#! /usr/bin/env python # altschulEriksonDinuclShuffle.py # P. Clote # NOTE: One cannot use function "count(s,word)" to count the number # of occurrences of dinucleotide word in string s, since the built-in # function counts only nonoverlapping words, presumably in a left to # right fashion. import sys,string,whrandom LINELEN = 70 def printFASTAformat(text): n = len(text) rows = n/LINELEN for i in range(rows+1): print text[i*LINELEN:(i+1)*LINELEN] def computeCountAndLists(s): #WARNING: Use of function count(s,'UU') returns 1 on word UUU #since it apparently counts only nonoverlapping words UU #For this reason, we work with the indices. #Initialize lists and mono- and dinucleotide dictionaries List = {} #List is a dictionary of lists List['A'] = []; List['C'] = []; List['G'] = []; List['U'] = []; nuclList = ["A","C","G","U"] s = s.upper() s = s.replace("T","U") nuclCnt = {} #empty dictionary dinuclCnt = {} #empty dictionary for x in nuclList: nuclCnt[x]=0 dinuclCnt[x]={} for y in nuclList: dinuclCnt[x][y]=0 #Compute count and lists nuclCnt[s[0]] = 1 nuclTotal = 1 dinuclTotal = 0 for i in range(len(s)-1): x = s[i]; y = s[i+1] List[x].append( y ) nuclCnt[y] += 1; nuclTotal += 1 dinuclCnt[x][y] += 1; dinuclTotal += 1 assert (nuclTotal==len(s)) assert (dinuclTotal==len(s)-1) return nuclCnt,dinuclCnt,List def chooseEdge(x,dinuclCnt): numInList = 0 for y in ['A','C','G','U']: numInList += dinuclCnt[x][y] z = whrandom.random() denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['U'] numerator = dinuclCnt[x]['A'] if z < float(numerator)/float(denom): dinuclCnt[x]['A'] -= 1 return 'A' numerator += dinuclCnt[x]['C'] if z < float(numerator)/float(denom): dinuclCnt[x]['C'] -= 1 return 'C' numerator += dinuclCnt[x]['G'] if z < float(numerator)/float(denom): dinuclCnt[x]['G'] -= 1 return 'G' dinuclCnt[x]['U'] -= 1 return 'U' def connectedToLast(edgeList,nuclList,lastCh): D = {} for x in nuclList: D[x]=0 for edge in edgeList: a = edge[0]; b = edge[1] if b==lastCh: D[a]=1 for i in range(2): for edge in edgeList: a = edge[0]; b = edge[1] if D[b]==1: D[a]=1 ok = 0 for x in nuclList: if x!=lastCh and D[x]==0: return 0 return 1 def eulerian(s): nuclCnt,dinuclCnt,List = computeCountAndLists(s) #compute nucleotides appearing in s nuclList = [] for x in ["A","C","G","U"]: if x in s: nuclList.append(x) #compute numInList[x] = number of dinucleotides beginning with x numInList = {} for x in nuclList: numInList[x]=0 for y in nuclList: numInList[x] += dinuclCnt[x][y] #create dinucleotide shuffle L firstCh = s[0] #start with first letter of s lastCh = s[-1] edgeList = [] for x in nuclList: if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] ) ok = connectedToLast(edgeList,nuclList,lastCh) return ok,edgeList,nuclList,lastCh def shuffleEdgeList(L): n = len(L); barrier = n for i in range(n-1): z = int(whrandom.random() * barrier) tmp = L[z] L[z]= L[barrier-1] L[barrier-1] = tmp barrier -= 1 return L def dinuclShuffle(s): ok = 0 while not ok: ok,edgeList,nuclList,lastCh = eulerian(s) nuclCnt,dinuclCnt,List = computeCountAndLists(s) #remove last edges from each vertex list, shuffle, then add back #the removed edges at end of vertex lists. for [x,y] in edgeList: List[x].remove(y) for x in nuclList: shuffleEdgeList(List[x]) for [x,y] in edgeList: List[x].append(y) #construct the eulerian path L = [s[0]]; prevCh = s[0] for i in range(len(s)-2): ch = List[prevCh][0] L.append( ch ) del List[prevCh][0] prevCh = ch L.append(s[-1]) t = string.join(L,"") return t if __name__ == '__main__': if len(sys.argv)<2: print "Usage: %s RNAsequence [number of random RNAs]" % sys.argv[0] sys.exit(1) s = sys.argv[1] if len(sys.argv)>2: num = int(sys.argv[2]) else: num = 1 for i in range(num): print "> "+str(i+1) seq = dinuclShuffle(s) printFASTAformat(seq)