#! /usr/bin/python2.7 #Amir Bayegan, May 2015 #RNA contact order for the secondary and tertiary structures are computed. #The secondary structure contact order is computed for a secondary structure obtainded from maximizing the number of base pairs predicted by RNAVIEW from pdb files. #3D contact order is calculated after parsing a pdb file and selecting all the atoms in contact(<6 angstrom) #Input is a file containing paths of the pdbfiles. Each line of the file should include just one path. import os,sys,glob,runRnaviewPeter, secStrFromRNAVIEW, misc, math PRINTCONTACTLIST=1 purineAtoms=['N9','C8','N7','C5','C6','O6','N6','N1','C2','N2','N3','C4'] pyrimidineAtoms=['N1','C2','O2','N3','C4','N4','O4','C5','C6'] threeLetterAAcodes = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL' ] def isPurine(nuc): if (nuc=='A' or nuc=='G'): return 1; else: return 0; def isPyrimidine(nuc): if (nuc=='U' or nuc=='C'): return 1; else: return 0; def get2DcontactOrder(secStr): co=0.0 bpList = misc.basePairList(secStr) for bp in bpList: co += abs(bp[1]-bp[0]) if len(bpList) !=0: a_co = float(co)/len(bpList) r_co = co/(len(secStr)*len(bpList)) else: a_co = float("inf") r_co = float("inf") return a_co,r_co def getRecSpec(line): if len(line[17:20])==1: res = line[17:20].strip() elif line[17:20] in threeLetterAAcodes: res = -1 else: res = line[19] return line[22:26].strip(),res ,line[12:16].strip(),line[21].strip(),line[30:38].strip(),line[38:46].strip(),line[46:55].strip() def euclideanDist(c1,c2): return math.sqrt(math.pow(float(c2[2])-float(c1[2]),2) + math.pow(float(c2[1])-float(c1[1]),2) + math.pow(float(c2[0])-float(c1[0]),2)) def get3DpairwiseDistance(pdbName): d={};rnaDict={};ATOMseq={};numAtomsInCont=0;contactList=[];viewCompatibleContactList=[];prevPos=-1 pdbId = pdbName.split('/')[-1].split(".")[0] pdbFile=open(pdbName,'r') line = pdbFile.readline(); while line: if line[:6]=="ATOM " or line[:6]=="HETATM": pos,nuc,atom,chainId,x,y,z = getRecSpec(line); if nuc==-1: #an amino acid is reached break #print pos,nuc,atom,chainId,x,y,z if (pos,chainId) not in d.keys(): d[(pos,chainId)]={} if (isPyrimidine(nuc) and atom in pyrimidineAtoms) or (isPurine(nuc) and atom in purineAtoms): d[(pos,chainId)][atom] = (nuc,x,y,z) #print pos,chainId,d[(pos,chainId)][atom] if prevPos!=pos: if chainId not in ATOMseq.keys(): ATOMseq[chainId]=nuc else: ATOMseq[chainId]+=nuc prevPos=pos elif line[:6]=="ENDMDL": #only the 1st model is condidered if more than one model exists break line = pdbFile.readline(); #print ATOMseq rnaSeqDict = runRnaviewPeter.getRNAfromSEQRES(pdbName) #print rnaSeqDict key = d.keys() key.sort() #print key #find and report discrepencies between SEQRES and ATOM fields for c in ATOMseq.keys(): rnaDict[c]=[rnaSeqDict[c],ATOMseq[c]] if len(ATOMseq[c])!=len(rnaSeqDict[c]): print "ATOM records in %s , chain %s have %d missing positions!" % (pdbName,c,len(rnaSeqDict[c])-len(ATOMseq[c])) #compute pairwise distance between all atoms of all bases for pos1 in key: for pos2 in key: if pos1[0] != pos2[0] and pos1[1]==pos2[1] : # check for different positions and the same chain id for atom1 in d[pos1].keys(): for atom2 in d[pos2].keys(): if euclideanDist(d[pos1][atom1][1:],d[pos2][atom2][1:]) < 6: # atoms are in contact if their distance is less than 6 Angstroms numAtomsInCont +=1 break; #if one atom is in contacts with at least one other heavy atom it is sufficients if numAtomsInCont>=1: #change here to put a threshold for the minimum number of atoms in contact between to base pairs contactList.append([pdbId,pos1[1],d[pos1][atom1][0],pos1[0],pos2[0],numAtomsInCont]) numAtomsInCont=0 if(PRINTCONTACTLIST==1): outFile=open("%s_3DContactList"%pdbId,'w') for c in contactList: outFile.write("%s\t%s\t%s\t%s\t%d\n" % (c[1],c[2],c[3],c[4],c[5])) outFile.close() pdbFile.close() #compute distance between atoms which appear in RNAVIEW numAtomsInCont=0;viewList=[] for chain in ATOMseq.keys(): viewName = pdbId+chain+".rnaview.out" if os.path.exists(viewName): viewFile=open(viewName,'r') for line in viewFile.readlines()[1:]: pos1 = line.split('\t')[0] pos2 = line.split('\t')[2].split()[0] if pos2!='0': viewList.append((pos1,chain)) viewList.append((pos2,chain)) viewFile.close() #posList = [(i[0]) for i in key] #print posList for pos1 in set(viewList): for pos2 in set(viewList): if pos1!=pos2: if pos1 in key and pos2 in key: for atom1 in d[pos1]: for atom2 in d[pos2]: if euclideanDist(d[pos1][atom1][1:],d[pos2][atom2][1:]) < 6: # atoms are in contact if their distance is less than 6 Angstroms numAtomsInCont +=1 break; #if one atom is in contacts with at least one other heavy atom it is sufficients if numAtomsInCont>=1: #change here to put a threshold for the minimum number of atoms in contact between to base pairs viewCompatibleContactList.append([pdbId,pos1[1],d[pos1][atom1][0],pos1[0],pos2[0],numAtomsInCont]) numAtomsInCont=0 else: print "position %s or %s of rnaview for %s not found in pdb" % (pos1[0],pos2[0],pdbId) return if(PRINTCONTACTLIST==1): outFile=open("%s_viewCompat3DContactList"%pdbId,'w') for c in viewCompatibleContactList: outFile.write("%s\t%s\t%s\t%s\t%d\n" % (c[1],c[2],c[3],c[4],c[5])) outFile.close() return contactList,rnaDict,viewCompatibleContactList def get3DcontactOrder(contactList,rnaDict): #hainIdList= set([int(i[0]) for i in contactList[1]]) d={};cnt=0;COlist=[] for l in contactList: diff = abs(float(l[3])-float(l[4])) if diff > 1: #adjacent nucleotides are not considered in the definition of contact order if (l[0],l[1]) in d: #(pdb,chainId) d[(l[0],l[1])] += diff else: d[(l[0],l[1])] = diff cnt += 1 for key in d: abs_co = d[key]/cnt rel_co = abs_co/len(rnaDict[key[1]][1]) #normalized by the sequence length obtained from ATOM COlist.append((key[0],key[1],abs_co,rel_co,rnaDict[key[1]][0],len(rnaDict[key[1]][1]))) return COlist def run2DContactOrder(pdbPath): allowedBP = ["WWc","WWt","SHc","SHt","WSc","WSt","SSt","SSc","HHt","HHc","WHc","WHt","PP","MM"] co2Ddict={};rnaDict={} runRnaviewPeter.RNAview(pdbPath,allowedBP) for viewOutFile in glob.glob("*.rnaview.out"): pdbId = viewOutFile.split('/')[-1].split(".")[0] rna,secStr = secStrFromRNAVIEW.runSecStr(viewOutFile) rnaDict[pdbId]=rna abs_co,rel_co = get2DcontactOrder(secStr) co2Ddict[pdbId] = [rna,secStr,abs_co,rel_co] s = "pdb\tAbs_Co\tRelative_Co\trna\tsecStr\tlength\n" key = co2Ddict.keys() key.sort() for k in key: s += "%s\t%f\t%f\t%s\t%s\t%d\n" %(k,co2Ddict[k][2],co2Ddict[k][3],co2Ddict[k][0],co2Ddict[k][1],len(co2Ddict[k][0])) out = open(pdbPath+"_2DcontactOrder",'w') out.write(s) out.close() def run3DContactOrder(pdbPath): COlist =[];viewList=[] fileList = map(lambda x:x[:-1], open(pdbPath,"r").readlines()) for f in fileList: if os.path.exists(f): contactList,rnaDict,viewCompatibleContactList = get3DpairwiseDistance(f) if contactList: COlist += get3DcontactOrder(contactList,rnaDict) else: print "%s is excluded from 3D computation. Pdb file does not seem like RNA!" % f continue if viewCompatibleContactList: viewList += get3DcontactOrder(viewCompatibleContactList,rnaDict) else: print "No base pairs reported for %s by RNAVIEW. It is excluded from 2D and pseudo-3D!" % f continue else: print "file %s not found!" %f s = "pdb\tAbs_Co\tRelative_Co\trna\tlength(ATOM)\n" for l in COlist: s += "%s%s\t%f\t%f\t%s\t%d\n" %(l[0],l[1],l[2],l[3],l[4],l[5]) out = open(pdbPath+"_3DcontactOrder",'w') out.write(s) out.close() s = "pdb\tAbs_Co\tRelative_Co\trna\tlength(ATOM)\n" for l in viewList: s += "%s%s\t%f\t%f\t%s\t%d\n" %(l[0],l[1],l[2],l[3],l[4],l[5]) out = open(pdbPath+"_viewCompat3DcontactOrder",'w') out.write(s) out.close() if __name__ == "__main__": if len(sys.argv)<2: print "USAGE:%s pdbPath"%sys.argv[0] sys.exit(1) pdbPath=sys.argv[1] #run2DContactOrder(pdbPath) run3DContactOrder(pdbPath)