#! /usr/bin/env python # strandbpseq.py # P.Clote # Computes average number of hairpins in STRAND database # To remove outliers (thus requiring at least MIN number of # sequences before average is reported), set REMOVE_OUTLIERS to 1 import sys,os,stats MIN = 3 REMOVE_OUTLIERS = 1 def aux(filename): #print filename HairpinList = [] numHairpins = 0.0 seqlen = 0 file = open(filename) line = file.readline() hairpinLeft = 0; hairpinRight = 0 while line: if line.strip()[0] in ['#','>']: line = file.readline() continue seqlen += 1 words = line.split() x = int(words[0]) y = int(words[2]) if x>hairpinLeft and 0 hairpinLeft and y > hairpinRight: numHairpins += 1 hairpinLeft = x; hairpinRight = y HairpinList.append( (hairpinLeft,hairpinRight) ) #By reindexing positions in HairpinList, can produce a #bogus list of basepairs, then apply modified NussJac #to determine number of hairpins in a maximal nested structure line = file.readline() file.close() return numHairpins,seqlen def main(dirName): D = {} filenames = os.listdir(dirName) for filename in filenames: filename = dirName+os.sep+filename numHairpins,n = aux(filename) if n not in D.keys(): D[n] = [numHairpins] else: D[n].append( numHairpins ) #now compute averages keys = D.keys() keys.sort() for n in keys: if REMOVE_OUTLIERS: if len(D[n])>MIN: mean,stdev,max,min = stats.getSampleStats(D[n]) print "%d\t%f\t%f\t%d\t%d" % (n,mean,stdev,max,min) else: mean,stdev,max,min = stats.getSampleStats(D[n]) print "%d\t%f\t%f\t%d\t%d" % (n,mean,stdev,max,min) if __name__ == '__main__': if len(sys.argv) < 2: print "%s dirname" % sys.argv[0] sys.exit(1) dirname = sys.argv[1] main(dirname)