#!/bin/env python import os, socket, sys, re, popen2 import newick.tree from newick.tree import Tree, Leaf from CoaSim import * from CoaSim.popStructure import Population as P, Sample as S, Merge as M class Labelling(object): def __init__(self): self.d = dict() self.count = 0 def insert(self,label): try: return self.d[label] except KeyError: self.d[label] = self.count self.count += 1 return self.count - 1 def get(self,label): return self.d[label] class BitVector(object): def __init__(self, n): self.vector = [0 for i in xrange(n)] def get(self,i): return self.vector[i] def set(self,i): self.vector[i] = 1 def union(self,other): for i in xrange(len(self.vector)): self.vector[i] = self.vector[i] | other.vector[i] def __str__(self): s = '' for i in xrange(len(self.vector)): s += str(self.vector[i]) return s class Labeller(newick.tree.TreeVisitor): def __init__(self): newick.tree.TreeVisitor.__init__(self) self.labelling = Labelling() def visit_leaf(self,l): self.labelling.insert(l.id) class Folder(newick.tree.TreeVisitor): def __init__(self, labelling): newick.tree.TreeVisitor.__init__(self) self.labelling = labelling def visit_leaf(self,leaf): leaf.bit_vector = BitVector(len(self.labelling.d)) leaf.bit_vector.set(self.labelling.get(leaf.id)) def post_visit_tree(self,t): t.bit_vector = BitVector(len(self.labelling.d)) for (child,_,__) in t.edges: t.bit_vector.union(child.bit_vector) self.update(t.bit_vector,t) class Annotator(Folder): def __init__(self, labelling): Folder.__init__(self, labelling) self.vector_set = dict() def update(self,bv,t): self.vector_set[str(bv)] = 1 class NotFound: pass class Checker(Folder): def __init__(self, root, labelling, vectors): Folder.__init__(self, labelling) self.vector_set = vectors self.root_children = [child for (child,_,_) in root.edges] def update(self,bv,t): if str(bv) not in self.vector_set: if t not in self.root_children: raise NotFound def tree_cmp(t1, t2): l = Labeller() t1.dfs_traverse(l) a = Annotator(l.labelling) t1.dfs_traverse(a) vectors = a.vector_set try: c = Checker(t2, l.labelling, vectors) t2.dfs_traverse(c) return True except NotFound: return False def branchLength(t): class LV(newick.tree.TreeVisitor): def __init__(self): self.sum = 0 def pre_visit_edge(self,src,b,l,dst): self.sum += l lv = LV() t.dfs_traverse(lv) return lv.sum def recombType(t1, t2): epsilon = 1e-5 l1 = branchLength(t1) l2 = branchLength(t2) equal_topology = tree_cmp(t1,t2) if abs(l1-l2) < epsilon and equal_topology: return 1 elif equal_topology: return 2 else: return 3 def _hasIncomp(col1,col2): x = zip(col1,col2) if (0,0) in x and (0,1) in x and (1,0) in x and (1,1) in x: return True else: return False def _getCol(dataMatrix,i): return [row[i] for row in dataMatrix] def incompatible(dataMatrix): noRows = len(dataMatrix) noCols = len(dataMatrix[0]) for i in xrange(noCols): for j in xrange(i): col1 = _getCol(dataMatrix,i) col2 = _getCol(dataMatrix,j) if _hasIncomp(col1,col2): return True return False def filterLeaves2(tree, leaves): if isinstance(tree, Leaf): if tree.id in leaves: return Leaf(tree.id) else: return None else: edges = tree.get_edges() children = [(filterLeaves2(n,leaves),b,l) for n,b,l in edges] filteredChildren = [(c,b,l) for c,b,l in children if c is not None] if len(filteredChildren) == 0: return None else: newTree = Tree() for c,b,l in filteredChildren: newTree.add_edge((c,b,l)) return newTree def removeDeg2Node(tree): if isinstance(tree,Leaf): return tree edges = [(removeDeg2Node(n),b,l) for n,b,l in tree.get_edges()] tree.edges = [removeDeg2Edge(e) for e in edges] return tree def hasDeg2(node): if isinstance(node,Leaf): return False else: return len(node.edges)==1 def removeDeg2Edge(edge): n,_,l = edge if hasDeg2(n): n2,_,l2 = n.edges[0] return (n2,None,l+l2) else: return edge def filterLeaves(tree, leaves): root = filterLeaves2(tree,leaves) while len(root.get_edges()) == 1: root,_,_ = root.get_edges()[0] return removeDeg2Node(root) pairwise = 'LDhat2/SC/pairwise' match1 = re.compile('Is likelihood file an exact match to data?') match2 = re.compile('Do you wish to change grid over which to estimate likelihoods') match3 = re.compile('Maximum at 4Ner\(region\) = (.*) :') match4 = re.compile('Do you wish to carry out a sliding windows analysis?') match5 = re.compile('Print out table of Rmin values?') match6 = re.compile('Lower bound on Rmin = (.*)') match7 = re.compile('Estimate 4Ner by moment method?') match8 = re.compile(': 4Ner = (.*)') match9 = re.compile('Do you wish to test for recombination?') match10= re.compile('Proportion Lkmax <= estimated = (.*)') match11= re.compile('Proportion corr\(r2,d\) <= estimated = (.*)') matchLi= re.compile(' rhobar = (.*)') ##FIXME matches to lower bound, 4Ner and two different significance testings def hudson(): cmd = "./exhap> seqH, noSeqs, noSNPs print >> seqM, noSeqs, noSNPs, 1 li_pos = [p/hudson_pos[noSNPs-1] for p in hudson_pos] for i in xrange(noSNPs): print >> seqH, li_pos[i], print >> seqH print >> seqH, 'label', for q in xrange(noSNPs): print >> seqH, '?', i = 0 for seq in seqs: print >> seqM, '>', i print >> seqM, ''.join([str(a) for a in seq]) print >> seqH, i, seq i += 1 seqM.close() seqH.close() locs = open('/tmp/'+host+'-locs','w') print >> locs, noSNPs, mcvean_pos[noSNPs-1]+100, 'L' print >> locs, ' '.join([str(mcvean_pos[i]+1) for i in xrange(noSNPs)]) locs.close() if len(sys.argv) != 6: print 'usage:', sys.argv[0], 'no-samples beta no-seqs no-snps merge-time' sys.exit(1) host = socket.gethostname() noSamples = int(sys.argv[1]) beta = int(sys.argv[2]) noSeqs = int(sys.argv[3]) noSNPs = int(sys.argv[4]) mergeTime = float(sys.argv[5]) posStart = [1e-10*x for x in xrange(noSNPs/2)] posStop = [(1.0-1e-10)-1e-10*x for x in xrange(noSNPs/2)] posStop.reverse() markerPositions = posStart+posStop markers = [SNPMarker(p, 0.1, 0.9) for p in markerPositions] # mapping seqs*beta*mergeTime -> rho rhoMap = {(20, 0, 0.150): 0.19, (50, 0, 0.150): 0.15, (20, 100, 0.015): 2.22, (50, 100, 0.015): 1.20} import exceptions class reject(exceptions.Exception): pass class rejectionSampler(object): def __init__(self): self.k = None self.time = None self.migCount = 0 def recombinationEvent(self, n1, n2, k): if self.k is not None: raise reject() self.k = k self.time = n1.eventTime def migrationEvent(self,*dummy): self.migCount += 1 sampleNo = 0 while True: try: cb = rejectionSampler() rho = rhoMap[(noSeqs,beta,mergeTime)] popSpec = P(1, M(mergeTime, [P(1,S(noSeqs)),P(1,S(noSeqs))])) arg = simulate(markers, popSpec, rho=rho, beta=beta, callbacks=cb) if cb.k is None: continue # no recombinations trees = [i.tree for i in arg.intervals] t1Str, t2Str = [str(t) for t in trees] t1 = newick.tree.parse_tree(t1Str) t2 = newick.tree.parse_tree(t2Str) leaves1 = map(str, range(0,noSeqs)) leaves2 = map(str, range(noSeqs,2*noSeqs)) t11 = filterLeaves(t1,leaves1) t21 = filterLeaves(t2,leaves1) t12 = filterLeaves(t1,leaves2) t22 = filterLeaves(t2,leaves2) try: seqs = arg.sequences[:noSeqs] i1 = incompatible(seqs) sequenceSetup([t11,t21], seqs) _, _, _, mcveanSign1, r2Sign1 = mcvean() hudson_est1 = hudson() seqs = arg.sequences[noSeqs:] i2 = incompatible(seqs) sequenceSetup([t12,t22], seqs) _, _, _, mcveanSign2, r2Sign2 = mcvean() hudson_est2 = hudson() print cb.k, cb.time, cb.migCount, print recombType(t11,t21), i1, print hudson_est1, mcveanSign1, r2Sign1, print recombType(t12,t22), i2, print hudson_est2, mcveanSign2, r2Sign2 sampleNo += 1 if sampleNo >= noSamples: break except: # something odd happened in the call to the external programs... continue except reject: continue