Emerge
This wiki page provides the HyPhy scripts that were used in a phylogenetic analysis of deep sequence data, presented in a manuscript submitted to PLoS Comput Biol. Many of the Python scripts depend on the HyPhy shared library.
Workflow
A flowchart of the analysis workflow is provided as Supporting Figure S4 of the manuscript.
HyPhy batch files
Name | Description | Dependencies | Link |
---|---|---|---|
emelineV2.5.bf | Screen sequences for indel errors by pairwise alignment to reference | Utility/GrabBag.bf, Utility/DescriptiveStats.bf, and TemplateModels/chooseGeneticCode.def | File:EmelineV2.5.bf |
batchAlignMPI.bf | MPI wrapper for parallel execution of emelineV2.5 on a computing cluster | MPI environment | File:BatchAlignMPI.bf |
convertToNexus.bf | Batch conversion of sequence files to NEXUS format, after censoring stop codons with gap characters. | CleanStopCodons.bf | File:ConvertToNexus.bf |
fit_codon_model.bf | Fit a codon substitution model to replicate trees and sample ancestral sequences from the posterior distribution. | fit_codon_model.ibf, TemplateModels/chooseGeneticCode.def | File:Fit codon model.bf |
fit_codon_model.ibf | Support functions for fit_codon_model.bf | | File:Fit codon model.ibf.txt |
_mpiFitCodon.bf | MPI wrapper for fit_codon_model.bf | MPI environment | File:MpiFitCodon.bf |
fit_indels_v4.bf | Fit a reversible model of indel evolution to integer-valued sequence data encoding indel polymorphisms. | | File:Fit indels v4.bf |
map_in_time.bf | Match geno2pheno predictions based on ancestral sequence reconstructions to internal nodes of replicate trees. | Utility/ReadDelimitedFiles.bf | File:Map in time.bf |
FindMRCAofX4.bf | Identify the internal node corresponding to the most recent common ancestor (MRCA) of all observed CXCR4-using sequences (i.e., with predicted g2p FPR < 3.5); a Python sketch of this criterion follows the table. | Utility/ReadDelimitedFiles.bf, Utility/GrabBag.bf, TreeTools.ibf | File:FindMRCAofX4.bf |
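The MRCA criterion used by FindMRCAofX4.bf can be illustrated with a short Python sketch. This is not the batch file's implementation, only the idea: given a parent map for a rooted tree and the set of tips with predicted g2p FPR below 3.5, walk each tip's path to the root and return the deepest node shared by all of those paths. All of the names below (the parent map, the toy tree, the FPR values) are hypothetical.

```python
def path_to_root(node, parent):
    """List of ancestors from node up to the root (inclusive)."""
    path = [node]
    while node in parent:
        node = parent[node]
        path.append(node)
    return path

def find_mrca(tips, parent):
    """Deepest node that is an ancestor of every tip in `tips`."""
    paths = [path_to_root(tip, parent) for tip in tips]
    shared = set(paths[0])
    for p in paths[1:]:
        shared &= set(p)
    # paths are ordered tip-to-root, so the first shared node is the MRCA
    for node in paths[0]:
        if node in shared:
            return node

# toy tree: ((A,B)Node1,C)Node0, with hypothetical g2p FPR values per tip
parent = {'A': 'Node1', 'B': 'Node1', 'Node1': 'Node0', 'C': 'Node0'}
x4_tips = [tip for tip, fpr in [('A', 1.2), ('B', 3.0), ('C', 80.0)] if fpr < 3.5]
print(find_mrca(x4_tips, parent))  # -> Node1
```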
Python scripts
Please note that these scripts were not designed to be portable: many hard-coded file paths will break when the scripts are run on another computer. In addition, many of these scripts depend on third-party packages such as mpi4py or the HyPhy shared library.
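The scripts that use the HyPhy shared library all follow the same basic pattern: construct a `_THyPhy` instance, push batch-language statements to it with `ExecuteBF`, and read results back through the `sData` string attribute of the returned object. A minimal sketch of that pattern, following the usage in the scripts on this page (the batch-language statements here are only placeholders):

```python
import os
import HyPhy  # Python bindings to the HyPhy shared library

# instantiate HyPhy rooted at the working directory with 2 threads,
# as the scripts below do
hyphy = HyPhy._THyPhy(os.getcwd(), 2)

# execute a batch-language statement; the second argument (False)
# suppresses console output
hyphy.ExecuteBF('ACCEPT_ROOTED_TREES = 1;', False)

# values computed in the batch language come back as a string
# serialization in the .sData attribute
result = hyphy.ExecuteBF('return 6*7;', False)
print result.sData
```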
merge_alignments.py
""" Gather CSV output from emeline, merge identical sequences and organize by patient and time point in a format that is ready for going into BEAST -- i.e., time stamp in sequence header. """ import subprocess from csv import reader import sys, os # load link info infile = open('/Users/apoon/wip/swenson/dutch_v3/info/enum_link.csv', 'rU') lines = infile.readlines() infile.close() # enum,studyID,patientNum,timeToSI,specimen,days.since # 36155A,DS1,13940,-23.05,PLASMA,735 days_d = {} for line in lines: enum, study_id, patnum, time_to_si, specimen, days_since = line.strip('\n').split(',') days_d.update ( { (study_id, time_to_si, specimen) : days_since } ) # get patient information from Luke's spreadsheet lines = reader(open('/Users/apoon/wip/swenson/dutch_v3/info/454 - complete 454 results_repaired.csv', 'rU')) header = lines.next() d = {} for subj_id, patid, enum, avgTtoSI, samptype, x4_count, total_count, percent_x4, top_fpr in lines: d.update ( { enum : {'subject':subj_id, 'is_plasma':(samptype in ['PLASMA','PLA']), 'avgTtoSI':avgTtoSI} } ) # use UNIX find to generate list of paths to files root = '/Users/apoon/wip/swenson/dutch_v3/' p = subprocess.Popen('find ' + root + 'prelim -name "*.prealign.fas.csv"', shell=True, stdout=subprocess.PIPE) paths = [path.strip('\n') for path in p.stdout.readlines()] for path in paths: enum = path.split('/')[-1].split('_')[0] subj_id = d[enum]['subject'] is_plasma = d[enum]['is_plasma'] timing = d[enum]['avgTtoSI'] days_since = days_d[ ( subj_id, timing, 'PLASMA' if is_plasma else 'PBMC' ) ] newpath = root+'merged/'+subj_id+'_'+('PLA' if is_plasma else 'PBM')+'.fasta' if os.path.exists(newpath): outfile = open(newpath, 'a') else: outfile = open(newpath, 'w') infile = open(path, 'rU') lines = infile.readlines() infile.close() # compress alignment (merge reads that are identical after screening for errors with emeline) temp = {} for line in lines: name, orig, score, num_shifts, aligned_ref, aligned_query, aligned_seq = line.strip('\n').split(',') if int(score) < 200: #print aligned_seq continue count = int(name.split()[1]) if aligned_seq not in temp.keys(): temp.update ( {aligned_seq : count} ) else: temp[aligned_seq] += count intermed = [ (v, k) for k, v in temp.iteritems() ] intermed.sort(reverse=True) index = 0 for count, seq in intermed: outfile.write('>' + enum + '_' + str(index) + '_' + str(count) + '_' + days_since + '\n') outfile.write(seq+'\n') index += 1 outfile.close()
randomsampler.py
""" Sample 50 sequences in proportion to their counts in longitudinal 454 data sets. """ import os, random from seqUtils import * sample_size = 50 path = '/Users/apoon/wip/swenson/dutch_v3/merged/' files = os.listdir(path) for f in files: if f.endswith('.fasta'): infile = open(path+f, 'rU') fasta = convert_fasta(infile.readlines()) infile.close() seqs = {} counts = {} for h, s in fasta: enum, index, count, days_since = h.split('_') # generate dictionaries of sequences keyed by indices, stratified by "days since" if not seqs.has_key(days_since): seqs.update ( { days_since : { } } ) seqs[days_since].update ( { index : (h,s) } ) # also make lists of indices replicated by their respective counts if not counts.has_key(days_since): counts.update ( { days_since : [] } ) counts[days_since] += [index] * int(count) outfile = open (path.replace('merged', 'samples')+f.split('.')[0]+'_sample'+str(sample_size) + 'a.fasta', 'w') for days_since in counts.iterkeys(): indices = counts[days_since] sample = [ random.sample(indices, 1)[0] for i in range(sample_size) ] new_index = 0 for ix in sample: h, s = seqs[days_since][ix] outfile.write ('>' + str(new_index) + '_' + h + '\n' + s + '\n') new_index += 1 outfile.close()
parse_beast_xml.py
```python
from xml.etree.ElementTree import ElementTree as Tree
from xml.etree.ElementTree import Element as Node
from Bio import SeqIO
import sys
import os

# _________________________________________________________________

if len(sys.argv) != 4:
    print "Incorrect number of arguments ("+str(len(sys.argv))+"), expecting 3"
    print "Usage: python parse_beast_xml.py [template file] [NEXUS alignment] [output stem]"
    sys.exit()

# stem should be the destination path
# unpack arguments from command line
(template_file, nexus_file, stem) = sys.argv[1:]

output_file = nexus_file.split('/')[-1].strip('.nex')  # the XML file to write
stem += output_file.split('/')[-1]  # add the filename prefix for naming log files
print 'Using stem: '+ stem

# import sequences from NEXUS file
alignment = []
for record in SeqIO.parse(open(nexus_file, 'rU'), 'nexus'):
    alignment.append([record.id, str(record.seq)])

template = Tree()
t_root = template.parse(template_file)

# reset TAXA and ALIGNMENT blocks
t_taxa = template.findall('taxa')[0]
t_taxa._children = []
t_aln = template.find('alignment')
t_aln._children = []

time_unit = 'days'
print 'WARNING: specifying time in units of '+time_unit+'!'

all_dates = []

# update blocks
for row in alignment:
    (id, seq) = row

    # TAXON
    date_val = id.split('_')[-1]
    if int(date_val) not in all_dates:
        all_dates.append(int(date_val))
    try:
        date = Node('date', {'units':time_unit, 'direction':'forwards', 'value':str(float(date_val))})
    except:
        print '********'
        print row
        raise
    taxon = Node('taxon', {'id':id})
    taxon.append(date)
    t_taxa.append(taxon)

    # SEQUENCE
    seqtag = Node('sequence', {})
    staxon = Node('taxon', {'idref':id})
    staxon.tail = '\n\t\t\t'+seq  # mimic formatting in BEAST XML
    seqtag.append(staxon)
    t_aln.append(seqtag)

if len(all_dates) < 2:
    print '********************************************'
    print 'ERROR: only one sample date in file ' + nexus_file
    print '********************************************'
    sys.exit()

date_diff = abs(all_dates[0]-all_dates[1])

# revise log file paths and prior settings
t_mcmc = template.find('mcmc')
log_elements = t_mcmc.findall('log')
for log in log_elements:
    if log.get('id') == 'fileLog':
        log.set('fileName', stem+'.log')
        break

log_tree_element = t_mcmc.find('logTree')
if (log_tree_element):
    log_tree_element.set('fileName', stem+'.trees')
else:
    print "ERROR: Failed to find tree log, exiting without write!"
    sys.exit()

# for local use
template.write(output_file+'.xml')
```
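For orientation, each row of the alignment becomes a taxon element with a nested date, plus a sequence element that references it by idref. The fragment below shows the shape of the XML this script generates; the taxon name and sequence are invented, and the indentation is schematic:

```xml
<taxon id="36155A_0_12_735">
  <date units="days" direction="forwards" value="735.0"/>
</taxon>
...
<sequence>
  <taxon idref="36155A_0_12_735"/>
  TGTACAAGACCC...
</sequence>
```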
mpibeast.py
```python
#!/usr/bin/env python
import os, sys
from subprocess import Popen, PIPE
from mpi4py import MPI

my_rank = MPI.COMM_WORLD.Get_rank()
nprocs = MPI.COMM_WORLD.Get_size()

if len(sys.argv) < 2:
    print 'python mpibeast.py [path to XML]'
    sys.exit()

path_to_xml = sys.argv[1]
if not path_to_xml.endswith('/'):
    path_to_xml += '/'

files = os.listdir(path_to_xml)
files = [f for f in files if f.endswith('.xml')]

for i in range(len(files)):
    if i % nprocs == my_rank:
        sys.stdout.write("Process %d of %d starting task %d\n" % (my_rank, nprocs, i))
        p = Popen(['java', '-jar', '/home/art/src/BEASTv1.6.1/lib/beast.jar', path_to_xml+files[i]], stdout=PIPE)
        result = p.communicate()[0].split('\n')
        sys.stdout.write("Process %d of %d completed task %d:\n%s\n" % (my_rank, nprocs, i, result[-1]))

MPI.COMM_WORLD.Barrier()
MPI.Finalize()
```
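Tasks are assigned round-robin by rank (`i % nprocs == my_rank`), so the script is meant to be launched under MPI, e.g. `mpirun -np 8 python mpibeast.py path/to/xml/` (process count and path hypothetical); each process then runs BEAST serially on its share of the XML files.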
parseTreeLog.py
```python
# open a BEAST treelog file, thin the trees inside,
# and convert into a conventional Newick format,
# write out to another file
import sys, re

if len(sys.argv) < 4:
    print 'Usage: python parseTreeLog.py [infile] [percent burnin (float)] [sample size]'
    sys.exit()

infile = open(sys.argv[1], 'rU')
lines = infile.readlines()
infile.close()

burnin = float(sys.argv[2])   # 0.5 is 50% burn-in
sampsize = int(sys.argv[3])   # how many trees to retain

outfile = open(sys.argv[1].replace('.trees', '.nwk'), 'w')

# parse the Translate block mapping integer indices to taxon labels
trBlock = False
label_d = {}
for i in range(len(lines)):
    if 'Translate' in lines[i]:
        trBlock = True
        continue
    if trBlock:
        if ';' in lines[i]:
            trBlock = False
            continue
        try:
            index, label = lines[i].strip('\n').strip(',').split()[-2:]
        except:
            print lines[i]
            raise
        label_d.update( { index:label } )

trees = [i for i in range(len(lines)) if lines[i].startswith('tree')]
ntrees = len(trees)
interval = int(round((ntrees*(1.-burnin))/sampsize))

figRegex = re.compile('\[[^]]+\]')
idxRegex = re.compile('([0-9]+):')

def idx2label (index):
    return label_d[index.groups()[0]]+':'

tally = 0
for i in range(trees[int(ntrees*burnin)], trees[-1], interval):
    tree = lines[i].split()[-1].strip(';')
    # sub out figtree annotations (branch lengths in evolutionary rates)
    tree = re.sub(figRegex, '', tree)
    # replace all indices with labels
    tree = re.sub(idxRegex, idx2label, tree)
    outfile.write(tree+';\n')
    tally += 1
    if tally == sampsize:
        break
outfile.close()
```
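As a worked example (file name hypothetical): if `run1.trees` holds 2,000 trees, then `python parseTreeLog.py run1.trees 0.5 100` skips the first 1,000 trees as burn-in, computes a thinning interval of round(2000 × 0.5 / 100) = 10, and writes 100 thinned Newick trees to `run1.nwk`.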
scanForIndels.py
""" Scan FASTA file for different indel configurations output binary character strings in FASTA format with one column for each configuration. """ import sys, re from seqUtils import * from pairAlign import * hyphy = HyPhy._THyPhy (os.getcwd(),2) g2p_ref = 'CTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHC' init_hyphy_protein_align(hyphy) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 1;", False) left_trail = re.compile('^([-]+)\w') right_trail = re.compile('\w([-]+)$') if len(sys.argv) < 3: print 'Usage: python scanForIndels.py [FASTA file] [outfile]' sys.exit() # load FASTA file contents infile = open(sys.argv[1], 'rU') fasta = convert_fasta(infile.readlines()) infile.close() # figure out how to trim sequences to V3 region data = {} # do not count prefix/suffix gaps in partial sequences regex = re.compile('[A-Z][-]+[A-Z]') for header, seq in fasta: aaseq = translate_nuc(seq,0) aligned_query, aligned_ref, score = pairwise_align (aaseq, g2p_ref, hyphy) try: left_clip = aligned_query[ : len(left_trail.findall(aligned_ref)[0])] right_clip = aligned_query[-len(right_trail.findall(aligned_ref)[0]) : ] except: print aligned_ref raise aaseq = aaseq.replace(left_clip,'').replace(right_clip,'') matches = regex.finditer(aaseq) binstr = [ '0' for i in range(len(aaseq)) ] for match in matches: left, right = match.span() for i in range(left+1, right-1): binstr[i] = '1' binstr = ''.join(binstr) if data.has_key(binstr): data[binstr].append(header) else: data.update( { binstr : [header] } ) # determine number of indel sites regex = re.compile('0[1]+0|^[1]+0|0[1]+$') regions = [] for binstr in data.iterkeys(): matches = regex.finditer(binstr) for m in matches: left, right = m.span() if m.group().startswith('0'): left += 1 if m.group().endswith('0'): right -= 1 res = range(left, right) if res not in regions: regions.append(res) # sort regions by size intermed = [ (len(r), r) for r in regions ] intermed.sort(reverse=True) regions = [ x[1] for x in intermed ] # collapse regions that overlap, longest regions get transferred first next_regions = [] for region in regions: if len(next_regions) == 0: next_regions.append(region) continue overlap = False for nr in next_regions: if len( set(region).intersection( set(nr) ) ) > 0: overlap = True break if not overlap: next_regions.append(region) regions = next_regions # generate binary sequence dictionary based on regions max_state = 0 bin_d = {} for region in regions: states = {} for binstr in data.iterkeys(): state = ''.join( [ binstr[i] for i in region ] ) count = len (data[binstr]) if not states.has_key(state): states.update ( { state : count } ) else: states[state] += count intermed = [ (v, k) for k, v in states.iteritems() ] intermed.sort (reverse=True) if len(intermed) > max_state: max_state = len(intermed) temp = dict( [ (intermed[i][1], i) for i in range(len(intermed)) ] ) bin_d.update ( { (region[0], region[-1]+1) : temp } ) if len(bin_d) == 0: sys.exit() keys = bin_d.keys() keys.sort() binstr2state = {} for binstr in data.iterkeys(): state = '' for start, end in keys: state += str( bin_d[(start, end)][binstr[start:end]] ) binstr2state.update ( { binstr : state } ) print sys.argv[1] + '\t' + binstr + '\t' + state # output file outfile = open(sys.argv[2], 'w') outfile.write('$BASESET:"012"\n') for k in data.iterkeys(): headers = data[k] state = binstr2state[k] for h in headers: outfile.write('#' + h + '\n' + state + '\n') outfile.close()
_batch_g2p_web.py
""" batch g2p score alignments generated from prelim 454 files """ import os, sys from g2pScorer import * import re import urllib, urllib2 import poster from BeautifulSoup import BeautifulSoup NSEQS = 100 poster.streaminghttp.register_openers() G2P_base = 'http://coreceptor.bioinf.mpi-inf.mpg.de/' G2P_index = G2P_base + 'index.php' path = '/Users/apoon/wip/swenson/dutch_v3/hyphy/anc/' files = os.listdir(path) v3ref = 'TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT' def do_g2p (f, of): infile = open(f, 'rU') lines = infile.readlines() infile.close() # 0,0,Node0,TGTACAAG...AGAGCACATTGT # reduce to unique sequences for efficiency # and clip out V3 region hyphy.ExecuteBF("alignOptions = {};", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_CHARACTER_MAP\"]=\"ACGT\";", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_SCORE_MATRIX\"] = "+scoreMatrixNuc, False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN\"] = 100;", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN2\"] = 100;", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND\"] = 10;", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND2\"] = 10;", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_AFFINE\"] = 1;", False) hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 1;", False) # do not penalize prefix or suffix indels d = {} done = {} last_tree = '' for line in lines: if line.startswith('treerep'): continue try: treerep, ancrep, nodename, seq = line.strip('\n').split(',') except: print line raise if treerep != last_tree: last_tree = treerep print treerep id = (treerep, ancrep, nodename) if d.has_key(seq): d[seq]['ids'].append( id ) else: d.update( { seq : {'ids':[id]} } ) print '.. clipped and compressed V3 sequences to ' + str(len(d)) + ' variants' first_headers = dict([ ('_'.join(v['ids'][0]), k) for k,v in d.iteritems() ]) res = [] # send to g2p webserver in sets of 50 for i in range(0, len(d.keys()), 50): tmpfile = open('/tmp/temp.fas', 'w') for h, s in first_headers.items()[i:(i+50)]: tmpfile.write ('>'+h+'\n'+s+'\n') tmpfile.close() # send tmp file to g2p webserver infile = open('/tmp/temp.fas', 'rb') data_to_submit = { 'identifier': 'test', 'slev_cxcr4': 3, 'uploaded_file': infile, 'selVL': 0, 'selD32': 0, 'selCD4pct': 0, 'selCD4': 0, 'selCD8': 0, 'action': 1, 'go': 'Go'} datagen, headers = poster.encode.multipart_encode(data_to_submit) g2p_req = urllib2.Request(G2P_index, datagen, headers) g2p_response = urllib2.urlopen(g2p_req) infile.close() # analyze response from server g2p_html = g2p_response.read() g2p_soup = BeautifulSoup(g2p_html) try: results = g2p_soup.findAll('table')[3] except: print g2p_soup.findAll('table')[-1] raise for row in results: try: index, header, aaseq, subtype, fpr, percentile, comment = [str(x)[4:-5] for x in row.contents] except: continue if header == 'header': continue fpr = float(fpr.strip('%')) try: # apply results to all instances of unique sequence for treerep, ancrep, nodename in d[first_headers[header]]['ids']: res.append( [ int(treerep), int(ancrep), nodename, convert2g2p(fpr), fpr ] ) except: raise res.sort() print '.. calculated g2p scores' outfile = open(of, 'w') outfile.write('treerep,ancrep,nodename,g2p,g2pFPR\n') for items in res: outfile.write(str(items[0])) for val in items[1:]: outfile.write(','+str(val)) outfile.write('\n') outfile.close()
map_g2p_shifts.py
```python
import HyPhy
import os, math

#v3ref = 'CTRPNNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHC'  # hxb2
v3ref = 'CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC'  # subtype B consensus

h = HyPhy._THyPhy(os.getcwd(), 2)
h.ExecuteBF('ACCEPT_ROOTED_TREES = 1;', False)
h.ExecuteBF('ACCEPT_BRANCH_LENGTHS = 1;', False)
h.ExecuteBF('UseModel (USE_NO_MODEL);', False)

scoreMatrixHIV25 = """\
{{7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-10,-17},\
{-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-10,-17},\
{-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-10,-17},\
{-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-10,-17},\
{-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-10,-17},\
{-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-10,-17},\
{-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-10,-17},\
{-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-10,-17},\
{-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-10,-17},\
{-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-10,-17},\
{-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-10,-17},\
{-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-10,-17},\
{-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-10,-17},\
{-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-10,-17},\
{-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-10,-17},\
{-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-10,-17},\
{1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-10,-17},\
{-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-10,-17},\
{-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-10,-17},\
{0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-10,-17},\
{-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-10,-17},\
{-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-10,-17},\
{-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,1,-17},\
{-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
"""

h.ExecuteBF("alignOptions = {};", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_CHARACTER_MAP\"]=\"ARNDCQEGHILKMFPSTWYVBZX*\";", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_SCORE_MATRIX\"] = "+scoreMatrixHIV25, False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_AFFINE\"] = 1;", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 0;", False)

def pairwise_align (hyphy, refseq, query, gapOpen=40, gapOpen2=20, gapExtend=10, gapExtend2=5, noTerminalPenalty=False):
    """
    Returns a tuple containing aligned query and reference sequences
    using Smith-Waterman algorithm.  alignOptions is a persistent
    HyPhy object initialized in init_hyphy_nuc_align.
    """
    hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN\"] = "+str(gapOpen)+";", False)
    hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN2\"] = "+str(gapOpen2)+";", False)
    hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND\"] = "+str(gapExtend)+";", False)
    hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND2\"] = "+str(gapExtend2)+";", False)
    dump = hyphy.ExecuteBF('inStr={{"'+refseq+'","'+query+'"}};', False)
    dump = hyphy.ExecuteBF('AlignSequences(aligned, inStr, alignOptions);', False)
    aligned = hyphy.ExecuteBF('return aligned;', False)
    exec "d = " + aligned.sData
    align_score = int(d['0']['0'])
    aligned_ref = d['0']['1']
    aligned_query = d['0']['2']
    return (aligned_query, aligned_ref, align_score)

def newick2dict (newick, h):
    h.ExecuteBF('Tree mytree = '+newick+';', False)
    h.ExecuteBF('tre = mytree^1;', False)
    tre = h.ExecuteBF('return tre;', False)
    dictStr = tre.sData
    exec('res = '+dictStr)
    # re-key dictionary using node names
    d = {}
    for key in res.iterkeys():
        if key == '0':
            continue
        nodename = res[key]['Name']
        d.update( { nodename : { 'depth':res[key]['Depth'], 'length':res[key]['Length'] } } )
        if res[key].has_key('Parent'):
            # nodename of this node's parent
            d[nodename].update( { 'parent': res[str(res[key]['Parent'])]['Name'] } )
        if res[key].has_key('Children'):
            d[nodename].update( { 'children': [] } )
            for child in res[key]['Children'].itervalues():
                d[nodename]['children'].append( res[str(child)]['Name'] )
    return d

def get_tips (result, node, tree):
    if tree[node].has_key('children'):
        for child in tree[node]['children']:
            result = get_tips(result, child, tree)
    else:
        try:
            result.append(node)
        except:
            print result
            raise
    return result

def mctree (trees):
    """
    given a list of phylogenetic trees as Dictionary objects,
    determine which one is the maximum credibility tree by tallying
    the frequency of every clade and then taking the product of these
    empirical probabilities for each tree.
    """
    counts = {}  # indexed by node name lists
    tiplists = [[] for i in range(len(trees))]
    for i in range(len(trees)):
        tree = trees[i]
        # starting at Node0, iterate through all internal nodes
        for node in tree.iterkeys():
            if node != 'Node0' and tree[node].has_key('children'):
                result = []
                tips = get_tips(result, node, tree)
                tips.sort()
                tips = tuple(tips)
                if not counts.has_key(tips):
                    counts.update( {tips: 0} )
                counts[tips] += 1./len(trees)
                tiplists[i].append(tips)

    creds = [0. for i in range(len(trees))]
    for i in range(len(trees)):
        tiplist = tiplists[i]
        for tips in tiplist:
            creds[i] += math.log(counts[tips])
    mc_index = creds.index(max(creds))
    return mc_index

path = '/Users/apoon/wip/swenson/dutch_v3/'
outfile = open(path+'resample/results/map_g2p_shifts_2.out', 'w')

#for patid in [str(i) for i in range(1,9)]+['X']:
for patid in ['2', '4', '6', '7', '8']:
    for compart in ['PLA']:
        prefix = 'DS'+str(patid)+'_'+compart+'_sample50'
        print prefix

        # load Newick trees
        infile = open(path+'hyphy/newick/'+prefix+'.nwk', 'rU')
        newicks = [ line.strip('\n') for line in infile.readlines() ]
        infile.close()

        # get the maximum credibility tree
        trees = [ newick2dict(n, h) for n in newicks ]
        maxcred = mctree(trees)
        data = trees[maxcred]
        print maxcred

        # assign integer indices to tips in left-to-right order in newick string
        h.ExecuteBF('Tree mytree='+newicks[maxcred]+';', False)
        h.ExecuteBF('bn=BranchName(mytree,-1);', False)
        recep = h.ExecuteBF('return bn;', False)
        exec('nodenames=['+recep.sData.strip('{}\n')+']')
        tipnames = [ nodename for nodename in nodenames if not nodename.startswith('Node') ]

        # get node heights
        for node in data.iterkeys():
            total_t = 0.
            current_node = node
            while data[current_node].has_key('children'):
                next_node = data[current_node]['children'][0]
                total_t += data[next_node]['length']
                current_node = next_node
            # extract days since January 1, 1990 from current node header
            days = int(current_node.split('_')[-1])
            end_date = days - total_t
            start_date = end_date - data[node]['length']
            data[node].update( { 'start': start_date, 'end': end_date } )

        # load g2p scores for observed sequences
        infile = open(path+'samples/'+prefix+'.g2p', 'rU')
        lines = infile.readlines()
        infile.close()
        for line in lines[1:]:
            header, aligned_query, g2p, fpr = line.strip('\n').split(',')
            data[header].update( { 'g2p':float(g2p) } )

        # load ancestral sequence info
        infile = open(path+'hyphy/g2p/'+prefix+'.anc3_rev.g2p', 'rU')
        lines = infile.readlines()
        infile.close()
        for line in lines[1:]:
            treerep, ancrep, nodename, g2p, fpr = line.strip('\n').split(',')
            if int(treerep) == maxcred:
                if not data[nodename].has_key('g2p'):
                    data[nodename].update( { 'g2p': [] } )
                data[nodename]['g2p'].append(float(g2p))
            elif int(treerep) > maxcred:
                break

        # find largest shifts in g2p
        shifts = {}
        for ar in range(100):
            shifts.update( {ar: {}} )
            for node in data.iterkeys():
                if node == 'Node0':
                    continue
                if data[node].has_key('children'):
                    parent = data[node]['parent']
                    to_g2p = data[node]['g2p'][ar]
                    from_g2p = data[parent]['g2p'][ar]
                    to_val = math.log(1./to_g2p - 1)
                    from_val = math.log(1./from_g2p - 1)
                    delta = to_val - from_val
                    if (delta < -0.01):
                        shifts[ar].update( { node: delta } )

        # use frequencies of nodes in shift lists to compute
        # maximum credibilities of replicate ancestral reconstructions
        # use sum of credibilities so that we are not penalizing
        # shift dictionaries for size
        counts = {}
        for ar in range(100):
            for node in shifts[ar].iterkeys():
                if not counts.has_key(node):
                    counts.update( { node: 0.01 } )
                else:
                    counts[node] += 0.01

        creds = []
        for ar in range(100):
            cred = 0.
            for node in shifts[ar].iterkeys():
                cred += counts[node]
            creds.append( [cred, ar] )
        creds.sort(reverse=True)  # descending order
        #print creds[:10]
        maxcc_i = creds[0][1]
        maxcc = shifts[maxcc_i]
        print maxcc_i

        # create a new dictionary object with keys from maxcc
        # and incorporating information from 'data'
        theNodes = maxcc.keys()
        theTree = {}

        # identify the most recent ancestor for each node in this list
        # that only includes other nodes in the same list or Node0
        for node in theNodes:
            itsTips = []
            itsTips = get_tips(itsTips, node, data)

            # also figure out what the most recent tip is descended from this node
            ypos = 0.
            last_days = 0
            for tip in itsTips:
                ypos += tipnames.index(tip) / float(len(tipnames))
                days = int(tip.split('_')[-1])
                if days > last_days:
                    last_days = days

            # also figure out what proportion of the sample at the most recent
            # time point is descended from this ancestor
            n_descends = 0
            for tip in itsTips:
                days = int(tip.split('_')[-1])
                if days == last_days:
                    n_descends += 1

            theTree.update( { node: {
                'delta': maxcc[node],
                'start': data[node]['start'],
                'end': data[node]['end'],
                'g2p': data[node]['g2p'][maxcc_i],
                'ypos': ypos / len(itsTips),
                'lastdate': last_days,
                'n_descends': n_descends*2} } )

            curnode = node
            while 1:
                parent = data[curnode]['parent']
                if parent == 'Node0' or parent in theNodes:
                    theTree[node].update( { 'parent': parent } )
                    break
                curnode = parent

        # determine if nodes are closest to tips
        for node in theTree.iterkeys():
            is_parent = False
            for node2 in theTree.iterkeys():
                if node == node2:
                    continue
                if theTree[node2]['parent'] == node:
                    is_parent = True
                    break
            if is_parent:
                theTree[node]['lastdate'] = 'NA'

        # add root
        theTree.update( { 'Node0': {
            'start': data['Node0']['start'],
            'end': data['Node0']['end'],
            'g2p': data['Node0']['g2p'][maxcc_i],
            'ypos': 0.5,
            'lastdate': 'NA',
            'n_descends': '100'} } )

        # load protein sequence reconstructions
        infile = open(path+'hyphy/anc/'+prefix+'.prot', 'rU')
        lines = infile.readlines()
        infile.close()
        for line in lines:
            treerep, ancrep, nodename, aaseq = line.strip('\n').split(',')
            if int(treerep) == maxcred and int(ancrep) == maxcc_i:
                if theTree.has_key(nodename):
                    aligned_query, aligned_ref, align_score = pairwise_align(h, v3ref, aaseq)
                    theTree[nodename].update( {'aaseq': aligned_query, 'refseq':aligned_ref} )
            elif int(treerep) > maxcred:
                break

        # output results
        for node in theTree.iterkeys():
            outfile.write(patid+','+compart+','+node+',')
            outfile.write(str(theTree[node]['ypos'])+',')
            outfile.write(str(theTree[node]['lastdate'])+',')
            outfile.write(str(theTree[node]['n_descends'])+',')
            outfile.write('%f,%f,' % (theTree[node]['start'], theTree[node]['end']))
            outfile.write(str(theTree[node]['g2p'])+',')
            if node == 'Node0':
                outfile.write('NA,')
            else:
                outfile.write(str(theTree[node]['delta'])+',')
            # get sequence
            outfile.write(theTree[node]['aaseq']+','+theTree[node]['refseq']+',')
            if node == 'Node0':
                outfile.write(',\n')
            else:
                parent = theTree[node]['parent']
                outfile.write(parent+','+theTree[parent]['aaseq']+'\n')

outfile.close()
```
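A note on the shift statistic: for g2p scores in (0, 1), `math.log(1./g2p - 1)` equals log((1 − g2p)/g2p), i.e., the negative logit of the score, so the condition `delta < -0.01` selects branches along which the g2p score increased from parent to child on the logit scale.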
viz_g2p_shifts.py
```python
import math
from g2pScorer import convert2fpr

last_timepts = [ 1436, 1631, 2318, 2052, 2978, 2016, 1354, 1449, 1211, 1625]
t_adjust = [ 0, 181, 182, 182, 171, 182, 184, 207, 166, 187 ]
patids = [str(i) for i in range(1,10)] + ['X']

time_of_si = {}
for i in range(10):
    time_of_si.update( {patids[i]: last_timepts[i] - t_adjust[i] } )
print time_of_si

path = '/Users/apoon/wip/swenson/dutch_v3/'
infile = open(path+'results/map_g2p_shifts_2.out', 'rU')
lines = infile.readlines()
infile.close()

data = {}
for line in lines:
    patid, compart, node, ypos, lastdate, n_descend, start, end, g2p, delta, seq, refseq, parent, parseq = line.strip('\n').split(',')
    if not data.has_key(patid):
        data.update( {patid: {}} )
    if not data[patid].has_key(compart):
        data[patid].update( {compart: {}} )
    data[patid][compart].update( {node: {
        'ypos': float(ypos),
        'lastdate': lastdate,
        'n_descend': int(n_descend),
        'start': float(start),
        'end': float(end),
        'g2p': float(g2p),
        'seq': seq,
        'refseq': refseq,
        'parent': parent,
        'parseq': parseq}} )

# do we need to rescale the x-axis?  can fit about 6 years into plot area
plot_settings = {}
for patid in data.iterkeys():
    plot_settings.update( {patid: {'min':99999, 'max':0}} )
    for compart in data[patid].iterkeys():
        starts = [ v['start'] for v in data[patid][compart].itervalues() ]
        ends = [ int(v['lastdate']) for v in data[patid][compart].itervalues() if v['lastdate'] != 'NA' ]
        span = ( int(min(starts)), int(max(ends)) )  # number of years past 1990
        if span[0] < plot_settings[patid]['min']:
            plot_settings[patid]['min'] = span[0]
        if span[1] > plot_settings[patid]['max']:
            plot_settings[patid]['max'] = span[1]
print plot_settings

compart2label = {'PLA':'plasma', 'PBM':'PBMC'}

# iterate through patients and compartments
for patid in data.iterkeys():
    for compart in data[patid].iterkeys():
        prefix = 'DS'+patid+'_'+compart
        print prefix
        outfile = open(path+'results/'+prefix+'_2.ps', 'w')
        outfile.write('%!\n%%PS file '+prefix+'\n<< /PageSize [2500 2500] >> setpagedevice\n')

        # draw patient + compartment label
        outfile.write('/Helvetica findfont\n')
        outfile.write('80 scalefont\nsetfont\n')
        outfile.write('250 2250 moveto (%s) show\n' % ('DS'+patid+' '+compart2label[compart]))

        # draw timeline
        x_scale = 2000. / ((plot_settings[patid]['max'] - plot_settings[patid]['min']))
        outfile.write('0.5 0.5 0.5 setrgbcolor\n')
        x = 250 + x_scale * (time_of_si[patid] - plot_settings[patid]['min'])
        outfile.write('%d %d moveto %d %d lineto stroke\n' % (x, 250, x, 2250))

        outfile.write('/Helvetica findfont\n')
        outfile.write('48 scalefont\nsetfont\n')

        for node in data[patid][compart].iterkeys():
            print node
            # draw points given (x,y) coordinates = (end, ypos*2000)
            x = int(250 + x_scale * (data[patid][compart][node]['end'] - plot_settings[patid]['min']))
            y = int(2000*data[patid][compart][node]['ypos']+250)

            if node != 'Node0':
                # draw Bezier curve to parent node
                parent = data[patid][compart][node]['parent']
                px = int(250 + x_scale * (data[patid][compart][parent]['end'] - plot_settings[patid]['min']))
                py = int(2000*data[patid][compart][parent]['ypos']+250)

                ##rectangular tree
                #outfile.write( '%d %d moveto %d %d lineto stroke\n' % (x, y, px, y) )
                #outfile.write( '%d %d moveto %d %d lineto stroke\n' % (px, y, px, py) )

                n_descend = data[patid][compart][node]['n_descend']
                g2p = data[patid][compart][node]['g2p']
                fpr = convert2fpr(g2p) / 100.
                #print '%s\t\t%1.3f\t%1.1f' % (node, g2p, fpr*100)

                outfile.write('%d setlinewidth\n' % (5+n_descend/3))
                outfile.write( '%1.3f %1.3f %1.3f setrgbcolor\n' % (1-math.exp(-5*g2p), 0., math.exp(-5*g2p)) )
                if y > py:
                    outfile.write( '%d %d moveto %d %d %d %d %d %d curveto stroke\n' % (x, y, px, y, px, y, px, py) )
                else:
                    outfile.write( '%d %d moveto %d %d %d %d %d %d curveto stroke\n' % (x, y, px, y, px, y, px, py) )

                lastdate = data[patid][compart][node]['lastdate']
                if lastdate != 'NA':
                    lastdate = 250 + x_scale * (int(lastdate) - plot_settings[patid]['min'])
                    outfile.write('0.8 0.8 0.8 setrgbcolor\n')
                    outfile.write('%d %d moveto %d %d lineto stroke\n' % (x, y, lastdate, y))
                    outfile.write('0.5 0.5 0.5 setrgbcolor\n')
                    outfile.write('%d %d moveto (%s) show\n' % (lastdate+5, y-10, str(n_descend)+'%'))

                # determine substitutions
                seq = data[patid][compart][node]['seq']
                refseq = data[patid][compart][node]['refseq']
                parseq = data[patid][compart][node]['parseq']

                coords = []
                index = 1
                for refchar in refseq:
                    if refchar == '-':
                        coords.append('\u2207'+str(index))
                    else:
                        coords.append(str(index))
                        index += 1

                outfile.write( '0 0 0 setrgbcolor\n' )
                nsubs = 0
                for i in range(len(seq)):
                    if seq[i] != parseq[i]:
                        if y > py:
                            outfile.write('%d %d moveto (%s) show\n' % (x-120, y+10+nsubs*50, parseq[i]+coords[i]+seq[i]))
                        else:
                            outfile.write('%d %d moveto (%s) show\n' % (x-120, y-50-nsubs*50, parseq[i]+coords[i]+seq[i]))
                        nsubs += 1

            outfile.write( '0 0 0 setrgbcolor\n' )
            outfile.write( '%d %d 10 0 360 arc fill\n' % (x, y) )

        # draw time line
        outfile.write( '0 0 0 setrgbcolor\n' )
        outfile.write('5 setlinewidth\n')
        outfile.write('250 200 moveto 2250 200 lineto stroke\n')
        outfile.write('/Helvetica findfont\n')
        outfile.write('56 scalefont\nsetfont\n')
        for i in range(int(plot_settings[patid]['min']/365.25), int(plot_settings[patid]['max']/365.25)+1):
            xcoord = 250 + x_scale * (i*365.25 - plot_settings[patid]['min'])
            if xcoord > 0:
                outfile.write('%d 150 moveto (%s) show\n' % ( xcoord, str(1990+i) ) )

        # draw legend
        refpts = [1., 0.351901, 0.156598, 0.105784, 0.059826, 0]
        fprs = ['0', '3.5', '10', '20', '50', '100']
        outfile.write('/Helvetica findfont\n')
        outfile.write('42 scalefont\nsetfont\n')
        for i in range(len(refpts)):
            outfile.write( '%1.3f %1.3f %1.3f setrgbcolor\n' % (1-math.exp(-5*refpts[i]), 0., math.exp(-5*refpts[i])) )
            outfile.write('newpath\n%d %d moveto\n' % (100, 300+75*i))
            outfile.write('%d %d lineto\n' % (200, 300+75*i))
            outfile.write('%d %d lineto\n' % (200, 375+75*i))
            outfile.write('%d %d lineto\n' % (100, 375+75*i))
            outfile.write('%d %d lineto\n' % (100, 300+75*i))
            outfile.write('closepath\nfill\n')
            outfile.write('1 1 1 setrgbcolor\n')
            outfile.write('%d %d moveto (%s) show\n' % (115, 325+75*i, fprs[i]))

        outfile.write('showpage\n')
        outfile.close()
```
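The output is plain PostScript on a 2500 × 2500 point page, one file per patient and compartment; the files can be opened in any PostScript viewer or converted with a standard tool such as `ps2pdf`.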