Emerge


This wiki page provides the HyPhy scripts that were used in a phylogenetic analysis of deep sequence data, presented in a manuscript submitted to PLoS Comput Biol. Note that many of the Python scripts depend on the HyPhy shared library.


Workflow

(Figure: flowchart summarizing the role of the various scripts in data processing and visualization; also provided as Supporting Figure S4 of the manuscript.)

HyPhy batch files

emelineV2.5.bf
    Screens sequences for indel errors by pairwise alignment to a reference.
    Dependencies: Utility/GrabBag.bf, Utility/DescriptiveStats.bf, TemplateModels/chooseGeneticCode.def
    Link: File:EmelineV2.5.bf

batchAlignMPI.bf
    MPI wrapper for parallel execution of emelineV2.5.bf on a computing cluster.
    Dependencies: MPI environment
    Link: File:BatchAlignMPI.bf

convertToNexus.bf
    Batch conversion of sequence files to NEXUS format, after censoring stop codons with gap characters.
    Dependencies: CleanStopCodons.bf
    Link: File:ConvertToNexus.bf

fit_codon_model.bf
    Fits a codon substitution model to replicate trees and samples ancestral sequences from the posterior distribution.
    Dependencies: fit_codon_model.ibf, TemplateModels/chooseGeneticCode.def
    Link: File:Fit codon model.bf

fit_codon_model.ibf
    Support functions for fit_codon_model.bf.
    Link: File:Fit codon model.ibf.txt

_mpiFitCodon.bf
    MPI wrapper for fit_codon_model.bf.
    Dependencies: MPI environment
    Link: File:MpiFitCodon.bf

fit_indels_v4.bf
    Fits a reversible model of indel evolution to integer-valued sequence data encoding indel polymorphisms.
    Link: File:Fit indels v4.bf

map_in_time.bf
    Matches geno2pheno predictions based on ancestral sequence reconstructions to internal nodes of replicate trees.
    Dependencies: Utility/ReadDelimitedFiles.bf
    Link: File:Map in time.bf

FindMRCAofX4.bf
    Identifies the internal node corresponding to the most recent common ancestor (MRCA) of all observed CXCR4-using sequences (i.e., those with predicted g2p FPR < 3.5).
    Dependencies: Utility/ReadDelimitedFiles.bf, Utility/GrabBag.bf, TreeTools.ibf
    Link: File:FindMRCAofX4.bf
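
These batch files are run with the standard HyPhy executables; for example (thread and process counts hypothetical), the screening step and its MPI wrapper might be launched as:

HYPHYMP emelineV2.5.bf
mpirun -np 16 HYPHYMPI batchAlignMPI.bf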


Python scripts

Please note that these scripts are written for Python 2 and were not designed to be portable: many file paths will be broken when they are executed on another computer. In addition, many of these scripts depend on third-party packages such as mpi4py, or on the HyPhy shared library.
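
For orientation, the scripts below all drive HyPhy through the same shared-library pattern: a _THyPhy instance is created once, batch-language statements are pushed to it with ExecuteBF, and results come back as strings through the sData attribute of the returned object. A minimal sketch of that pattern (the arithmetic statement is only a placeholder):

import os
import HyPhy

# create an instance rooted at the current directory, using 2 threads
hyphy = HyPhy._THyPhy (os.getcwd(), 2)

# execute a HyPhy batch-language statement;
# the second argument suppresses console echo
hyphy.ExecuteBF ('ACCEPT_ROOTED_TREES = 1;', False)

# retrieve a value computed inside HyPhy as a string
result = hyphy.ExecuteBF ('return 6*7;', False)
print result.sData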

merge_alignments.py

"""
Gather CSV output from emeline, merge identical sequences and
organize by patient and time point in a format that is ready
for going into BEAST -- i.e., time stamp in sequence header.
"""

import subprocess
from csv import reader
import sys, os

# load link info
infile = open('/Users/apoon/wip/swenson/dutch_v3/info/enum_link.csv', 'rU')
lines = infile.readlines()
infile.close()

# enum,studyID,patientNum,timeToSI,specimen,days.since
# 36155A,DS1,13940,-23.05,PLASMA,735
days_d = {}
for line in lines:
	enum, study_id, patnum, time_to_si, specimen, days_since = line.strip('\n').split(',')
	days_d.update ( { (study_id, time_to_si, specimen) : days_since } )


# get patient information from Luke's spreadsheet
lines = reader(open('/Users/apoon/wip/swenson/dutch_v3/info/454 - complete 454 results_repaired.csv', 'rU'))
header = lines.next()

d = {}
for subj_id, patid, enum, avgTtoSI, samptype, x4_count, total_count, percent_x4, top_fpr in lines:
	d.update ( { enum : {'subject':subj_id, 'is_plasma':(samptype in ['PLASMA','PLA']), 'avgTtoSI':avgTtoSI} } )

# use UNIX find to generate list of paths to files
root = '/Users/apoon/wip/swenson/dutch_v3/'

p = subprocess.Popen('find ' + root + 'prelim -name "*.prealign.fas.csv"', shell=True, stdout=subprocess.PIPE)
paths = [path.strip('\n') for path in p.stdout.readlines()]

for path in paths:
	enum = path.split('/')[-1].split('_')[0]
	subj_id = d[enum]['subject']
	is_plasma = d[enum]['is_plasma']
	timing = d[enum]['avgTtoSI']
	
	days_since = days_d[ ( subj_id, timing, 'PLASMA' if is_plasma else 'PBMC' ) ]
	
	newpath = root+'merged/'+subj_id+'_'+('PLA' if is_plasma else 'PBM')+'.fasta'
	outfile = open(newpath, 'a')	# append mode creates the file if it does not exist
	
	infile = open(path, 'rU')
	lines = infile.readlines()
	infile.close()
	
	# compress alignment (merge reads that are identical after screening for errors with emeline)
	temp = {}
	for line in lines:
		name, orig, score, num_shifts, aligned_ref, aligned_query, aligned_seq = line.strip('\n').split(',')
		if int(score) < 200:
			#print aligned_seq
			continue
		count = int(name.split()[1])
		if aligned_seq not in temp:
			temp.update ( {aligned_seq : count} )
		else:
			temp[aligned_seq] += count
	
	intermed = [ (v, k) for k, v in temp.iteritems() ]
	intermed.sort(reverse=True)
	index = 0
	
	for count, seq in intermed:
		outfile.write('>' + enum + '_' + str(index) + '_' + str(count) + '_' + days_since + '\n')
		outfile.write(seq+'\n')
		index += 1
		
	outfile.close()
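
Each record in the merged files encodes the sample enum, a rank index, the read count of the unique variant, and the sampling time in days (interpreted elsewhere on this page as days since January 1, 1990) in its header. A hypothetical record, reusing the enum and days.since values from the comment in the script (count and sequence invented, sequence truncated):

>36155A_0_127_735
TGTACAAGACCCAACAAC...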


randomsampler.py

"""
Sample 50 sequences in proportion to their counts in longitudinal 454 data sets.
"""
import os, random
from seqUtils import *

sample_size = 50

path = '/Users/apoon/wip/swenson/dutch_v3/merged/'
files = os.listdir(path)

for f in files:
	if f.endswith('.fasta'):
		infile = open(path+f, 'rU')
		fasta = convert_fasta(infile.readlines())
		infile.close()
		
		seqs = {}
		counts = {}
		for h, s in fasta:
			enum, index, count, days_since = h.split('_')
			
			# generate dictionaries of sequences keyed by indices, stratified by "days since"
			if not seqs.has_key(days_since): seqs.update ( { days_since : { } } )
			seqs[days_since].update ( { index : (h,s) } )
			
			# also make lists of indices replicated by their respective counts
			if not counts.has_key(days_since): counts.update ( { days_since : [] } )
			counts[days_since] += [index] * int(count)
			
		
		outfile = open (path.replace('merged', 'samples')+f.split('.')[0]+'_sample'+str(sample_size) + 'a.fasta', 'w')
		
		for days_since in counts.iterkeys():
			indices = counts[days_since]
			# draw with replacement; each index appears in proportion to its read count
			sample = [ random.choice(indices) for i in range(sample_size) ]
			new_index = 0
			for ix in sample:
				h, s = seqs[days_since][ix]
				outfile.write ('>' + str(new_index) + '_' + h + '\n' + s + '\n')
				new_index += 1
		
		outfile.close()
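
The sampling trick above, replicating each index by its read count and then drawing uniformly, yields draws with replacement weighted by count. A self-contained sketch with toy counts:

import random

counts = {'A': 3, 'B': 1}	# toy read counts
pool = []
for variant, count in counts.items():
	pool += [variant] * count	# 'A' now appears three times as often as 'B'

draws = [ random.choice(pool) for i in range(10) ]	# weighted draws with replacement
print draws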


parse_beast_xml.py

from xml.etree.ElementTree import ElementTree as Tree
from xml.etree.ElementTree import Element as Node
from Bio import SeqIO
import sys
import os

# _________________________________________________________________
if len(sys.argv) != 4:
	print "Incorrect number of arguments ("+str(len(sys.argv))+"), expecting 3"
	print "Usage: python parse_beast_xml.py [template file] [NEXUS alignment] [output stem]"
	sys.exit()

# stem should be the destination path

# unpack arguments from command line
(template_file, nexus_file, stem) = sys.argv[1:]

output_file = nexus_file.split('/')[-1].replace('.nex', '')	# basename of the XML file to write
stem += output_file	# add the filename prefix for naming log files
print 'Using stem: '+ stem

# import sequences from NEXUS file
alignment = []
for record in SeqIO.parse(open(nexus_file, 'rU'), 'nexus'):
	alignment.append([record.id, str(record.seq)])

template = Tree()
t_root = template.parse(template_file)

# reset TAXA and ALIGNMENT blocks
# (emptying _children relies on a private ElementTree attribute)
t_taxa = template.findall('taxa')[0]
t_taxa._children = []

t_aln = template.find('alignment')
t_aln._children = []

time_unit = 'days'
print 'WARNING: specifying time in units of '+time_unit+'!'

all_dates = []

# update blocks
for row in alignment:
	(id,seq) = row
	
	# TAXON
	date_val = id.split('_')[-1]
	if int(date_val) not in all_dates:
		all_dates.append(int(date_val))
	try:
		date = Node('date', {'units':time_unit, 'direction':'forwards', 'value':str(float(date_val))})
	except:
		print '********'
		print row
		raise
		
	taxon = Node('taxon', {'id':id})
	taxon.append(date)
	t_taxa.append(taxon)
	
	# SEQUENCE
	seqtag = Node('sequence', {})
	staxon = Node('taxon', {'idref':id})
	staxon.tail = '\n\t\t\t'+seq	# mimic formatting in BEAST XML
	seqtag.append(staxon)
	t_aln.append(seqtag)

if len(all_dates) < 2:
	print '********************************************'
	print 'ERROR: only one sample date in file ' + nexus_file
	print '********************************************'
	sys.exit()

date_diff = abs(all_dates[0]-all_dates[1])

# revise log file paths and prior settings
t_mcmc = template.find('mcmc')
log_elements = t_mcmc.findall('log')
for log in log_elements:
	if log.get('id') == 'fileLog':
		log.set('fileName', stem+'.log')
		break

log_tree_element = t_mcmc.find('logTree')

# NB: an Element with no children evaluates as false, so test against None
if log_tree_element is not None:
	log_tree_element.set('fileName', stem+'.trees')
else:
	print "ERROR: Failed to find tree log, exiting without write!"
	sys.exit()

# for local use
template.write(output_file+'.xml')
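
A typical invocation, matching the usage string above (file names hypothetical):

python parse_beast_xml.py template.xml DS1_PLA_sample50a.nex beast/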


mpibeast.py

#!/usr/bin/env python
import os, sys
from subprocess import Popen, PIPE

from mpi4py import MPI
my_rank = MPI.COMM_WORLD.Get_rank()
nprocs = MPI.COMM_WORLD.Get_size()

if len(sys.argv) < 2:
	print 'python mpibeast.py [path to XML]'
	sys.exit()

path_to_xml = sys.argv[1]
if not path_to_xml.endswith('/'):
	path_to_xml += '/'

files = os.listdir(path_to_xml)
files = [f for f in files if f.endswith('.xml')]


for i in range(len(files)):
	if i%nprocs == my_rank:
		sys.stdout.write("Process %d of %d starting task %d\n"%(my_rank, nprocs, i))
		p = Popen(['java', '-jar', '/home/art/src/BEASTv1.6.1/lib/beast.jar', path_to_xml+files[i]], stdout=PIPE)
		result = p.communicate()[0].split('\n')
		sys.stdout.write("Process %d of %d completed task %d:\n%s\n"%(my_rank, nprocs, i, result[-1]))

MPI.COMM_WORLD.Barrier()
MPI.Finalize()
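
Each MPI rank claims every nprocs-th XML file, so the script is launched under mpirun; for example (process count and path hypothetical):

mpirun -np 8 python mpibeast.py beast/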


parseTreeLog.py

# open a BEAST treelog file, thin the trees inside,
# and convert into a conventional Newick format,
# write out to another file

import sys, re

if len(sys.argv) < 4:
	print 'Usage: python parseTreeLog.py [infile] [percent burnin (float)] [sample size]'
	sys.exit()

infile = open(sys.argv[1], 'rU')
lines = infile.readlines()
infile.close()

burnin = float(sys.argv[2])  # 0.5 is 50% burn-in
sampsize = int(sys.argv[3])	# how many trees to retain

outfile = open(sys.argv[1].replace('.trees', '.nwk'), 'w')

trBlock = False
label_d = {}
for i in range(len(lines)):
	if 'Translate' in lines[i]:
		trBlock = True
		continue
	
	if trBlock:
		if ';' in lines[i]:
			trBlock = False
			continue
			
		try:
			index, label = lines[i].strip('\n').strip(',').split()[-2:]
		except:
			print lines[i]
			raise
		
		label_d.update( { index:label } )

trees = [i for i in range(len(lines)) if lines[i].startswith('tree')]
ntrees = len(trees)

# guard against a zero step when few trees remain after burn-in
interval = max(1, int(round((ntrees*(1.-burnin))/sampsize)))

figRegex = re.compile(r'\[[^]]+\]')
idxRegex = re.compile(r'([0-9]+):')

def idx2label (match):
	# map a numeric taxon index from the Translate block back to its label
	return label_d[match.groups()[0]]+':'

tally = 0
for i in range(trees[int(ntrees*burnin)], trees[-1], interval):
	tree = lines[i].split()[-1].strip(';')
	
	# sub out figtree annotations (branch lengths in evolutionary rates)
	tree = re.sub(figRegex, '', tree)
	
	# replace all indices with labels
	tree = re.sub(idxRegex, idx2label, tree)
	
	outfile.write(tree+';\n')
	tally += 1
	if tally == sampsize:
		break

outfile.close()
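
For example, to keep 100 trees after discarding the first half of the chain (file name hypothetical):

python parseTreeLog.py DS1_PLA_sample50a.trees 0.5 100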


scanForIndels.py

"""
Scan FASTA file for different indel configurations
output binary character strings in FASTA format 
with one column for each configuration.
"""

import os, sys, re
from seqUtils import *
from pairAlign import *	# expected to supply the HyPhy module and init_hyphy_protein_align

hyphy = HyPhy._THyPhy (os.getcwd(), 2)

g2p_ref = 'CTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHC'
init_hyphy_protein_align(hyphy)
hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 1;", False)

left_trail = re.compile('^([-]+)\w')
right_trail = re.compile('\w([-]+)$')

if len(sys.argv) < 3:
	print 'Usage: python scanForIndels.py [FASTA file] [outfile]'
	sys.exit()

# load FASTA file contents
infile = open(sys.argv[1], 'rU')
fasta = convert_fasta(infile.readlines())
infile.close()


# figure out how to trim sequences to V3 region

data = {}

# do not count prefix/suffix gaps in partial sequences
regex = re.compile('[A-Z][-]+[A-Z]')

for header, seq in fasta:
	aaseq = translate_nuc(seq,0)
	
	aligned_query, aligned_ref, score = pairwise_align (aaseq, g2p_ref, hyphy)
	
	try:
		left_clip = aligned_query[ : len(left_trail.findall(aligned_ref)[0])]
		right_clip = aligned_query[-len(right_trail.findall(aligned_ref)[0]) : ]
	except:
		print aligned_ref
		raise
	
	aaseq = aaseq.replace(left_clip,'').replace(right_clip,'')
	
	matches = regex.finditer(aaseq)
	binstr = [ '0' for i in range(len(aaseq)) ]
	for match in matches:
		left, right = match.span()
		for i in range(left+1, right-1):
			binstr[i] = '1'
	
	binstr = ''.join(binstr)
	
	if data.has_key(binstr):  data[binstr].append(header)
	else:  data.update( { binstr : [header] } )


# determine number of indel sites 
regex = re.compile('0[1]+0|^[1]+0|0[1]+$')
regions = []

for binstr in data.iterkeys():
	matches = regex.finditer(binstr)
	for m in matches:
		left, right = m.span()
		if m.group().startswith('0'):
			left += 1
		if m.group().endswith('0'):
			right -= 1
		res = range(left, right)
		if res not in regions:
			regions.append(res)


# sort regions by size
intermed = [ (len(r), r) for r in regions ]
intermed.sort(reverse=True)
regions = [ x[1] for x in intermed ]


# collapse regions that overlap, longest regions get transferred first
next_regions = []
for region in regions:
	if len(next_regions) == 0:
		next_regions.append(region)
		continue
	
	overlap = False
	for nr in next_regions:
		if len( set(region).intersection( set(nr) ) ) > 0:
			overlap = True
			break
	
	if not overlap:
		next_regions.append(region)

regions = next_regions


# generate binary sequence dictionary based on regions
max_state = 0
bin_d = {}
for region in regions:
	states = {}
	for binstr in data.iterkeys():
		state = ''.join( [ binstr[i] for i in region ] )
		count = len (data[binstr])
		if not states.has_key(state):
			states.update ( { state : count } )
		else:
			states[state] += count
	
	intermed = [ (v, k) for k, v in states.iteritems() ]
	intermed.sort (reverse=True)
	if len(intermed) > max_state:
		max_state = len(intermed)
	
	temp = dict( [ (intermed[i][1], i) for i in range(len(intermed)) ] )
	bin_d.update ( { (region[0], region[-1]+1) : temp } )


if len(bin_d) == 0:
	sys.exit()

keys = bin_d.keys()
keys.sort()


binstr2state = {}

for binstr in data.iterkeys():
	state = ''
	for start, end in keys:
		state += str( bin_d[(start, end)][binstr[start:end]] )
		
	binstr2state.update ( { binstr : state } )
	print sys.argv[1] + '\t' + binstr + '\t' + state


# output file
outfile = open(sys.argv[2], 'w')
outfile.write('$BASESET:"012"\n')	# NB: assumes at most three states per indel region

for k in data.iterkeys():
	headers = data[k]
	state = binstr2state[k]
	for h in headers:
		outfile.write('#' + h + '\n' + state + '\n')

outfile.close()
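
The result is a character data file in which each column corresponds to one indel region and each digit gives the rank (by frequency) of that region's configuration. A hypothetical two-region output:

$BASESET:"012"
#36155A_0_127_735
01
#36155A_1_43_735
00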


_batch_g2p_web.py

"""
batch g2p score alignments generated from prelim 454 files
"""

import os, sys
from g2pScorer import *	# expected to supply hyphy, scoreMatrixNuc and convert2g2p

import re
import urllib2
import poster
from BeautifulSoup import BeautifulSoup

NSEQS = 100

poster.streaminghttp.register_openers()

G2P_base = 'http://coreceptor.bioinf.mpi-inf.mpg.de/'
G2P_index = G2P_base + 'index.php'


path = '/Users/apoon/wip/swenson/dutch_v3/hyphy/anc/'

files = os.listdir(path)

v3ref = 'TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT'

def do_g2p (f, of):
	infile = open(f, 'rU')
	lines = infile.readlines()
	infile.close()
	
	# 0,0,Node0,TGTACAAG...AGAGCACATTGT
	
	
	# reduce to unique sequences for efficiency
	# and clip out V3 region
	
	hyphy.ExecuteBF("alignOptions = {};", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_CHARACTER_MAP\"]=\"ACGT\";", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_SCORE_MATRIX\"] = "+scoreMatrixNuc, False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN\"] = 100;", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN2\"] = 100;", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND\"] = 10;", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND2\"] = 10;", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_AFFINE\"] = 1;", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 1;", False) # do not penalize prefix or suffix indels
	
	d = {}
	done = {}
	last_tree = ''
	
	for line in lines:
		if line.startswith('treerep'):
			continue
		
		try:
			treerep, ancrep, nodename, seq = line.strip('\n').split(',')
		except:
			print line
			raise
		
		if treerep != last_tree:
			last_tree = treerep
			print treerep
		
		id = (treerep, ancrep, nodename)
		
		if d.has_key(seq):
			d[seq]['ids'].append( id )
		else:
			d.update( { seq : {'ids':[id]} } )
	
	print '.. clipped and compressed V3 sequences to ' + str(len(d)) + ' variants'
	
	
	first_headers = dict([ ('_'.join(v['ids'][0]), k) for k,v in d.iteritems() ])
	res = []
	
	# send to g2p webserver in sets of 50
	for i in range(0, len(d.keys()), 50):
		tmpfile = open('/tmp/temp.fas', 'w')
		for h, s in first_headers.items()[i:(i+50)]:
			tmpfile.write ('>'+h+'\n'+s+'\n')
		
		tmpfile.close()
			
		# send tmp file to g2p webserver
		infile = open('/tmp/temp.fas', 'rb')
		
		data_to_submit = {	'identifier': 'test',
							'slev_cxcr4': 3,
							'uploaded_file': infile, 
							'selVL': 0, 'selD32': 0, 'selCD4pct': 0, 'selCD4': 0, 'selCD8': 0,
							'action': 1,
							'go': 'Go'}
		
		datagen, headers = poster.encode.multipart_encode(data_to_submit)
		g2p_req = urllib2.Request(G2P_index, datagen, headers)
		g2p_response = urllib2.urlopen(g2p_req)
		
		infile.close()
		
		# analyze response from server
		g2p_html = g2p_response.read()
		g2p_soup = BeautifulSoup(g2p_html)
		
		try:
			results = g2p_soup.findAll('table')[3]
		except:
			print g2p_soup.findAll('table')[-1]
			raise
		
		for row in results:
			try:
				index, header, aaseq, subtype, fpr, percentile, comment = [str(x)[4:-5] for x in row.contents]
			except:
				continue
			
			if header == 'header':
				continue
			
			fpr = float(fpr.strip('%'))
			try:
				# apply results to all instances of unique sequence
				for treerep, ancrep, nodename in d[first_headers[header]]['ids']:
					res.append( [ int(treerep), int(ancrep), nodename, convert2g2p(fpr), fpr ] )
			except:
				raise
	
	res.sort()
	
	print '.. calculated g2p scores'
	
	outfile = open(of, 'w')
	outfile.write('treerep,ancrep,nodename,g2p,g2pFPR\n')
	
	for items in res:
		outfile.write(str(items[0]))
		for val in items[1:]:
			outfile.write(','+str(val))
		outfile.write('\n')
		
	outfile.close()
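
# NB: the driver loop that applies do_g2p to each file is not shown in
# this listing; a hypothetical reconstruction, assuming the ancestor
# files carry the .anc3_rev suffix used elsewhere on this page:
#
#   for f in files:
#       if f.endswith('.anc3_rev'):
#           do_g2p (path + f, path.replace('/anc/', '/g2p/') + f + '.g2p')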

map_g2p_shifts.py

import HyPhy

import os, math

#v3ref = 'CTRPNNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHC' # hxb2
v3ref =  'CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC' # subtype B consensus

h = HyPhy._THyPhy (os.getcwd(), 2)

h.ExecuteBF('ACCEPT_ROOTED_TREES = 1;', False)
h.ExecuteBF('ACCEPT_BRANCH_LENGTHS = 1;', False)
h.ExecuteBF('UseModel (USE_NO_MODEL);', False)


scoreMatrixHIV25 = """\
{{7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-10,-17},\
{-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-10,-17},\
{-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-10,-17},\
{-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-10,-17},\
{-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-10,-17},\
{-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-10,-17},\
{-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-10,-17},\
{-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-10,-17},\
{-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-10,-17},\
{-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-10,-17},\
{-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-10,-17},\
{-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-10,-17},\
{-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-10,-17},\
{-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-10,-17},\
{-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-10,-17},\
{-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-10,-17},\
{1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-10,-17},\
{-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-10,-17},\
{-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-10,-17},\
{0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-10,-17},\
{-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-10,-17},\
{-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-10,-17},\
{-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10,1,-17},\
{-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}};
"""

h.ExecuteBF("alignOptions = {};", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_CHARACTER_MAP\"]=\"ARNDCQEGHILKMFPSTWYVBZX*\";", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_SCORE_MATRIX\"] = "+scoreMatrixHIV25, False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_AFFINE\"] = 1;", False)
h.ExecuteBF("alignOptions [\"SEQ_ALIGN_NO_TP\"] = 0;", False)


def pairwise_align (hyphy, refseq, query, gapOpen=40, gapOpen2=20, gapExtend=10, gapExtend2=5, noTerminalPenalty = False):
	"""
	Returns a tuple containing aligned query and reference sequences,
	plus the alignment score, from a Smith-Waterman-style alignment.
	alignOptions is a persistent HyPhy object initialized at the top of this script.
	"""
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN\"] = "+str(gapOpen)+";", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_OPEN2\"] = "+str(gapOpen2)+";", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND\"] = "+str(gapExtend)+";", False)
	hyphy.ExecuteBF("alignOptions [\"SEQ_ALIGN_GAP_EXTEND2\"] = "+str(gapExtend2)+";", False)
	
	dump = hyphy.ExecuteBF ('inStr={{"'+refseq+'","'+query+'"}};', False)
	dump = hyphy.ExecuteBF ('AlignSequences(aligned, inStr, alignOptions);', False)
	aligned = hyphy.ExecuteBF ('return aligned;', False)
	exec "d = " + aligned.sData
	
	align_score = int(d['0']['0'])
	aligned_ref = d['0']['1']
	aligned_query = d['0']['2']
	
	return (aligned_query, aligned_ref, align_score)


def newick2dict (newick, h):
	h.ExecuteBF('Tree mytree = '+newick+';', False)
	h.ExecuteBF('tre = mytree^1;', False)
	tre = h.ExecuteBF('return tre;', False)
	dictStr = tre.sData
	exec('res = '+dictStr)
	# re-key dictionary using node names
	d = {}
	for key in res.iterkeys():
		if key == '0':
			continue
		nodename = res[key]['Name']
		d.update ( { nodename : {	'depth':res[key]['Depth'],
									'length':res[key]['Length']
								} } )
		
		if res[key].has_key('Parent'):
			# nodename of this node's parent
			d[nodename].update ( { 'parent': res[str(res[key]['Parent'])]['Name'] } )
		
		if res[key].has_key('Children'):
			d[nodename].update ( { 'children': [] } )
			for child in res[key]['Children'].itervalues():
					d[nodename]['children'].append ( res[str(child)]['Name'] )
			
	return d


def get_tips (result, node, tree):
	if tree[node].has_key('children'):
		for child in tree[node]['children']:
			result = get_tips(result, child, tree)
	else:
		try:
			result.append(node)
		except:
			print result
			raise
	return result


def mctree (trees):
	"""
	given a list of phylogenetic trees as Dictionary objects,
	determine which one is the maximum credibility tree by
	tallying the frequency of every clade and then taking the
	product of these empirical probabilities for each tree.
	"""
	
	counts = {} # indexed by node name lists
	tiplists = [[] for i in range(len(trees))]
	
	for i in range(len(trees)):
		tree = trees[i]
		# starting at Node0, iterate through all internal nodes
		for node in tree.iterkeys():
			if node != 'Node0' and tree[node].has_key('children'):
				result = []
				tips = get_tips(result, node, tree)
				tips.sort()
				tips = tuple(tips)
				if not counts.has_key(tips):
						counts.update ( {tips: 0} )
				counts[tips] += 1./len(trees)
				tiplists[i].append(tips)
	
	creds = [0. for i in range(len(trees))]
	for i in range(len(trees)):
		tiplist = tiplists[i]
		for tips in tiplist:
			creds[i] += math.log(counts[tips])
	
	mc_index = creds.index(max(creds))
	
	return mc_index
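
# A quick sanity check of the clade-frequency logic above (hypothetical
# tip names; the majority topology should be chosen):
#
#   t1 = { 'Node0': {'children': ['Node1', 'C_1']},
#          'Node1': {'children': ['A_1', 'B_1']},
#          'A_1': {}, 'B_1': {}, 'C_1': {} }
#   t3 = { 'Node0': {'children': ['Node1', 'A_1']},
#          'Node1': {'children': ['B_1', 'C_1']},
#          'A_1': {}, 'B_1': {}, 'C_1': {} }
#   mctree([t1, t1, t3])  # returns 0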


path = '/Users/apoon/wip/swenson/dutch_v3/'

outfile = open(path+'resample/results/map_g2p_shifts_2.out', 'w')

#for patid in [str(i) for i in range(1,9)]+['X']:
for patid in ['2', '4', '6', '7', '8']:
	for compart in ['PLA']:
		prefix = 'DS'+str(patid)+'_'+compart+'_sample50'
		print prefix
		
		# load Newick trees
		infile = open(path+'hyphy/newick/'+prefix+'.nwk', 'rU')
		newicks = [ line.strip('\n') for line in infile.readlines() ]
		infile.close()
		
		# get the maximum credibility tree
		trees = [ newick2dict(n,h) for n in newicks ]
		maxcred = mctree(trees)
		data = trees[maxcred]
		
		print maxcred
		
		# assign integer indices to tips in left-to-right order in newick string
		h.ExecuteBF ('Tree mytree='+newicks[maxcred]+';', False)
		h.ExecuteBF ('bn=BranchName(mytree,-1);', False)
		recep = h.ExecuteBF ('return bn;', False)
		exec('nodenames=['+recep.sData.strip('{}\n')+']')
		tipnames = [ nodename for nodename in nodenames if not nodename.startswith('Node') ]
		
		# get node heights		
		for node in data.iterkeys():
			total_t = 0.
			current_node = node
			while data[current_node].has_key('children'):
				next_node = data[current_node]['children'][0]
				total_t += data[next_node]['length']
				current_node = next_node
			
			# extract days since January 1, 1990 from current node header
			days = int(current_node.split('_')[-1])
			end_date = days - total_t
			start_date = end_date - data[node]['length']
			data[node].update ( { 'start': start_date,
										'end': end_date } )
				
		
		
		# load g2p scores for observed sequences
		infile = open(path+'samples/'+prefix+'.g2p', 'rU')
		lines = infile.readlines()
		infile.close()
		
		for line in lines[1:]:
			header, aligned_query, g2p, fpr = line.strip('\n').split(',')
			data[header].update ( { 'g2p':float(g2p) } )
		
		
		# load ancestral sequence info
		infile = open(path+'hyphy/g2p/'+prefix+'.anc3_rev.g2p', 'rU')
		lines = infile.readlines()
		infile.close()
		
		for line in lines[1:]:
			treerep, ancrep, nodename, g2p, fpr = line.strip('\n').split(',')
			if int(treerep) == maxcred:
				if not data[nodename].has_key('g2p'):
					data[nodename].update ( { 'g2p': [] } )		
				data[nodename]['g2p'].append(float(g2p))
			elif int(treerep) > maxcred:
				break
		
		
		# find largest shifts in g2p
		shifts = {}		
		for ar in range(100):
			shifts.update ( {ar: {}} )
			for node in data.iterkeys():
				if node == 'Node0':
					continue
				if data[node].has_key('children'):
					parent = data[node]['parent']
					to_g2p = data[node]['g2p'][ar]
					from_g2p = data[parent]['g2p'][ar]
					
					# logit-transform the g2p scores so that shifts are measured on an unbounded scale
					to_val = math.log (1./to_g2p - 1)
					from_val = math.log (1./from_g2p - 1)
					
					delta = to_val - from_val
					if (delta < -0.01):
						shifts[ar].update ( { node: delta } )
		
		# use frequencies of nodes in shift lists to compute 
		# maximum credibilities of replicate ancestral reconstructions
		# use sum of credibilities so that we are not penalizing
		# shift dictionaries for size
		counts = {}
		for ar in range(100):
			for node in shifts[ar].iterkeys():
				if not counts.has_key(node):
					counts.update ( { node: 0.01 } )
				else:
					counts[node] += 0.01
		
		creds = []
		for ar in range(100):
			cred = 0.
			for node in shifts[ar].iterkeys():
				cred += counts[node]
				
			creds.append( [cred, ar] )
		
		creds.sort(reverse=True) # descending order
		#print creds[:10]
		
		maxcc_i = creds[0][1]
		maxcc = shifts[maxcc_i]
		print maxcc_i
		
		# create a new dictionary object with keys from maxcc
		# and incorporating information from 'data'
		theNodes = maxcc.keys()
		theTree = {}
		
		# identify the most recent ancestor for each node in this list
		# that only includes other nodes in the same list or Node0
		for node in theNodes:
			itsTips = []
			itsTips = get_tips (itsTips, node, data)
			
			# also figure out what the most recent tip is descended from this node
			ypos = 0.
			last_days = 0
			for tip in itsTips:
				ypos += tipnames.index(tip) / float(len(tipnames))
				days = int(tip.split('_')[-1])
				if days > last_days:
					last_days = days
					
			# also figure out what proportion of the sample at the most recent
			# time point is descended from this ancestor
			n_descends = 0
			for tip in itsTips:
				days = int(tip.split('_')[-1])
				if days == last_days:
					n_descends += 1
			
			# n_descends is doubled to express a count out of 50 sampled sequences as a percentage
			theTree.update ( { node: {	'delta': maxcc[node],
										'start': data[node]['start'],
										'end': data[node]['end'],
										'g2p': data[node]['g2p'][maxcc_i],
										'ypos': ypos / len(itsTips),
										'lastdate': last_days,
										'n_descends': n_descends*2} } )
										
			curnode = node
			while 1:
				parent = data[curnode]['parent']
				if parent == 'Node0' or parent in theNodes:
					theTree[node].update ( { 'parent': parent } )
					break
				curnode = parent
		
		# keep 'lastdate' only for shift nodes with no descendants in the list
		# (i.e., those closest to the tips); internal ancestors get 'NA'
		for node in theTree.iterkeys():
			is_parent = False
			for node2 in theTree.iterkeys():
				if node == node2:
					continue
				if theTree[node2]['parent'] == node:
					is_parent = True
					break
				
			if is_parent:
				theTree[node]['lastdate'] = 'NA'
		
		# add root
		theTree.update ( { 'Node0': { 'start': data['Node0']['start'],
									'end': data['Node0']['end'],
									'g2p': data['Node0']['g2p'][maxcc_i],
									'ypos': 0.5,
									'lastdate': 'NA',
									'n_descends': '100'} } )
		
		
		# load protein sequence reconstructions
		infile = open(path+'hyphy/anc/'+prefix+'.prot', 'rU')
		lines = infile.readlines()
		infile.close()
		
		for line in lines:
			treerep, ancrep, nodename, aaseq = line.strip('\n').split(',')
			if int(treerep) == maxcred and int(ancrep) == maxcc_i:
				if theTree.has_key(nodename):
					aligned_query, aligned_ref, align_score = pairwise_align(h, v3ref, aaseq)
					theTree[nodename].update ( {'aaseq': aligned_query, 'refseq':aligned_ref} )
			elif int(treerep) > maxcred:
				break
		
		# output results
		for node in theTree.iterkeys():
			outfile.write(patid+','+compart+','+node+',')
			
			outfile.write(str(theTree[node]['ypos'])+',')
			outfile.write(str(theTree[node]['lastdate'])+',')
			outfile.write(str(theTree[node]['n_descends'])+',')
			
			outfile.write('%f,%f,' % (theTree[node]['start'], theTree[node]['end']))
			outfile.write(str(theTree[node]['g2p'])+',')
			
			if node == 'Node0':
				outfile.write('NA,')
			else:
				outfile.write(str(theTree[node]['delta'])+',')
			
			# get sequence
			outfile.write(theTree[node]['aaseq']+','+theTree[node]['refseq']+',')
			
			if node == 'Node0':
				outfile.write(',\n')
			else:
				parent = theTree[node]['parent']
				outfile.write(parent+','+theTree[parent]['aaseq']+'\n')
			
outfile.close()

viz_g2p_shifts.py

import math
from g2pScorer import convert2fpr

last_timepts = [ 1436, 1631, 2318, 2052, 2978, 2016, 1354, 1449, 1211, 1625]
t_adjust = [  0, 181, 182, 182, 171, 182, 184, 207, 166, 187 ]
patids = [str(i) for i in range(1,10)] + ['X']
time_of_si = {}
for i in range(10):
	time_of_si.update( {patids[i]: last_timepts[i] - t_adjust[i] } )

print time_of_si

path = '/Users/apoon/wip/swenson/dutch_v3/'

infile = open(path+'results/map_g2p_shifts_2.out', 'rU')
lines = infile.readlines()
infile.close()

data = {}

for line in lines:
	patid, compart, node, ypos, lastdate, n_descend, start, end, g2p, delta, seq, refseq, parent, parseq = line.strip('\n').split(',')
	if not data.has_key(patid):
		data.update ( {patid: {}} )
	if not data[patid].has_key(compart):
		data[patid].update ( {compart: {}} )
	
	data[patid][compart].update ( {node: {	'ypos': float(ypos),
											'lastdate':lastdate,
											'n_descend':int(n_descend),
											'start': float(start), 
											'end': float(end), 
											'g2p': float(g2p), 
											'seq': seq, 
											'refseq': refseq, 
											'parent': parent,
											'parseq': parseq}} )


# do we need to rescale the x-axis?  can fit about 6 years into plot area
plot_settings = {}

for patid in data.iterkeys():
	plot_settings.update ( {patid: {'min':99999, 'max':0}} )
	for compart in data[patid].iterkeys():
		starts = [ v['start'] for v in data[patid][compart].itervalues() ]
		ends = [ int(v['lastdate']) for v in data[patid][compart].itervalues() if v['lastdate'] != 'NA' ]
		span = ( int(min(starts)), int(max(ends)) ) # days since January 1, 1990
		if span[0] < plot_settings[patid]['min']:
			plot_settings[patid]['min'] = span[0]
		if span[1] > plot_settings[patid]['max']:
			plot_settings[patid]['max'] = span[1]


print plot_settings

compart2label = {'PLA':'plasma', 'PBM':'PBMC'}

# iterate through patients and compartments

for patid in data.iterkeys():
	for compart in data[patid].iterkeys():
		prefix = 'DS'+patid+'_'+compart
		print prefix
		
		outfile = open(path+'results/'+prefix+'_2.ps', 'w')
		
		outfile.write('%!\n%%PS file '+prefix+'\n<< /PageSize [2500 2500] >> setpagedevice\n')
		
		
		# draw patient + compartment label
		outfile.write('/Helvetica findfont\n')
		outfile.write('80 scalefont\nsetfont\n')
		outfile.write('250 2250 moveto (%s) show\n' % ('DS'+patid+' '+compart2label[compart]))
		
		# draw timeline
		x_scale = 2000. / ((plot_settings[patid]['max'] - plot_settings[patid]['min']))
		
		outfile.write('0.5 0.5 0.5 setrgbcolor\n')
		x = 250 + x_scale * (time_of_si[patid] - plot_settings[patid]['min'])
		outfile.write('%d %d moveto %d %d lineto stroke\n' % (x, 250, x, 2250))
		
		outfile.write('/Helvetica findfont\n')
		outfile.write('48 scalefont\nsetfont\n')
		
		for node in data[patid][compart].iterkeys():
			print node
			
			# draw points given (x,y) coordinates = (end, ypos*2000)
			
			x = int(250 + x_scale * (data[patid][compart][node]['end'] - plot_settings[patid]['min']))
			
			y = int(2000*data[patid][compart][node]['ypos']+250)
			
			if node != 'Node0':
				# draw Bezier curve to parent node
				parent = data[patid][compart][node]['parent']
				px = int(250 + x_scale * (data[patid][compart][parent]['end'] - plot_settings[patid]['min']) )
				py = int(2000*data[patid][compart][parent]['ypos']+250)
				
				##rectangular tree
				#outfile.write( '%d %d moveto %d %d lineto stroke\n' % (x, y, px, y) )
				#outfile.write( '%d %d moveto %d %d lineto stroke\n' % (px, y, px, py) )
				n_descend = data[patid][compart][node]['n_descend']
				g2p = data[patid][compart][node]['g2p']
				fpr = convert2fpr (g2p) / 100.
				
				#print '%s\t\t%1.3f\t%1.1f' % (node, g2p, fpr*100)
				outfile.write('%d setlinewidth\n' % (5+n_descend/3))
				outfile.write( '%1.3f %1.3f %1.3f setrgbcolor\n' % (1-math.exp(-5*g2p), 0., math.exp(-5*g2p)) )
				
				outfile.write( '%d %d moveto %d %d %d %d %d %d curveto stroke\n' % (x, y, px, y, px, y, px, py) )
				
				lastdate = data[patid][compart][node]['lastdate']
				if lastdate != 'NA':
					lastdate = 250 + x_scale * (int(lastdate) - plot_settings[patid]['min'])
					outfile.write('0.8 0.8 0.8 setrgbcolor\n')
					outfile.write('%d %d moveto %d %d lineto stroke\n' % (x, y, lastdate, y))
					outfile.write('0.5 0.5 0.5 setrgbcolor\n')
					outfile.write('%d %d moveto (%s) show\n' % (lastdate+5, y-10, str(n_descend)+'%'))
				
				# determine substitutions
				seq = data[patid][compart][node]['seq']
				refseq = data[patid][compart][node]['refseq']
				parseq = data[patid][compart][node]['parseq']
				
				# map alignment columns to reference coordinates; insertion columns
				# (gaps in the reference) get a literal '\u2207' prefix, which
				# Python 2 leaves uninterpreted in a byte string
				coords = []
				index = 1
				for refchar in refseq:
					if refchar == '-':
						coords.append('\u2207'+str(index))
					else:
						coords.append(str(index))
						index += 1
				
				outfile.write( '0 0 0 setrgbcolor\n' )
				nsubs = 0
				for i in range(len(seq)):
					if seq[i] != parseq[i]:
						if y > py:
							outfile.write('%d %d moveto (%s) show\n' % (x-120, y+10+nsubs*50, parseq[i]+coords[i]+seq[i]))
						else:
							outfile.write('%d %d moveto (%s) show\n' % (x-120, y-50-nsubs*50, parseq[i]+coords[i]+seq[i]))
						nsubs += 1
				
				
			outfile.write( '0 0 0 setrgbcolor\n' )
			outfile.write( '%d %d 10 0 360 arc fill\n' % (x,y) )	
		
		# draw time line
		outfile.write( '0 0 0 setrgbcolor\n' )
		outfile.write('5 setlinewidth\n')
		outfile.write('250 200 moveto 2250 200 lineto stroke\n')
		
		outfile.write('/Helvetica findfont\n')
		outfile.write('56 scalefont\nsetfont\n')
		for i in range(int(plot_settings[patid]['min']/365.25), int(plot_settings[patid]['max']/365.25)+1):
			xcoord = 250 + x_scale * (i*365.25 - plot_settings[patid]['min'])
			if xcoord > 0:
				outfile.write('%d 150 moveto (%s) show\n' % ( xcoord, str(1990+i) ) )
		
		
		# draw legend: g2p scores corresponding to the FPR cutoffs in 'fprs'
		refpts = [1., 0.351901, 0.156598, 0.105784, 0.059826, 0]
		fprs = ['0', '3.5', '10', '20', '50', '100']
		
		outfile.write('/Helvetica findfont\n')
		outfile.write('42 scalefont\nsetfont\n')
		
		for i in range(len(refpts)):
			outfile.write( '%1.3f %1.3f %1.3f setrgbcolor\n' % (1-math.exp(-5*refpts[i]), 0., math.exp(-5*refpts[i])) )
			outfile.write('newpath\n%d %d moveto\n' % (100, 300+75*i))
			outfile.write('%d %d lineto\n' % (200,300+75*i))
			outfile.write('%d %d lineto\n' % (200,375+75*i))
			outfile.write('%d %d lineto\n' % (100,375+75*i))
			outfile.write('%d %d lineto\n' % (100,300+75*i))
			outfile.write('closepath\nfill\n')
			outfile.write('1 1 1 setrgbcolor\n')
			outfile.write('%d %d moveto (%s) show\n' % (115, 325+75*i, fprs[i]))
		
		outfile.write('showpage\n')
		outfile.close()
