/* emeline v2.5 Align sequences in codon space against CfE references, correcting for frameshift-inducing indels. C H A N G E L O G April 18, 2011 - attempting a totally new algorithm for detecting frame shifts First, do translations in all 3 reading frames for the entire sequence Then find the path through the 3 frames that maximizes the alignment score relative to reference using a greedy search heuristic. This circumvents the problem of having to adjust gaps filled into a sequence with each iteration. --> 2.2 April 28, 2011 - found a bug in generating codon sequence from protein alignment results (splicing in frame shifts) - when attempting to do within-codon alignment, codon in reference sequence was not being correctly looked up because the coordinates were relative to the alignment width and not reference sequence length. --> 2.2.1 April 29, 2011 - there is a STILL a problem with processing: 35683A_V3LOOP_CFE4_14-Oct-2010.2_4_TAGH_reverse.seq introduces frame shifts where none exist in the top 3 sequences :-P Let's try solving this by using nucleotide sequence alignment to clip sequences to the reference (the problem arises when the reference is over-long - enforcing terminal gap penalties causes a handful of sites to slide over to the end, exchanging a huge internal gap for a huge terminal gap. --> 2.3 protein alignment character map is limited to amino acids - not scoring stop codons? maybe I should put this back in... Found a bug introduced by changing nucleotide alignment settings (not enforcing terminal gap penalty) which screwed up within-codon alignment. Changing this setting back to enforcing terminal gap penalties immediately before the within-codon alignment step. --> 2.3.1 Need to bump up the gap penalty (SEQ_ALIGN_GAP_OPEN2 -> 10) for initial alignment to make sure that we capture all of the V3 sequence in the query. What was happening was that the alignment would introduce gaps in the query sequence, pushing the rest of the V3 sequence past the end of the reference. That part would then get clipped out and lost, causing further problems downstream. see : 35682A_V3LOOP_CFE4_14-Oct-2010.1_1_TAGG_forward Revert this setting to its original value (5) before doing within-codon step. --> 2.3.2 Single gap deletions N-N generally represent real codons and should be retained, which is current behaviour. Single gap insertions N-- generally represent false codons when they are padded to bring the rest of the sequence into frame. We may want to remove these codons and rescue the sequence, but for now let's throw out any sequence with a frame-shifting indel. Now removing codons representing single nucleotide insertions. NOTE this is a form of imputation. --> 2.4 Trimming reference sequence down to the region that actually aligns with data during initial nucleotide alignments. --> 2.5 */ RequireVersion ("2.0020100903"); nucAlignOptions = {}; nucScoreMatrix = { {5,-4,-4,-4} {-4,5,-4,-4} {-4,-4,5,-4} {-4,-4,-4,5}}; nucAlignOptions ["SEQ_ALIGN_CHARACTER_MAP"] = "ACGT"; nucAlignOptions ["SEQ_ALIGN_SCORE_MATRIX"] = nucScoreMatrix; nucAlignOptions ["SEQ_ALIGN_GAP_OPEN"] = 10; nucAlignOptions ["SEQ_ALIGN_GAP_OPEN2"] = 10; nucAlignOptions ["SEQ_ALIGN_GAP_EXTEND"] = 1; nucAlignOptions ["SEQ_ALIGN_GAP_EXTEND2"] = 1; nucAlignOptions ["SEQ_ALIGN_AFFINE"] = 1; nucAlignOptions ["SEQ_ALIGN_NO_TP"] = 1; // minimum number of nucleotides that aligned sequences must overlap by MIN_OVERLAP = 12; /* --------------------------------------------------------------------- */ // returns a tuple of the lengths of gap prefix and suffix in a sequence function getBoundaries (str) { correctionFore = (str$"^\\-+")[1]+1; correctionAft = (str$"\\-+$")[0]; if (correctionAft < 0) { correctionAft = Abs(str); } return {{correctionFore__,correctionAft__}}; } /*-------------------------------------------------*/ function translateToAA (aSeq, offset) { seqLen = Abs (aSeq)-2; translString = ""; translString * (seqLen/3+1); for (seqPos = offset; seqPos < seqLen; seqPos = seqPos+3) { codon = aSeq[seqPos][seqPos+2]; prot = codonToAAMap[codon]; if (Abs(prot)) { translString * prot; } else { translString * "?"; } } translString * 0; translString = translString^{{"X$","?"}}; return translString; } /*-------------------------------------------------*/ /* call include files */ incFileName = HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"GrabBag.bf"; ExecuteAFile (incFileName); incFileName = HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"Utility"+DIRECTORY_SEPARATOR+"DescriptiveStatistics.bf"; ExecuteAFile (incFileName); incFileName = HYPHY_BASE_DIRECTORY+"TemplateBatchFiles"+DIRECTORY_SEPARATOR+"TemplateModels"+DIRECTORY_SEPARATOR+"chooseGeneticCode.def"; ExecuteAFile (incFileName); codonToAAMap = defineCodonToAA(); /* load the sequence data */ SetDialogPrompt ("Please select a file containing the sequences to align:"); DataSet unal = ReadDataFile (PROMPT_FOR_FILE); DataSetFilter filteredData = CreateFilter (unal, 1); // nucleotide partition GetString (sequenceNames, unal, -1); GetInformation (UnalignedSeqs,filteredData); unalSequenceCount = Rows(UnalignedSeqs)*Columns(UnalignedSeqs); /* ==================================================================== */ /* PHASE 0. Pre-processing of sequences */ /* ==================================================================== */ fprintf (stdout, "\n[PHASE 0] Pre-processing sequence data...\n"); // to determine which of the sequences is the longest - used only if aligning // against longest sequence (refSeq == 1, doLongestSequence == TRUE) longestSequence = 0; longestSequenceIDX = 0; // associative list to store information per sequence seqRecord = {}; // an associative list that will store sequence records, keyed by incremental index toDoSequences = {}; for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) { // remove non-alphabetical characters (e.g., '?', '*', '-') UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"[^a-zA-Z]",""}}; // remove runs of ambiguous nucleotides ('N's) at the ends UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"^N+",""}}; UnalignedSeqs[seqCounter] = UnalignedSeqs[seqCounter]^{{"N+$",""}}; seqRecord ["SEQUENCE_ID"] = sequenceNames[seqCounter]^{{","}{"_"}}; seqRecord ["RAW"] = UnalignedSeqs[seqCounter]; // store this record toDoSequences[Abs(toDoSequences)] = seqRecord; if (Abs (UnalignedSeqs[seqCounter]) > longestSequence) { longestSequence = Abs (UnalignedSeqs[seqCounter]); longestSequenceIDX = seqCounter; } SetParameter (STATUS_BAR_STATUS_STRING, "Initial processing ("+seqCounter+"/"+unalSequenceCount+" done)",0); } /* ==================================================================== */ /* PHASE 1. Determine reference sequence */ /* ==================================================================== */ fprintf (stdout, "\n[PHASE 1] Determining reference sequence...\n"); predefRefSeqs = {}; predefRefSeqs [0] = "CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACAGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCAGAAAAAGACAGCTGGACTGTCAATGACATACAGAAGTTAGTGGGGAAATTGAATTGGGCAAGTCAGATTTACCCAGGGATTAAAGTAAGGCAATTATGTAAACTCCTTAGAGGAACCAAAGCACTAACAGAAGTAATACCACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGAGAGATTCTAAAAGAACCAGTACATGGAGTGTATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAATTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAATATGCAAGAATGAGGGGTGCCCACACTAATGATGTAAAACAATTAACAGAGGCAGTGCAAAAAATAACCACAGAAAGCATAGTAATATGGGGAAAGACTCCTAAATTTAAACTGCCCATACAAAAGGAAACATGGGAAACATGGTGGACAGAGTAT"; predefRefSeqs [1] = "CCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTTCCCATTAGCCCTATTGAGACTGTACCAGTAAAATTAAAGCCAGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATTTGTACAGAGATGGAAAAGGAAGGGAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTATTTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAACTCAAGACTTCTGGGAAGTTCAATTAGGAATACCACATCCCGCAGGGTTAAAAAAGAAAAAATCAGTAACAGTACTGGATGTGGGTGATGCATATTTTTCAGTTCCCTTAGATGAAGACTTCAGGAAGTATACTGCATTTACCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAAAGGATCACCAGCAATATTCCAAAGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGACATAGTTATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAAAAATAGAGGAGCTGAGACAACATCTGTTGAGGTGGGGACTTACCACACCAGACAAAAAACATCAGAAAGAACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTG"; predefRefSeqs [2] = "GCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGT"; predefRefSeqs [3] = "GAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATGAAACATGGGAAACATGGTGGACAGAGTATTGGCAAGCCACCTGGATTCCTGAGTGGGAGTTTGTTAATACCCCTCCCTTAGTGAAATTATGGTACCAGTTAGAGAAAGAACCCATAGTAGGAGCAGAAACCTTCTATGTAGATGGGGCAGCTAACAGGGAGACTAAATTAGGAAAAGCAGGATATGTTACTAATAGAGGAAGACAAAAAGTTGTCACCCTAACTGACACAACAAATCAGAAGACTGAGTTACAAGCAATTTATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTAACAGACTCACAATATGCATTAGGAATCATTCAAGCACAACCAGATCAAAGTGAATCAGAGTTAGTCAATCAAATAATAGAGCAGTTAATAAAAAAGGAAAAGGTCTATCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAATTAGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGAT"; predefRefSeqs [4] = "TTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGATGAACATGAGAAATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTGCCACCTGTAGTAGCAAAAGAAATAGTAGCCAGCTGTGATAAATGTCAGCTAAAAGGAGAAGCCATGCATGGACAAGTAGACTGTAGTCCAGGAATATGGCAACTAGATTGTACACATTTAGAAGGAAAAGTTATCCTGGTAGCAGTTCATGTAGCCAGTGGATATATAGAAGCAGAAGTTATTCCAGCAGAAACAGGGCAGGAAACAGCATATTTTCTTTTAAAATTAGCAGGAAGATGGCCAGTAAAAACAATACATACTGACAATGGCAGCAATTTCACCGGTGCTACGGTTAGGGCCGCCTGTTGGTGGGCGGGAATCAAGCAGGAATTTGGAATTCCCTACAATCCCCAAAGTCAAGGAGTAGTAGAATCTATGAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAAATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAGTAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCGGGTTTATTACAGGGACAGCAGAAATCCACTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGGGCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATTAGGGATTATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGAT"; predefRefSeqs [5] = "ATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAA"; predefRefSeqs [6] = "GCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACGGTGAGTGACCCCGGCCCGGGGCGCAGGTCACGACCCCTCATCCCCCACGGACGGGCCAGGTCGCCCACAGTCTCCGGGTCCGAGATCCACCCCGAAGCCGCGGGACTCCGAGACCCTTGTCCCGGGAGAGGCCCAGGCGCCTTTACCCGGTTTCATTTTCAGTTTAGGCCAAAAATCCCCCCGGGTTGGTCGGGGCGGGGCGGGGCTCGGGGGACTGGGCTGACCGCGGGGTCGGGGCCAGGTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGCCGGACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCAGCTCAGATCACCAAGCGCAAGTGGGAGGCGGTCCATGCGGCGGAGCAGCGGAGAGTCTACCTGGAGGGCCGGTGCGTGGACGGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGG"; predefRefSeqs [7] = "GCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACCCAGTTCGTGAGGTTCGACAGCGACGCCGCGAGTCCGAGAGAGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCGGAACACACAGATCTACAAGGCCCAGGCACAGACTGACCGAGAGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG"; predefRefSeqs [8] = "GGTCTCACACCCTCCAGAGCATGTACGGCTGCGACGTGGGGCCGGACGGGCGCCTCCTCCGCGGGCATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACGGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCGAGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGACAAGCTGGAGCGCGCTG"; predefRefSeqs [9] = "GCTCCCACTCCATGAAGTATTTCTTCACATCCGTGTCCCGGCCTGGCCGCGGAGAGCCCCGCTTCATCTCAGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGTCCGAGAGGGGAGCCGCGGGCGCCGTGGGTGGAGCAGGAGGGGCCGGAGTATTGGGACCGGGAGACACAGAAGTACAAGCGCCAGGCACAGACTGACCGAGTGAGCCTGCGGAACCTGCGCGGCTACTACAACCAGAGCGAGGCCG"; predefRefSeqs [10] = "GGTCTCACACCCTCCAGTGGATGTGTGGCTGCGACCTGGGGCCCGACGGGCGCCTCCTCCGCGGGTATGACCAGTACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCCTGGACCGCCGCGGACACCGCGGCTCAGATCACCCAGCGCAAGTGGGAGGCGGCCCGTGAGGCGGAGCAGCGGAGAGCCTACCTGGAGGGCACGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCGCGG"; predefRefSeqs [11] = "ATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTGGATGGCCTACTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATAGGGTGGGAGCAGCATCTCGAGACCTGGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTACCAATGCTGCTTGTGCCTGGCTAGAAGCACAAGAGGAGGAGGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACAAGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATAAGATAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGGATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTGCATCCGGAGTACTTCAAGAACTGC"; predefRefSeqs [12] = "ATGGGTGCGAGAGCGTCAGTATTAAGCGGGGGAGAATTAGATCGATGGGAAAAAATTCGGTTAAGGCCAGGGGGAAAGAAAAAATATAAATTAAAACATATAGTATGGGCAAGCAGGGAGCTAGAACGATTCGCAGTTAATCCTGGCCTGTTAGAAACATCAGAAGGCTGTAGACAAATACTGGGACAGCTACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAGTAGCAACCCTCTATTGTGTGCATCAAAGGATAGAGATAAAAGACACCAAGGAAGCTTTAGACAAGATAGAGGAAGAGCAAAACAAAAGTAAGAAAAAAGCACAGCAAGCAGCAGCTGACACAGGACACAGCAATCAGGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAAGTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTGATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCCCACAAGATTTAAACACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGACCATCAATGAGGAAGCTGCAGAATGGGATAGAGTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAGATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGACAAATAATCCACCTATCCCAGTAGGAGAAATTTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGTAAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTAGACCGGTTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAGGAGGTAAAAAATTGGATGACAGAAACCTTGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGCGGCTACACTAGAAGAAATGATGACAGCATGTCAGGGAGTAGGAGGACCCGGCCATAAGGCAAGAGTTTTGGCTGAAGCAATGAGCCAAGTAACAAATTCAGCTACCATAATGATGCAGAGAGGCAATTTTAGGAACCAAAGAAAGATTGTTAAGTGTTTCAATTGTGGCAAAGAAGGGCACACAGCCAGAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTGGAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATCTGGCCTTCCTACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAGAGAGCTTCAGGTCTGGGGTAGAGACAACAACTCCCCCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTATCCTTTAACTTCCCTCAGGTCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAGTTTGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT"; predefRefSeqs [13] = "ATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAGAACATGGAAAAGTTTAGTAAAACACCATATGTATGTTTCAGGGAAAGCTAGGGGATGGTTTTATAGACATCACTATGAAAGCCCTCATCCAAGAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAGATTGGTAATAACAACATATTGGGGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGATATAGCACACAAGTAGACCCTGAACTAGCAGACCAACTAATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTAGCAGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAGAGCTTTTAGAGGAGCTTAAGAATGAAGCTGTTAGACATTTTCCTAGGATTTGGCTCCATGGCTTAGGGCAACATATCTATGAAACTTATGGGGATACTTGGGCAGGAGTGGAAGCCATAATAAGAATTCTGCAACAACTGCTGTTTATCCATTTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACTCGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAGCCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATAACAAAAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGACTCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAACGCAACCTATACCAATAGTAGCAATAGTAGCATTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAAGACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTG"; predefRefSeqs [14] = "TATGATTACTGTTAATGTTGCTACTACTGCTGACAATGCTGCTGCTGCTTCTCCTCACTGTCTCCACTTCCTTGAACAATGCGCCGTCATGCTTCTTTTGCCTCCCGCTGCTCCAGAAAGCTAGGCCGCAGATCAGAACCACCACAGTCAATATCACCACCTTCCTCTTATAGATTCGGAATCTCATGATAGGGGCTCAGCCTCTGTGCGAGTGGAGAGAAGTTTGCAGGCGAGCTGAGGAGCAATTGCAGGTGATATGATGTGCTCGGCTCAAGAAGCGGGCCCGGAGAGGAAGAAGTCGTGCCGGGGCTAATTATTGGCAAAACGAGCTCTTGTTGTAAACATTGATCCAACTGGAATGTCACTAATGGCGAATCAATATTCCATAAGGCATGATGGTTGCTCAGAGGCAGGAGAAGAGCAACGAATACGATCCTATAAAAGATAAAACATAAATAAACAGTCTTGATTATATTCTGGGTATTAAAGCCACAATCAGAACAAATATATGCTTTGTATCTTTTCTTGCCTTCTTCATTACCAACTGCTTCCGCGGCCACATTAAGAGAACTTGTGGTAAGATAAGAAGATATTTTATTCGTTCTGCTGACTTGCTGGATGTCGGGAAATATTCTGCATTTGATAAGAGGCGGTTAATTGCAGATATAATTGGTAGTGAAAAGGGTCGTTGCTATGGTCACCGTGAAGCGAGTACAGCAGCACAAGAATGTGTGCCGTTCTCAGTTAATATTGTTTGAATATGGTAACCTGTTTTAGTCGGTTTAAAGGTAAGAAGATCTAACCAAAAACAACACTGCAGTGACTGATTGTAGTATTTATTTTTTTACTTAATCTTAATTTTGGTGTAAA"; predefRefSeqs [15] = "ATGAGAGTGAAGGAGAAATATCAGCACTTGTGGAGATGGGGGTGGAGATGGGGCACCATGCTCCTTGGGATGTTGATGATCTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACCACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAAAAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTAAAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTAGCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGGTAAGGTGCAGAAAGAATATGCATTTTTTTATAAACTTGATATAATACCAATAGATAATGATACTACCAGCTATAAGTTGACAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATACATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATGTACAAATGTCAGCACAGTACAATGTACACATGGAATTAGGCCAGTAGTATCAACTCAACTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGGTAGTAATTAGATCTGTCAATTTCACGGACAATGCTAAAACCATAATAGTACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGAATCCGTATCCAGAGAGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAACATTAGTAGAGCAAAATGGAATAACACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATAATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGGAGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACTGAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACCCTCCCATGCAGAATAAAACAAATTATAAACATGTGGCAGAAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATATTACAGGGCTGCTATTAACAAGAGATGGTGGTAATAGCAACAATGAGTCCGAGATCTTCAGACCTGGAGGAGGAGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAGTAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTTCCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCCTCAATGACGCTGACGGTACAGGCCAGACAATTATTGTCTGGTATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGTTGCAACTCACAGTCTGGGGCATCAAGCAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGATCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCTAGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATCACACGACCTGGATGGAGTGGGACAGAGAAATTAACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGAATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATAAAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGAATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAACCCCGAGGGGACCCGACAGGCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCCTTGGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCTTGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCTCCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAGCTTGCTCAATGCCACAGCCATAGCAGTAGCTGAGGGGACAGATAGGGTTATAGAAGTAGTACAAGGAGCTTGTAGAGCTATTCGCCACATACCTAGAAGAATAAGACAGGGCTTGGAAAGGATTTTGCTATAA"; predefRefSeqs [16] = "TGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGT"; predefRefSeqs [17] = "AATTTCACAGACAATACCAAAACCATAATAGTACAGCTGAAGGAATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATACATATAGGACCAGGGAGAGCATTTTATGCAACAGGAGAAATAATAGGAGATATAAGACAAGCACATTGTAACCTTAGTAGAGCAAAATGGAATGACACTTTAAACCAGATAGTT"; predefSeqNames = { /*0*/ {"DRT clinical", "PR-RT reference for CfE Drug Resistance"} /*1*/ {"POL1017 clinical", "PR-RT reference for ?"} /*2*/ {"gp41", "gp41 CfE reference"} /*3*/ {"Int/RNAse H/RT end", "Integrase, RNAse H and the end of RT reference"} /*4*/ {"Int clinical", "Clinical reference for HIV integrase"} /*5*/ {"Gag", "gag CfE reference"} /*6*/ {"HLA-A, clinical", "HLA-A clinical reference"} /*7*/ {"HLA-Ba, clinical", "HLA-Ba clinical reference"} /*8*/ {"HLA-Bb, clinical", "HLA-Bb clinical reference"} /*9*/ {"HLA-Ca, clinical", "HLA-Ca clinical reference"} /*10*/ {"HLA-Cb, clinical", "HLA-Cb clinical reference"} /*11*/ {"Nef", "nef CfE reference"} /*12*/ {"Gag/Pol", "gag/pol CfE reference"} /*13*/ {"Vif/Vpr/Vpu", "vif/vpr/vpu CfE reference"} /*14*/ {"ABI Standard", "ABI standard"} /*15*/ {"gp160", "gp160 CfE reference"} /*16*/ {"V3, clinical", "Clinical reference for HIV env V3"} /*17*/ {"V3, extended", "Extended V3 CfE reference"} }; ChoiceList (alignMode, "Choose an alignment mode:", 1, SKIP_NONE, "Custom", "Input your own sequence to use as a reference", "First in file", "Use the first sequence in the data file as a reference", "Longest in file", "Use the longest sequence in the data file as the reference", "Select from list", "Choose a reference sequence from a list", "Automatic", "Automatically determine which reference sequence to use. NOTE: this will only align the first sequence in your data against a limited number of CfE HIV reference sequences."); assert (alignMode >= 0, "Alignment mode selection"); if (alignMode == 0) { fprintf (stdout, "Enter sequence to use as reference in plain-text: "); fscanf (stdin, "String", masterReferenceSequence); } if (alignMode == 1) { // first sequence as reference masterReferenceSequence = UnalignedSeqs[0]; } if (alignMode == 2) { // report longest sequence fprintf (stdout, "\nSelected sequence ", sequenceNames[longestSequenceIDX], " as reference."); masterReferenceSequence = UnalignedSeqs[longestSequenceIDX]; } if (alignMode == 3) { ChoiceList (refIndex, "Choose a reference sequence", 1, SKIP_NONE, predefSeqNames); assert (refIndex >= 0, "Reference sequence from list"); masterReferenceSequence = predefRefSeqs[refIndex]; } if (alignMode > 3) { // do nucleotide sequence alignment to determine best reference /* NOTE: this does alignments using the first sequence in the data set only. If this sequence is crap, then the whole scheme goes out the window :-P Usually the first sequence will comprise a relatively large number of identical reads, so this -should- be OK. */ bestIndex = -1; bestScore = 0; inStr = {{"", UnalignedSeqs[0]}}; for (rc = 0; rc < 2; rc = rc + 1) { if (rc) { inStr = {{"", nucleotideReverseComplement(UnalignedSeqs[0]) }}; } for (pd = 0; pd < Abs(predefRefSeqs); pd = pd+1) { inStr[0] = predefRefSeqs[pd]; AlignSequences (aligned, inStr, nucAlignOptions); aligned = aligned[0]; score = aligned[0]; /* determine length of aligned region without terminal penalties, the sequences are free to slide over until they overlap by only a handful of nucleotides */ bounds1 = getBoundaries(aligned[1]); bounds2 = getBoundaries(aligned[2]); left_bound = Max(bounds1[0], bounds2[0]); right_bound = Min(bounds1[1], bounds2[1]); lenOverlap = right_bound-left_bound; if (score > bestScore && lenOverlap >= MIN_OVERLAP) { bestIndex = pd; bestScore = score; } } } if (bestScore == 0) { fprintf (stdout, "WARNING: None of the predefined reference sequences aligned well to the first sequence in your data!\n"); return 0; } fprintf (stdout, "Selected predefined reference sequence: ", predefSeqNames[bestIndex][0], "\n"); masterReferenceSequence = predefRefSeqs[bestIndex]; } /* -------------------------------------------- */ /* Use nucleotide alignment to clip sequences */ /* to reference; also determine direction of */ /* sequence (i.e., reverse-complement?) */ /* -------------------------------------------- */ fprintf (stdout, "\n[PHASE 1.5] Clipping nucleotide sequences to reference...\n"); inStr = {{masterReferenceSequence, ""}}; allBestScores = {unalSequenceCount,2}; for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter + 1) { aRecord = toDoSequences[seqCounter]; aSeq = aRecord["RAW"]; inStr[1] = aSeq; bestRC = 0; bestScore = -9999999; bestAl = {}; for (rc = 0; rc < 2; rc = rc + 1) { if (rc) { inStr[1] = nucleotideReverseComplement(aSeq); } AlignSequences (aligned, inStr, nucAlignOptions); aligned = aligned[0]; score = aligned[0]; bounds1 = getBoundaries(aligned[1]); bounds2 = getBoundaries(aligned[2]); left_bound = Max(bounds1[0], bounds2[0]); right_bound = Min(bounds1[1], bounds2[1]); lenOverlap = right_bound-left_bound; if (score > bestScore && lenOverlap >= MIN_OVERLAP) { bestRC = rc; bestScore = score; bestAl = aligned; } } bounds1 = getBoundaries(bestAl[1]); bounds2 = getBoundaries(bestAl[2]); left_bound = Max(bounds1[0], bounds2[0]); right_bound = Min(bounds1[1], bounds2[1]); bestSeq = bestAl["2"]; bestSeq = bestSeq[left_bound][right_bound-1]; bestSeq = bestSeq^{{"-"}{""}}; aRecord["IS_REVERSE_COMPL"] = bestRC; aRecord["CLIPPED"] = bestSeq; toDoSequences[seqCounter] = aRecord; allBestScores[seqCounter][0] = bestScore; allBestScores[seqCounter][1] = seqCounter; SetParameter (STATUS_BAR_STATUS_STRING, "Initial nucleotide alignment ("+seqCounter+"/"+unalSequenceCount+" done)",0); } // sorts matrix on first column (0-index) sortedBestScores = allBestScores%0; // indices are stored in second column - now the best scoring sequence index is in the last row allBestIndex = sortedBestScores[Rows(sortedBestScores)-1][1]; /* Using the highest-scoring sequence helps makes sure that we are not using a partial sequence to clip the reference. */ aRecord = toDoSequences[allBestIndex]; aSeq = aRecord["CLIPPED"]; inStr[1] = aSeq; AlignSequences (aligned, inStr, nucAlignOptions); aligned = aligned[0]; bounds1 = getBoundaries(aligned[1]); bounds2 = getBoundaries(aligned[2]); left_bound = Max(bounds1[0], bounds2[0]); right_bound = Min(bounds1[1], bounds2[1]); clippedReferenceSequence = masterReferenceSequence[left_bound][right_bound-1]; clippedRefOffset = left_bound%3; //inStr = {{clippedReferenceSequence, ""}}; /*==============================================*/ /* Protein sequence alignment settings */ /*==============================================*/ /* select an amino acid substitution rate matrix */ protAlignOptions = {}; protAlignOptions ["SEQ_ALIGN_CHARACTER_MAP"]="ARNDCQEGHILKMFPSTWYV"; ChoiceList (matrixOpt, "Scoring Matrix", 1, SKIP_NONE, "BLOSUM62", "Default BLAST BLOSUM62 matrix", "HIV 5%", "Empirically derived 5% divergence HIV matrix", "HIV 25%", "Empirically derived 25% divergence HIV matrix", "HIV 50%", "Empirically derived 50% divergence HIV matrix"); assert(matrixOpt >= 0, "Scoring matrix option"); if (matrixOpt == 0) { // BLOSUM62 protScoreMatrix = {{6,-3,-4,-4,-2,-2,-2,-1,-3,-3,-3,-2,-2,-4,-2,0,-1,-5,-3,-1,-4,-2,-2,-7},{-3,8,-2,-4,-6,0,-2,-5,-2,-6,-4,1,-3,-5,-4,-2,-3,-5,-3,-5,-3,-1,-2,-7},{-4,-2,8,0,-5,-1,-2,-2,0,-6,-6,-1,-4,-5,-4,0,-1,-7,-4,-5,6,-2,-2,-7},{-4,-4,0,8,-6,-2,0,-3,-3,-5,-6,-2,-6,-6,-3,-1,-3,-7,-6,-6,6,0,-3,-7},{-2,-6,-5,-6,10,-5,-7,-5,-5,-3,-3,-6,-3,-5,-5,-2,-2,-4,-4,-2,-5,-6,-4,-7},{-2,0,-1,-2,-5,8,1,-4,0,-6,-4,0,-1,-6,-3,-1,-2,-3,-3,-4,-1,6,-2,-7},{-2,-2,-2,0,-7,1,7,-4,-1,-6,-5,0,-4,-6,-3,-1,-2,-5,-4,-4,0,6,-2,-7},{-1,-5,-2,-3,-5,-4,-4,7,-4,-7,-6,-3,-5,-5,-4,-2,-4,-4,-5,-6,-2,-4,-4,-7},{-3,-2,0,-3,-5,0,-1,-4,10,-6,-5,-2,-3,-3,-4,-2,-4,-5,0,-6,-1,-1,-3,-7},{-3,-6,-6,-5,-3,-6,-6,-7,-6,6,0,-5,0,-1,-5,-5,-2,-5,-3,2,-5,-6,-2,-7},{-3,-4,-6,-6,-3,-4,-5,-6,-5,0,6,-5,1,-1,-5,-5,-3,-3,-3,0,-6,-5,-2,-7},{-2,1,-1,-2,-6,0,0,-3,-2,-5,-5,7,-3,-6,-2,-1,-2,-5,-3,-4,-2,0,-2,-7},{-2,-3,-4,-6,-3,-1,-4,-5,-3,0,1,-3,9,-1,-5,-3,-2,-3,-3,0,-5,-2,-1,-7},{-4,-5,-5,-6,-5,-6,-6,-5,-3,-1,-1,-6,-1,8,-6,-4,-4,0,1,-3,-6,-6,-3,-7},{-2,-4,-4,-3,-5,-3,-3,-4,-4,-5,-5,-2,-5,-6,9,-2,-3,-6,-5,-4,-4,-3,-4,-7},{0,-2,0,-1,-2,-1,-1,-2,-2,-5,-5,-1,-3,-4,-2,7,0,-5,-3,-4,-1,-1,-2,-7},{-1,-3,-1,-3,-2,-2,-2,-4,-4,-2,-3,-2,-2,-4,-3,0,7,-4,-3,-1,-2,-2,-2,-7},{-5,-5,-7,-7,-4,-3,-5,-4,-5,-5,-3,-5,-3,0,-6,-5,-4,12,0,-6,-7,-4,-4,-7},{-3,-3,-4,-6,-4,-3,-4,-5,0,-3,-3,-3,-3,1,-5,-3,-3,0,9,-3,-5,-3,-2,-7},{-1,-5,-5,-6,-2,-4,-4,-6,-6,2,0,-4,0,-3,-4,-4,-1,-6,-3,6,-6,-4,-2,-7},{-4,-3,6,6,-5,-1,0,-2,-1,-5,-6,-2,-5,-6,-4,-1,-2,-7,-5,-6,7,-1,-3,-7},{-2,-1,-2,0,-6,6,6,-4,-1,-6,-5,0,-2,-6,-3,-1,-2,-4,-3,-4,-1,7,-2,-7},{-2,-2,-2,-3,-4,-2,-2,-4,-3,-2,-2,-2,-1,-3,-4,-2,-2,-4,-2,-2,-3,-2,-2,-7},{-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,-7,1}}; } if (matrixOpt == 1) { // HIV-1 between 5% protScoreMatrix = {{8,-13,-16,-8,-15,-17,-8,-7,-16,-14,-14,-17,-16,-20,-7,-7,-2,-23,-23,-4,-10,-10,-8,-24}, {-13,8,-12,-20,-12,-6,-15,-6,-3,-11,-11,-1,-7,-21,-9,-6,-7,-10,-18,-16,-14,-8,-7,-24}, {-16,-12,9,-1,-15,-11,-14,-12,-4,-11,-20,-4,-19,-20,-17,-2,-4,-24,-8,-18,7,-12,-7,-24}, {-8,-20,-1,9,-22,-19,-3,-7,-8,-19,-22,-15,-22,-23,-19,-11,-13,-23,-11,-9,7,-4,-7,-24}, {-15,-12,-15,-22,11,-23,-23,-10,-13,-19,-15,-21,-22,-3,-20,-5,-10,-7,-4,-12,-17,-23,-9,-24}, {-17,-6,-11,-19,-23,8,-7,-17,-4,-20,-8,-4,-13,-20,-5,-15,-13,-20,-15,-20,-12,6,-8,-24}, {-8,-15,-14,-3,-23,-7,8,-6,-15,-20,-23,-5,-14,-24,-20,-19,-13,-23,-17,-9,-5,6,-9,-24}, {-7,-6,-12,-7,-10,-17,-6,7,-19,-20,-22,-11,-21,-13,-20,-5,-12,-9,-22,-10,-8,-7,-9,-24}, {-16,-3,-4,-8,-13,-4,-15,-19,11,-16,-8,-14,-19,-13,-7,-12,-10,-17,-1,-22,-5,-6,-7,-24}, {-14,-11,-11,-19,-19,-20,-20,-20,-16,7,-4,-13,-3,-6,-17,-9,-3,-22,-15,-1,-12,-20,-7,-24}, {-14,-11,-20,-22,-15,-8,-23,-22,-8,-4,7,-16,-5,-3,-7,-10,-16,-10,-15,-8,-21,-11,-9,-24}, {-17,-1,-4,-15,-21,-4,-5,-11,-14,-13,-16,8,-9,-19,-17,-11,-5,-20,-22,-13,-6,-5,-6,-24}, {-16,-7,-19,-22,-22,-13,-14,-21,-19,-3,-5,-9,11,-13,-20,-18,-5,-16,-23,-4,-20,-14,-7,-24}, {-20,-21,-20,-23,-3,-20,-24,-13,-13,-6,-3,-19,-13,10,-19,-10,-18,-10,-2,-10,-21,-22,-8,-24}, {-7,-9,-17,-19,-20,-5,-20,-20,-7,-17,-7,-17,-20,-19,9,-5,-8,-18,-18,-20,-18,-8,-9,-24}, {-7,-6,-2,-11,-5,-15,-19,-5,-12,-9,-10,-11,-18,-10,-5,8,-3,-19,-11,-16,-4,-17,-7,-24}, {-2,-7,-4,-13,-10,-13,-13,-12,-10,-3,-16,-5,-5,-18,-8,-3,8,-23,-16,-10,-6,-13,-6,-24}, {-23,-10,-24,-23,-7,-20,-23,-9,-17,-22,-10,-20,-16,-10,-18,-19,-23,10,-9,-23,-23,-21,-12,-24}, {-23,-18,-8,-11,-4,-15,-17,-22,-1,-15,-15,-22,-23,-2,-18,-11,-16,-9,10,-18,-9,-16,-9,-24}, {-4,-16,-18,-9,-12,-20,-9,-10,-22,-1,-8,-13,-4,-10,-20,-16,-10,-23,-18,8,-11,-11,-7,-24}, {-10,-14,7,7,-17,-12,-5,-8,-5,-12,-21,-6,-20,-21,-18,-4,-6,-23,-9,-11,8,-6,-8,-24}, {-10,-8,-12,-4,-23,6,6,-7,-6,-20,-11,-5,-14,-22,-8,-17,-13,-21,-16,-11,-6,7,-9,-24}, {-8,-7,-7,-7,-9,-8,-9,-9,-7,-7,-9,-6,-7,-8,-9,-7,-6,-12,-9,-7,-8,-9,-8,-24}, {-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,-24,1}}; } if (matrixOpt == 2) { // HIV-1 between 25% protScoreMatrix = {{7,-7,-7,-4,-10,-11,-4,-3,-10,-6,-9,-9,-7,-13,-3,-2,1,-16,-15,0,-5,-5,-3,-17},{-7,7,-5,-11,-8,-2,-7,-2,0,-6,-6,2,-3,-12,-4,-2,-2,-5,-9,-10,-7,-3,-3,-17},{-7,-5,8,2,-9,-6,-6,-7,0,-6,-12,0,-10,-12,-9,1,0,-17,-3,-10,6,-6,-3,-17},{-4,-11,2,8,-14,-10,0,-2,-3,-11,-15,-7,-13,-15,-13,-5,-6,-16,-6,-5,7,0,-3,-17},{-10,-8,-9,-14,11,-16,-15,-5,-7,-11,-9,-13,-14,0,-12,-1,-6,-2,0,-8,-10,-16,-5,-17},{-11,-2,-6,-10,-16,8,-2,-10,0,-12,-4,0,-8,-12,-1,-9,-8,-14,-9,-13,-7,6,-4,-17},{-4,-7,-6,0,-15,-2,7,-1,-9,-12,-15,-1,-10,-17,-13,-11,-8,-15,-12,-5,0,6,-4,-17},{-3,-2,-7,-2,-5,-10,-1,7,-10,-11,-14,-6,-12,-9,-11,-1,-7,-5,-14,-5,-4,-3,-4,-17},{-10,0,0,-3,-7,0,-9,-10,10,-10,-4,-5,-10,-6,-3,-6,-6,-11,2,-14,-1,-2,-3,-17},{-6,-6,-6,-11,-11,-12,-12,-11,-10,7,0,-7,0,-2,-10,-4,0,-14,-9,2,-7,-12,-2,-17},{-9,-6,-12,-15,-9,-4,-15,-14,-4,0,6,-10,0,0,-3,-5,-8,-6,-8,-4,-13,-6,-4,-17},{-9,2,0,-7,-13,0,-1,-6,-5,-7,-10,7,-4,-14,-9,-5,-1,-12,-13,-9,-1,-1,-2,-17},{-7,-3,-10,-13,-14,-8,-10,-12,-10,0,0,-4,10,-7,-11,-9,-1,-11,-15,0,-11,-9,-3,-17},{-13,-12,-12,-15,0,-12,-17,-9,-6,-2,0,-14,-7,10,-11,-5,-10,-5,1,-5,-13,-14,-3,-17},{-3,-4,-9,-13,-12,-1,-13,-11,-3,-10,-3,-9,-11,-11,8,-1,-3,-13,-11,-12,-10,-3,-5,-17},{-2,-2,1,-5,-1,-9,-11,-1,-6,-4,-5,-5,-9,-5,-1,8,0,-12,-6,-9,0,-10,-3,-17},{1,-2,0,-6,-6,-8,-8,-7,-6,0,-8,-1,-1,-10,-3,0,7,-16,-10,-4,-2,-8,-2,-17},{-16,-5,-17,-16,-2,-14,-15,-5,-11,-14,-6,-12,-11,-5,-13,-12,-16,10,-4,-16,-16,-14,-8,-17},{-15,-9,-3,-6,0,-9,-12,-14,2,-9,-8,-13,-15,1,-11,-6,-10,-4,10,-12,-4,-10,-4,-17},{0,-10,-10,-5,-8,-13,-5,-5,-14,2,-4,-9,0,-5,-12,-9,-4,-16,-12,7,-7,-7,-3,-17},{-5,-7,6,7,-10,-7,0,-4,-1,-7,-13,-1,-11,-13,-10,0,-2,-16,-4,-7,7,-2,-4,-17},{-5,-3,-6,0,-16,6,6,-3,-2,-12,-6,-1,-9,-14,-3,-10,-8,-14,-10,-7,-2,6,-4,-17},{-3,-3,-3,-3,-5,-4,-4,-4,-3,-2,-4,-2,-3,-3,-5,-3,-2,-8,-4,-3,-4,-4,-3,-17},{-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,1}}; } if (matrixOpt == 3) { // HIV-1 between 50% protScoreMatrix = {{7,-5,-4,-2,-7,-8,-2,-1,-8,-2,-6,-5,-4,-9,-1,-1,2,-13,-12,1,-3,-4,-2,-13},{-5,6,-2,-7,-6,0,-4,0,1,-4,-4,3,-1,-9,-3,0,-1,-4,-6,-7,-4,-1,-1,-13},{-4,-2,7,3,-6,-3,-3,-4,0,-4,-8,0,-7,-8,-5,2,0,-13,-2,-7,6,-3,-1,-13},{-2,-7,3,8,-10,-6,2,-1,-1,-8,-12,-4,-10,-11,-9,-2,-4,-12,-4,-4,6,0,-2,-13},{-7,-6,-6,-10,11,-12,-12,-3,-4,-7,-6,-9,-10,2,-8,0,-4,0,1,-6,-7,-12,-3,-13},{-8,0,-3,-6,-12,7,-1,-7,1,-8,-2,0,-6,-8,0,-6,-5,-11,-6,-10,-5,5,-2,-13},{-2,-4,-3,2,-12,-1,7,0,-6,-8,-11,0,-7,-13,-9,-7,-5,-12,-9,-3,0,5,-2,-13},{-1,0,-4,-1,-3,-7,0,7,-7,-8,-10,-3,-9,-7,-8,0,-4,-3,-10,-4,-2,-1,-3,-13},{-8,1,0,-1,-4,1,-6,-7,9,-7,-2,-2,-7,-3,-1,-3,-4,-8,3,-10,0,-1,-1,-13},{-2,-4,-4,-8,-7,-8,-8,-8,-7,6,0,-5,2,0,-7,-3,0,-11,-7,3,-5,-8,-1,-13},{-6,-4,-8,-12,-6,-2,-11,-10,-2,0,6,-7,0,1,-1,-4,-5,-4,-5,-1,-10,-5,-3,-13},{-5,3,0,-4,-9,0,0,-3,-2,-5,-7,6,-3,-11,-6,-2,0,-9,-9,-6,0,0,-1,-13},{-4,-1,-7,-10,-10,-6,-7,-9,-7,2,0,-3,10,-4,-7,-6,0,-9,-10,1,-8,-6,-1,-13},{-9,-9,-8,-11,2,-8,-13,-7,-3,0,1,-11,-4,9,-7,-4,-7,-3,3,-3,-9,-10,-2,-13},{-1,-3,-5,-9,-8,0,-9,-8,-1,-7,-1,-6,-7,-7,8,0,-1,-11,-8,-8,-7,-2,-3,-13},{-1,0,2,-2,0,-6,-7,0,-3,-3,-4,-2,-6,-4,0,7,1,-9,-4,-6,0,-6,-1,-13},{2,-1,0,-4,-4,-5,-5,-4,-4,0,-5,0,0,-7,-1,1,6,-12,-7,-1,0,-5,-1,-13},{-13,-4,-13,-12,0,-11,-12,-3,-8,-11,-4,-9,-9,-3,-11,-9,-12,10,-2,-12,-12,-11,-6,-13},{-12,-6,-2,-4,1,-6,-9,-10,3,-7,-5,-9,-10,3,-8,-4,-7,-2,9,-9,-3,-7,-3,-13},{1,-7,-7,-4,-6,-10,-3,-4,-10,3,-1,-6,1,-3,-8,-6,-1,-12,-9,6,-5,-5,-1,-13},{-3,-4,6,6,-7,-5,0,-2,0,-5,-10,0,-8,-9,-7,0,0,-12,-3,-5,7,0,-2,-13},{-4,-1,-3,0,-12,5,5,-1,-1,-8,-5,0,-6,-10,-2,-6,-5,-11,-7,-5,0,6,-3,-13},{-2,-1,-1,-2,-3,-2,-2,-3,-1,-1,-3,-1,-1,-2,-3,-1,-1,-6,-3,-1,-2,-3,-2,-13}, {-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,-13,1}}; } protAlignOptions ["SEQ_ALIGN_SCORE_MATRIX"] = protScoreMatrix[{{0,0}}][{{19,19}}]; // note that we are excluding the stop codon scores! protAlignOptions ["SEQ_ALIGN_GAP_OPEN"] = 40; protAlignOptions ["SEQ_ALIGN_GAP_OPEN2"] = 20; protAlignOptions ["SEQ_ALIGN_GAP_EXTEND"] = 10; protAlignOptions ["SEQ_ALIGN_GAP_EXTEND2"] = 5; protAlignOptions ["SEQ_ALIGN_AFFINE"] = 1; /* true or false, do affine gap cost */ protAlignOptions ["SEQ_ALIGN_NO_TP"] = 0; /* enforce terminal gap penalties - should be ok now that seqs are clipped */ /* ==================================================================== */ /* PHASE 2. Determine reading frames */ /* ==================================================================== */ MIN_FRAGMENT = 3; fprintf (stdout, "\n[PHASE 2] Detecting reading frames for each unprocessed sequence...\n"); aRecord = {}; // for alignment of protein sequences // we may have clipped the reference so now we need to figure out its reading frame translRef = translateToAA (clippedReferenceSequence, (3-clippedRefOffset)%3); inStr = {{translRef,""}}; initialScores = {unalSequenceCount, 1}; for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) { aRecord = toDoSequences[seqCounter]; aSeq = aRecord["CLIPPED"]; // if the raw sequence has a frameshift then all alignments will be garbage bestScore = -99999; // do initial alignment to reference for (offset = 0; offset < 3; offset = offset+1) /* for each codon position */ { translString = translateToAA (aSeq, offset); inStr[1] = translString; AlignSequences (aligned, inStr, protAlignOptions); aligned = aligned[0]; if (aligned[0] > bestScore) { bestSeq = aSeq; bestScore = aligned[0]; bestFrame = offset; bestAl = aligned; } } aRecord["ALIGN_SCORE"] = bestAl[0]; initialScores[seqCounter] = bestAl[0]; aRecord["ALIGNED_REF"] = bestAl[1]; aRecord["ALIGNED_QUERY"] = bestAl[2]; aRecord["FRAME"] = bestFrame; toDoSequences[seqCounter] = aRecord; SetParameter (STATUS_BAR_STATUS_STRING, "Initial protein alignment ("+seqCounter+"/"+unalSequenceCount+" done)",0); } //fprintf (stdout, "Last aligned query:\n", bestAl[2], "\n"); desc_stats = GatherDescriptiveStats (initialScores); fprintf (stdout, "Distribution of initial alignment scores:\n"); fprintf (stdout, "Minimum:\t", desc_stats["Min"], "\n"); fprintf (stdout, "2.5%:\t\t", desc_stats["2.5%"], "\n"); fprintf (stdout, "Median:\t", desc_stats["Median"], "\n"); fprintf (stdout, "97.5%\t\t", desc_stats["97.5%"], "\n"); fprintf (stdout, "Maximum:\t", desc_stats["Max"], "\n\n"); ChoiceList (cutoffOpt, "Filter sequences option:", 1, SKIP_NONE, /* 0 */ "None", "Set cutoff to minimum score", /* 1 */ "Median", "Set cutoff to median initial score", /* 2 */ "All", "Set cutoff above maximum initial score", /* 3 */ "Custom", "User-specified cutoff score"); assert (cutoffOpt >= 0, "Cutoff option"); if (cutoffOpt == 0) { min_score = desc_stats["Min"]; } else { if (cutoffOpt == 1) { min_score = desc_stats["Median"]; } else { if (cutoffOpt == 2) { min_score = desc_stats["Max"]+1; } else { fprintf (stdout, "Enter cutoff alignment score: "); fscanf (stdin, "Number", min_score); } } } /* We are going to assume that a sufficient majority of the sequence is in frame that we can reliably determine direction (reverse-complement) and the reading frame of that majority. Now we need to check for shifts in reading frame induced by indels. */ /* ==================================================================== */ /* PHASE 3. Detect frame shifts */ /* ==================================================================== */ fprintf (stdout, "\n[PHASE 3] Scanning for frame shift indels...\n"); MAX_NUM_FRAMESHIFTS = 3; function sortListByNumericalKey (theList) { _keys = Rows(theList); aVector = {Columns(_keys), 1}; for (__i = 0; __i < Columns(_keys); __i = __i + 1) { ExecuteCommands("aVector["+__i+"] = "+_keys[__i]+";"); } aVector = aVector%0; // ascending sort newList = {}; for (__i = 0; __i < Rows(aVector); __i = __i + 1) { ExecuteCommands("newList[\""+aVector[__i]+"\"]="+theList[aVector[__i]]+";"); } return (newList); } function seqFromSpliceList (spList, frames) { //_sortedKeys = _sortStrings(Rows(spList)); _sortedList = sortListByNumericalKey(spList); _sortedKeys = Rows(_sortedList); _newSeq = ""; lastSite = 0; for (_key = 0; _key < Columns(_sortedKeys); _key = _key + 1) { thisFrame = spList[_sortedKeys[_key]]; ExecuteCommands("_newSeq = _newSeq + (frames[thisFrame])[lastSite]["+_sortedKeys[_key]+"];"); ExecuteCommands("lastSite = "+_sortedKeys[_key]+"+1;"); } return (_newSeq); } for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) { aRecord = toDoSequences[seqCounter]; bestScore = aRecord["ALIGN_SCORE"]; if (bestScore >= min_score) { // just write the clipped nuc sequence over to the next key aRecord["ALIGNED_NUC"] = aRecord["CLIPPED"]; toDoSequences[seqCounter] = aRecord; continue; } bestFrame = aRecord["FRAME"]; // bring all sequences into our best guess of frame zero theSeq = aRecord["CLIPPED"]; // already reverse-complemented if necessary // translations of sequence in all 3 frames frames = {}; aalen = 0; for (_frame = 0; _frame < 3; _frame = _frame + 1) { frames[_frame] = translateToAA (theSeq, _frame); if (Abs(frames[_frame]) > aalen) { aalen = Abs(frames[_frame]); } } // make up for truncated translations for (_frame = 0; _frame < 3; _frame = _frame + 1) { if (Abs(frames[_frame]) < aalen) { diff = aalen - Abs(frames[_frame]); for (_d = 0; _d < diff; _d = _d + 1) { frames[_frame] = frames[_frame] + "?"; } } } // length of translated sequence aalen = Abs(frames[0]); /* Keep track of frame shifts in associative list Key: right bound of interval (inclusive) Value: reading frame up to right bound doing it this way to make it easier to construct a sequence */ _shifts = {}; _shifts[aalen] = bestFrame; _tryFS = 0; // the number of times we have looked for a frameshift while (_tryFS < MAX_NUM_FRAMESHIFTS) { // sort the keys in the associative list of splice sites if (Columns(Rows(_shifts)) > 1) { sortedKeys = _sortStrings(Rows(_shifts)); newList = {}; for (_key = 0; _key < Rows(sortedKeys); _key = _key + 1) { newList[sortedKeys[_key]] = _shifts[sortedKeys[_key]]; } _shifts = newList; } bestSpliceScore = bestScore; // iterate through intervals and try to split them leftBound = 0; for (_i = 0; _i < Columns(Rows(_shifts)); _i = _i + 1) { ExecuteCommands("rightBound = "+(Rows(_shifts))[_i]+";"); currentFrame = _shifts[rightBound]; if ( (rightBound - leftBound) < 2 * MIN_FRAGMENT) { // current interval cannot be split down any further continue; } //fprintf (stdout, "\ncheck interval ", _i, " (", leftBound, ", ", rightBound, ") in frame ", currentFrame, "\n"); for (_site = leftBound + MIN_FRAGMENT; _site < rightBound - MIN_FRAGMENT; _site = _site + 1) { // retain interval's original frame to the left of new splice? for (retainLeft = 0; retainLeft < 2; retainLeft = retainLeft + 1) { for (_step = 1; _step < 3; _step = _step + 1) { // reset to original configuration of intervals _temp_shifts = _shifts; // a way of cycling through the alterate reading frames newFrame = (currentFrame + _step) % 3; if (retainLeft) { _temp_shifts[_site] = currentFrame; _temp_shifts[rightBound] = newFrame; } else { _temp_shifts[_site] = newFrame; _temp_shifts[rightBound] = currentFrame; } newSeq = seqFromSpliceList (_temp_shifts, frames); // re-use container that has translated reference in [0] inStr[1] = newSeq; AlignSequences (aligned, inStr, protAlignOptions); aligned = aligned[0]; //fprintf (stdout, _site, "\t", retainLeft, "\t", newFrame, "\t", newSeq, "\t", aligned[0], "\n"); if (aligned[0] > bestSpliceScore) { bestSpliceScore = aligned[0]; bestSplice = _temp_shifts; } } } //SetParameter (STATUS_BAR_STATUS_STRING, "Tested splice site ("+_site+"/"+(rightBound-leftBound-2*MIN_FRAGMENT)+" done)",0); } // prepare to go to next interval leftBound = rightBound + 1; } if (bestSpliceScore <= bestScore) { // no improvement in alignment by further splicing break; } // else, update splice list and repeat bestScore = bestSpliceScore; _shifts = bestSplice; _tryFS = _tryFS + 1; } if (_tryFS > 0) { fprintf (stdout, "*** Detected ", _tryFS," frameshift(s) in sequence ", aRecord["SEQUENCE_ID"], "\n"); if (_tryFS == MAX_NUM_FRAMESHIFTS) { fprintf (stdout, " WARNING: this is the maximum number of frameshifts screened for!\n"); } } if (Columns(Rows(_shifts)) > 0) { // sort keys one more time _shifts = sortListByNumericalKey(_shifts); // use splice list to update nucleotide sequence newSeq = ""; newSeq * (2*Abs(theSeq)); leftBound = 0; for (_in = 0; _in < Columns(Rows(_shifts)); _in = _in + 1) { __key = (Rows(_shifts))[_in]; ExecuteCommands("rightBound = "+__key+";"); if (_in == 0) { // leftmost interval always relative to frame 0 frameshift = _shifts[__key]; } else { frameshift = _shifts[__key] - _shifts[(Rows(_shifts))[_in-1]]; } this_frame = (3-frameshift)%3; for (_fs = 0; _fs < this_frame; _fs = _fs + 1) { newSeq * "-"; } newSeq * (theSeq[3*leftBound][3*rightBound+2]); leftBound = rightBound + 1; } newSeq * 0; // regenerate translation to insert ?'s where we have placed frameshifts translString = translateToAA(newSeq, 0); inStr[1] = translString; AlignSequences(aligned, inStr, protAlignOptions); aligned = aligned[0]; aRecord["ALIGNED_QUERY"] = aligned[2]; aRecord["ALIGNED_REF"] = aligned[1]; aRecord["ALIGN_SCORE"] = aligned[0]; aRecord["NUM_FRAME_SHIFTS"] = _tryFS; aRecord["ALIGNED_NUC"] = newSeq; aRecord["FRAME"] = bestSplice; toDoSequences[seqCounter] = aRecord; } SetParameter (STATUS_BAR_STATUS_STRING, "Splicing sequences ("+seqCounter+"/"+unalSequenceCount+" done)",0); } /* ==================================================================== */ /* PHASE 4. Apply splicing to aligned sequences */ /* ==================================================================== */ ChoiceList (outputMode, "Output format", 1, SKIP_NONE, "FASTA", "Write pairwise-aligned codon sequences into FASTA file.", "MSA", "Write Multiple Sequence Alignment to FASTA file. UNDER DEVELOPMENT.", "CSV", "Write lots of information into comma-separated file."); assert (outputMode>=0, "Output format"); fprintf (stdout, "\n[PHASE 4] Bringing sequences into reading frame and trimming to aligned region...\n"); /* Pad nucleotide sequence with gaps in aligned protein query Re-align incomplete codons (containing frameshifts) in nucleotide space ********************************************************** *** THIS IS BETTER DONE AGAINST CONSENSUS NUC SEQUENCE *** ********************************************************** Make this an addition to emeline V2.3 Use this loop to compute nucleotide consensus then loop through sequences stopping only at those annotated with >0 frameshifts to realign incomplete codons Track gaps in reference for merging alignment */ refLength = Abs(inStr[0]); paddedLength = refLength; refInsertions = {refLength, 1}; // need to modify this setting for aligning within codons nucAlignOptions ["SEQ_ALIGN_GAP_OPEN2"] = 5; nucAlignOptions ["SEQ_ALIGN_NO_TP"] = 0; for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter+1) { aRecord = toDoSequences[seqCounter]; theSeq = aRecord["ALIGNED_NUC"]; bestFrame = aRecord["FRAME"]; _aligned_ref = aRecord["ALIGNED_REF"]; _aligned_query = aRecord["ALIGNED_QUERY"]; if (Columns(Rows(bestFrame)) == 0) { // this record was not processed by splicing offset = bestFrame; } else { offset = 0; } newSeq = ""; newSeq * (Abs(theSeq)+3); // find boundaries of aligned region bounds1 = getBoundaries(_aligned_ref); bounds2 = getBoundaries(_aligned_query); left_bound = Max(bounds1[0], bounds2[0]); right_bound = Min(bounds1[1], bounds2[1]); codon_pos = 3*bounds1[0] + offset; // loop over residues in aligned region for (_aa = left_bound; _aa < right_bound; _aa = _aa+1) { theCodon = theSeq[codon_pos][codon_pos+2]; //fprintf (stdout, codon_pos, "\t", theCodon, "\t", _aligned_query[_aa], "\t", masterReferenceSequence[3*_aa][3*_aa+2], "\n"); // insertion in query relative to reference if (_aligned_query[_aa] == "-") { newSeq * "---"; } else { if (_aligned_query[_aa] == "?") { // does the codon contain a gap? match = theCodon$"-"; if (match[0] > (-1)) { // how many gaps? gapCount = 0; for (_pos = 0; _pos < 3; _pos = _pos + 1) { if (theCodon[_pos] == "-") { gapCount = gapCount + 1; } } if (gapCount == 1) { // align incomplete codon to reference in nucleotide space refCodon = masterReferenceSequence[3*(_aa-left_bound)][3*(_aa-left_bound)+2]; inStr[0] = refCodon; inStr[1] = theCodon^{{"-"}{""}}; AlignSequences (aligned, inStr, nucAlignOptions); aligned = aligned[0]; if (Abs(aligned[2]) == 3) { newSeq * aligned[2]; } else { // alignment failed, no longer a codon newSeq * theCodon; } } // else do not append single nucleotide insertion codon_pos = codon_pos + 3; } else { // codon contains mixtures newSeq * theCodon; codon_pos = codon_pos + 3; } } else { newSeq * theCodon; codon_pos = codon_pos + 3; } } } newSeq * 0; // track insertions in query relative to reference // copied code from SeqAlignShared.ibf refInsert = _aligned_ref||"-+"; if (refInsert[0]>0 && outputMode == 1) { insCount = Rows (refInsert)/2; // how many gaps in aligned ref AA? offset = 0; for (insN = 0; insN < insCount; insN = insN + 1) { insPos = refInsert[insN*2]; insLength = refInsert[insN*2+1]-insPos+1; insPos = insPos-offset; if (refInsertions[insPos] < insLength) { refInsertions[insPos] = insLength; } offset = offset + insLength; } } // refresh the sequence record aRecord["ALIGNED_CODON"] = newSeq; aRecord["LEFT_BOUND"] = left_bound; aRecord["RIGHT_BOUND"] = right_bound; toDoSequences[seqCounter] = aRecord; } if (outputMode == 1) { for (_ri = 0; _ri < refLength; _ri = _ri + 1) { paddedLength = paddedLength + refInsertions[_ri]; } } /* ==================================================================== */ /* PHASE 5. Output results to file */ /* ==================================================================== */ SetDialogPrompt ("Please select a file to output aligned codon sequences: "); fprintf (PROMPT_FOR_FILE, CLEAR_FILE, KEEP_OPEN); // FASTA if (outputMode == 0) { for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter + 1) { aRecord = toDoSequences[seqCounter]; fprintf(LAST_FILE_PATH, ">", aRecord["SEQUENCE_ID"], "\n", aRecord["ALIGNED_CODON"], "\n"); } } else { // MSA if (outputMode == 1) { /* wait till v2.3 to do this */ // generate padded reference sequence refLength = Abs(inStr[0]); fullRefSeq = ""; fullRefSeq * (3*paddedLength); // allocate lots of space for insertions fullRefSeq * (masterReferenceSequence[0][2]); for (_rl = 1; _rl < refLength; _rl = _rl + 1) { gapCount = refInsertions[_rl]; for (_gc=0; _gc < gapCount; _gc = _gc+1) { fullRefSeq * ("---"); } fullRefSeq * (masterReferenceSequence[3*_rl][3*_rl+2]); } for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter + 1) { aRecord = toDoSequences[seqCounter]; seqid = aRecord["SEQUENCE_ID"]; seq = aRecord["ALIGNED_CODON"]; left = aRecord["LEFT_BOUND"]; right = aRecord["RIGHT_BOUND"]; gappedSeq = ""; gappedSeq * (Abs(fullRefSeq)); gappedSeq * 0; fprintf(LAST_FILE_PATH, ">", aRecord["SEQUENCE_ID"], "\n", aRecord["ALIGNED_CODON"], "\n"); } // a padded alignment might not be good enough - should we generate a guide tree // and do a proper MSA? } // CSV else { for (seqCounter = 0; seqCounter < unalSequenceCount; seqCounter = seqCounter + 1) { aRecord = toDoSequences[seqCounter]; fprintf (LAST_FILE_PATH, aRecord["SEQUENCE_ID"], ",", aRecord["RAW"], ","); fprintf (LAST_FILE_PATH, aRecord["ALIGN_SCORE"], ",", aRecord["NUM_FRAME_SHIFTS"], ","); fprintf (LAST_FILE_PATH, aRecord["ALIGNED_REF"], ",", aRecord["ALIGNED_QUERY"], ","); fprintf (LAST_FILE_PATH, aRecord["ALIGNED_CODON"], "\n"); } } } fprintf (LAST_FILE_PATH, CLOSE_FILE);