00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00023 #include "log.h"
00024 #include "format.h"
00025 #include "common.h"
00026 #include "seq.h"
00027 #include "numUtil.h"
00028
00029
00030 #define MASKED_BASE_BIT 8
00031
00032
00033 struct codonTable
00034 {
00035 DNA *codon;
00036 AA protCode;
00037 AA mitoCode;
00038 };
00039
00040
00041 struct codonTable codonTable[] =
00042 {
00043 {"ttt", 'F', 'F',},
00044 {"ttc", 'F', 'F',},
00045 {"tta", 'L', 'L',},
00046 {"ttg", 'L', 'L',},
00047
00048 {"tct", 'S', 'S',},
00049 {"tcc", 'S', 'S',},
00050 {"tca", 'S', 'S',},
00051 {"tcg", 'S', 'S',},
00052
00053 {"tat", 'Y', 'Y',},
00054 {"tac", 'Y', 'Y',},
00055 {"taa", 0, 0,},
00056 {"tag", 0, 0,},
00057
00058 {"tgt", 'C', 'C',},
00059 {"tgc", 'C', 'C',},
00060 {"tga", 0, 'W',},
00061 {"tgg", 'W', 'W',},
00062
00063 {"ctt", 'L', 'L',},
00064 {"ctc", 'L', 'L',},
00065 {"cta", 'L', 'L',},
00066 {"ctg", 'L', 'L',},
00067
00068 {"cct", 'P', 'P',},
00069 {"ccc", 'P', 'P',},
00070 {"cca", 'P', 'P',},
00071 {"ccg", 'P', 'P',},
00072
00073 {"cat", 'H', 'H',},
00074 {"cac", 'H', 'H',},
00075 {"caa", 'Q', 'Q',},
00076 {"cag", 'Q', 'Q',},
00077
00078 {"cgt", 'R', 'R',},
00079 {"cgc", 'R', 'R',},
00080 {"cga", 'R', 'R',},
00081 {"cgg", 'R', 'R',},
00082
00083 {"att", 'I', 'I',},
00084 {"atc", 'I', 'I',},
00085 {"ata", 'I', 'M',},
00086 {"atg", 'M', 'M',},
00087
00088 {"act", 'T', 'T',},
00089 {"acc", 'T', 'T',},
00090 {"aca", 'T', 'T',},
00091 {"acg", 'T', 'T',},
00092
00093 {"aat", 'N', 'N',},
00094 {"aac", 'N', 'N',},
00095 {"aaa", 'K', 'K',},
00096 {"aag", 'K', 'K',},
00097
00098 {"agt", 'S', 'S',},
00099 {"agc", 'S', 'S',},
00100 {"aga", 'R', 0,},
00101 {"agg", 'R', 0,},
00102
00103 {"gtt", 'V', 'V',},
00104 {"gtc", 'V', 'V',},
00105 {"gta", 'V', 'V',},
00106 {"gtg", 'V', 'V',},
00107
00108 {"gct", 'A', 'A',},
00109 {"gcc", 'A', 'A',},
00110 {"gca", 'A', 'A',},
00111 {"gcg", 'A', 'A',},
00112
00113 {"gat", 'D', 'D',},
00114 {"gac", 'D', 'D',},
00115 {"gaa", 'E', 'E',},
00116 {"gag", 'E', 'E',},
00117
00118 {"ggt", 'G', 'G',},
00119 {"ggc", 'G', 'G',},
00120 {"gga", 'G', 'G',},
00121 {"ggg", 'G', 'G',},
00122 };
00123
00124
00125
00126
00127
00128
00129
00130 int ntVal[256];
00131 int ntValLower[256];
00132 int ntValUpper[256];
00133 int ntVal5[256];
00134 int ntValNoN[256];
00135 DNA valToNt[(N_BASE_VAL|MASKED_BASE_BIT)+1];
00136
00137
00138 int ntValMasked[256];
00139 DNA valToNtMasked[256];
00140
00141
00142 static int inittedNtVal = 0;
00143
00144 static void initNtVal()
00145 {
00146 if (!inittedNtVal)
00147 {
00148 int i;
00149 for (i=0; i<NUMELE(ntVal); i++)
00150 {
00151 ntValUpper[i] = ntValLower[i] = ntVal[i] = -1;
00152 ntValNoN[i] = T_BASE_VAL;
00153 if (isspace(i) || isdigit(i))
00154 ntVal5[i] = ntValMasked[i] = -1;
00155 else
00156 {
00157 ntVal5[i] = N_BASE_VAL;
00158 ntValMasked[i] = (islower(i) ? (N_BASE_VAL|MASKED_BASE_BIT) : N_BASE_VAL);
00159 }
00160 }
00161 ntVal5['t'] = ntVal5['T'] = ntValNoN['t'] = ntValNoN['T'] = ntVal['t'] = ntVal['T'] =
00162 ntValLower['t'] = ntValUpper['T'] = T_BASE_VAL;
00163 ntVal5['u'] = ntVal5['U'] = ntValNoN['u'] = ntValNoN['U'] = ntVal['u'] = ntVal['U'] =
00164 ntValLower['u'] = ntValUpper['U'] = U_BASE_VAL;
00165 ntVal5['c'] = ntVal5['C'] = ntValNoN['c'] = ntValNoN['C'] = ntVal['c'] = ntVal['C'] =
00166 ntValLower['c'] = ntValUpper['C'] = C_BASE_VAL;
00167 ntVal5['a'] = ntVal5['A'] = ntValNoN['a'] = ntValNoN['A'] = ntVal['a'] = ntVal['A'] =
00168 ntValLower['a'] = ntValUpper['A'] = A_BASE_VAL;
00169 ntVal5['g'] = ntVal5['G'] = ntValNoN['g'] = ntValNoN['G'] = ntVal['g'] = ntVal['G'] =
00170 ntValLower['g'] = ntValUpper['G'] = G_BASE_VAL;
00171
00172 valToNt[T_BASE_VAL] = valToNt[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00173 valToNt[C_BASE_VAL] = valToNt[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00174 valToNt[A_BASE_VAL] = valToNt[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00175 valToNt[G_BASE_VAL] = valToNt[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00176 valToNt[N_BASE_VAL] = valToNt[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00177
00178
00179 ntValMasked['T'] = T_BASE_VAL;
00180 ntValMasked['U'] = U_BASE_VAL;
00181 ntValMasked['C'] = C_BASE_VAL;
00182 ntValMasked['A'] = A_BASE_VAL;
00183 ntValMasked['G'] = G_BASE_VAL;
00184
00185 ntValMasked['t'] = T_BASE_VAL|MASKED_BASE_BIT;
00186 ntValMasked['u'] = U_BASE_VAL|MASKED_BASE_BIT;
00187 ntValMasked['c'] = C_BASE_VAL|MASKED_BASE_BIT;
00188 ntValMasked['a'] = A_BASE_VAL|MASKED_BASE_BIT;
00189 ntValMasked['g'] = G_BASE_VAL|MASKED_BASE_BIT;
00190
00191 valToNtMasked[T_BASE_VAL] = 'T';
00192 valToNtMasked[C_BASE_VAL] = 'C';
00193 valToNtMasked[A_BASE_VAL] = 'A';
00194 valToNtMasked[G_BASE_VAL] = 'G';
00195 valToNtMasked[N_BASE_VAL] = 'N';
00196
00197 valToNtMasked[T_BASE_VAL|MASKED_BASE_BIT] = 't';
00198 valToNtMasked[C_BASE_VAL|MASKED_BASE_BIT] = 'c';
00199 valToNtMasked[A_BASE_VAL|MASKED_BASE_BIT] = 'a';
00200 valToNtMasked[G_BASE_VAL|MASKED_BASE_BIT] = 'g';
00201 valToNtMasked[N_BASE_VAL|MASKED_BASE_BIT] = 'n';
00202
00203 inittedNtVal = 1;
00204 }
00205 }
00206
00207
00208
00212 AA seq_lookupCodon(DNA *dna)
00213 {
00214 int ix;
00215 int i;
00216 char c;
00217
00218 if (!inittedNtVal)
00219 initNtVal();
00220 ix = 0;
00221 for (i=0; i<3; ++i)
00222 {
00223 int bv = ntVal[(int)dna[i]];
00224 if (bv<0)
00225 return 'X';
00226 ix = (ix<<2) + bv;
00227 }
00228 c = codonTable[ix].protCode;
00229 c = toupper(c);
00230 return c;
00231 }
00232
00233
00234
00238 AA seq_lookupMitochondrialCodon(DNA *dna)
00239 {
00240 int ix;
00241 int i;
00242 char c;
00243
00244 if (!inittedNtVal)
00245 initNtVal();
00246 ix = 0;
00247 for (i=0; i<3; ++i)
00248 {
00249 int bv = ntVal[(int)dna[i]];
00250 if (bv<0)
00251 return 'X';
00252 ix = (ix<<2) + bv;
00253 }
00254 c = codonTable[ix].mitoCode;
00255 c = toupper(c);
00256 return c;
00257 }
00258
00259
00260
00264 Codon seq_codonVal(DNA *start)
00265 {
00266 int v1,v2,v3;
00267
00268 if ((v1 = ntVal[(int)start[0]]) < 0)
00269 return -1;
00270 if ((v2 = ntVal[(int)start[1]]) < 0)
00271 return -1;
00272 if ((v3 = ntVal[(int)start[2]]) < 0)
00273 return -1;
00274 return ((v1<<4) + (v2<<2) + v3);
00275 }
00276
00277
00278
00282 DNA* seq_valToCodon(int val)
00283 {
00284 assert(val >= 0 && val < 64);
00285 return codonTable[val].codon;
00286 }
00287
00288
00289
00290 char* seq_dnaTranslate (DNA *dna, int terminateAtStopCodon)
00291 {
00292 static Stringa buffer = NULL;
00293 int i;
00294 int dnaSize;
00295 char aa;
00296
00297 stringCreateClear (buffer,500);
00298 dnaSize = strlen(dna);
00299 for (i=0; i<dnaSize-2; i+=3) {
00300 aa = seq_lookupCodon(dna+i);
00301 if (aa == 0) {
00302 if (terminateAtStopCodon) {
00303 break;
00304 }
00305 aa = '*';
00306 }
00307 stringCatChar (buffer,aa);
00308 }
00309 return string (buffer);
00310 }
00311
00312
00313
00314
00315 char ntChars[256];
00316
00317 static void initNtChars()
00318 {
00319 static int initted = 0;
00320
00321 if (!initted)
00322 {
00323 zeroBytes(ntChars, sizeof(ntChars));
00324 ntChars['a'] = ntChars['A'] = 'a';
00325 ntChars['c'] = ntChars['C'] = 'c';
00326 ntChars['g'] = ntChars['G'] = 'g';
00327 ntChars['t'] = ntChars['T'] = 't';
00328 ntChars['n'] = ntChars['N'] = 'n';
00329 ntChars['u'] = ntChars['U'] = 'u';
00330 ntChars['-'] = 'n';
00331 initted = 1;
00332 }
00333 }
00334
00335 char ntMixedCaseChars[256];
00336
00337 static void initNtMixedCaseChars()
00338 {
00339 static int initted = 0;
00340
00341 if (!initted)
00342 {
00343 zeroBytes(ntMixedCaseChars, sizeof(ntMixedCaseChars));
00344 ntMixedCaseChars['a'] = 'a';
00345 ntMixedCaseChars['A'] = 'A';
00346 ntMixedCaseChars['c'] = 'c';
00347 ntMixedCaseChars['C'] = 'C';
00348 ntMixedCaseChars['g'] = 'g';
00349 ntMixedCaseChars['G'] = 'G';
00350 ntMixedCaseChars['t'] = 't';
00351 ntMixedCaseChars['T'] = 'T';
00352 ntMixedCaseChars['n'] = 'n';
00353 ntMixedCaseChars['N'] = 'N';
00354 ntMixedCaseChars['u'] = 'u';
00355 ntMixedCaseChars['U'] = 'U';
00356 ntMixedCaseChars['-'] = 'n';
00357 initted = 1;
00358 }
00359 }
00360
00361
00362
00363 DNA ntCompTable[256];
00364 static int inittedCompTable = 0;
00365
00366 static void initNtCompTable()
00367 {
00368 zeroBytes(ntCompTable, sizeof(ntCompTable));
00369 ntCompTable[' '] = ' ';
00370 ntCompTable['-'] = '-';
00371 ntCompTable['='] = '=';
00372 ntCompTable['a'] = 't';
00373 ntCompTable['c'] = 'g';
00374 ntCompTable['g'] = 'c';
00375 ntCompTable['t'] = 'a';
00376 ntCompTable['u'] = 'a';
00377 ntCompTable['n'] = 'n';
00378 ntCompTable['-'] = '-';
00379 ntCompTable['.'] = '.';
00380 ntCompTable['A'] = 'T';
00381 ntCompTable['C'] = 'G';
00382 ntCompTable['G'] = 'C';
00383 ntCompTable['T'] = 'A';
00384 ntCompTable['U'] = 'A';
00385 ntCompTable['N'] = 'N';
00386 ntCompTable['R'] = 'Y';
00387 ntCompTable['Y'] = 'R';
00388 ntCompTable['M'] = 'K';
00389 ntCompTable['K'] = 'M';
00390 ntCompTable['S'] = 'S';
00391 ntCompTable['W'] = 'W';
00392 ntCompTable['V'] = 'B';
00393 ntCompTable['H'] = 'D';
00394 ntCompTable['D'] = 'H';
00395 ntCompTable['B'] = 'V';
00396 ntCompTable['X'] = 'N';
00397 ntCompTable['r'] = 'y';
00398 ntCompTable['y'] = 'r';
00399 ntCompTable['s'] = 's';
00400 ntCompTable['w'] = 'w';
00401 ntCompTable['m'] = 'k';
00402 ntCompTable['k'] = 'm';
00403 ntCompTable['v'] = 'b';
00404 ntCompTable['h'] = 'd';
00405 ntCompTable['d'] = 'h';
00406 ntCompTable['b'] = 'v';
00407 ntCompTable['x'] = 'n';
00408 ntCompTable['('] = ')';
00409 ntCompTable[')'] = '(';
00410 inittedCompTable = 1;
00411 }
00412
00413
00414
00418 void seq_complement(DNA *dna, long length)
00419 {
00420 int i;
00421
00422 if (!inittedCompTable)
00423 initNtCompTable();
00424 for (i=0; i<length; ++i)
00425 {
00426 *dna = ntCompTable[(int)*dna];
00427 ++dna;
00428 }
00429 }
00430
00431
00432
00436 void seq_reverseComplement(DNA *dna, long length)
00437 {
00438 reverseBytes(dna, length);
00439 seq_complement(dna, length);
00440 }
00441
00442
00443
00447 int seq_seqIsLower(Seq *seq)
00448 {
00449 int size = seq->size, i;
00450 char *poly = seq->sequence;
00451 for (i=0; i<size; ++i)
00452 if (!islower(poly[i]))
00453 return 0;
00454 return 1;
00455 }
00456
00457
00458
00463 aaSeq* seq_translateSeqN(dnaSeq *inSeq, unsigned offset, unsigned inSize, int stop)
00464 {
00465 aaSeq *seq;
00466 DNA *dna = inSeq->sequence;
00467 AA *pep, aa;
00468 int i, lastCodon;
00469 int actualSize = 0;
00470
00471 assert(offset <= inSeq->size);
00472 if ((inSize == 0) || (inSize > (inSeq->size - offset)))
00473 inSize = inSeq->size - offset;
00474 lastCodon = offset + inSize - 3;
00475
00476 AllocVar(seq);
00477 seq->sequence = pep = needLargeMem(inSize/3+1);
00478 for (i=offset; i <= lastCodon; i += 3)
00479 {
00480 aa = seq_lookupCodon(dna+i);
00481 if (aa == 0)
00482 {
00483 if (stop)
00484 break;
00485 else
00486 aa = 'Z';
00487 }
00488 *pep++ = aa;
00489 ++actualSize;
00490 }
00491 *pep = 0;
00492 assert(actualSize <= inSize/3+1);
00493 seq->size = actualSize;
00494 seq->name = hlr_strdup(inSeq->name);
00495 return seq;
00496 }
00497
00498
00499
00505 aaSeq* seq_translateSeq(dnaSeq *inSeq, unsigned offset, int stop)
00506 {
00507 return seq_translateSeqN(inSeq, offset, 0, stop);
00508 }
00509
00510
00511
00515 Bits* seq_maskFromUpperCaseSeq(Seq *seq)
00516 {
00517 int size = seq->size, i;
00518 char *poly = seq->sequence;
00519 Bits *b = bitAlloc(size);
00520 for (i=0; i<size; ++i)
00521 {
00522 if (isupper(poly[i]))
00523 bitSetOne(b, i);
00524 }
00525 return b;
00526 }
00527
00528
00529
00533 void seq_toRna(DNA *dna)
00534 {
00535 DNA c;
00536 for (;;)
00537 {
00538 c = *dna;
00539 if (c == 't')
00540 *dna = 'u';
00541 else if (c == 'T')
00542 *dna = 'U';
00543 else if (c == 0)
00544 break;
00545 ++dna;
00546 }
00547 }
00548
00549
00550
00551
00552 static void dnaOrAaFilter(char *in, char *out, char filter[256])
00553 {
00554 char c;
00555 seq_init();
00556 while ((c = *in++) != 0)
00557 {
00558 if ((c = filter[(int)c]) != 0) *out++ = c;
00559 }
00560 *out++ = 0;
00561 }
00562
00563
00564
00568 void seq_dnaFilter(char *in, DNA *out)
00569 {
00570 dnaOrAaFilter(in, out, ntChars);
00571 }
00572
00573
00574
00578 void seq_dnaMixedCaseFilter(char *in, DNA *out)
00579 {
00580 dnaOrAaFilter(in, out, ntMixedCaseChars);
00581 }
00582
00583
00584
00588 void seq_aaFilter(char *in, DNA *out)
00589 {
00590 dnaOrAaFilter(in, out, aaChars);
00591 }
00592
00593
00594
00598 void seq_dnaBaseHistogram(DNA *dna, int dnaSize, int histogram[4])
00599 {
00600 int val;
00601 zeroBytes(histogram, 4*sizeof(int));
00602 while (--dnaSize >= 0)
00603 {
00604 if ((val = ntVal[(int)*dna++]) >= 0)
00605 ++histogram[val];
00606 }
00607 }
00608
00609
00610
00615 int seq_intronOrientation(DNA *iStart, DNA *iEnd)
00616 {
00617 if (iEnd - iStart < 32)
00618 return 0;
00619 if (iStart[0] == 'g' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'g')
00620 {
00621 return 1;
00622 }
00623 else if (iStart[0] == 'c' && iStart[1] == 't' && iEnd[-2] == 'a' && iEnd[-1] == 'c')
00624 {
00625 return -1;
00626 }
00627 else
00628 return 0;
00629 }
00630
00631
00632
00636 int seq_dnaOrAaScoreMatch(char *a, char *b, int size, int matchScore, int mismatchScore, char ignore)
00637 {
00638 int i;
00639 int score = 0;
00640 for (i=0; i<size; ++i)
00641 {
00642 char aa = a[i];
00643 char bb = b[i];
00644 if (aa == ignore || bb == ignore)
00645 continue;
00646 if (aa == bb)
00647 score += matchScore;
00648 else
00649 score += mismatchScore;
00650 }
00651 return score;
00652 }
00653
00654
00655
00656
00657 int aaVal[256];
00658 AA valToAa[20];
00659 AA aaChars[256];
00660
00661
00662
00663 struct aminoAcidTable
00664 {
00665 int ix;
00666 char letter;
00667 char abbreviation[3];
00668 char *name;
00669 };
00670
00671 struct aminoAcidTable aminoAcidTable[] =
00672 {
00673 {0, 'A', "ala", "alanine"},
00674 {1, 'C', "cys", "cysteine"},
00675 {2, 'D', "asp", "aspartic acid"},
00676 {3, 'E', "glu", "glutamic acid"},
00677 {4, 'F', "phe", "phenylalanine"},
00678 {5, 'G', "gly", "glycine"},
00679 {6, 'H', "his", "histidine"},
00680 {7, 'I', "ile", "isoleucine"},
00681 {8, 'K', "lys", "lysine"},
00682 {9, 'L', "leu", "leucine"},
00683 {10, 'M', "met", "methionine"},
00684 {11, 'N', "asn", "asparagine"},
00685 {12, 'P', "pro", "proline"},
00686 {13, 'Q', "gln", "glutamine"},
00687 {14, 'R', "arg", "arginine"},
00688 {15, 'S', "ser", "serine"},
00689 {16, 'T', "thr", "threonine"},
00690 {17, 'V', "val", "valine"},
00691 {18, 'W', "try", "tryptophan"},
00692 {19, 'Y', "tyr", "tyrosine"},
00693 };
00694
00695
00696 static void initAaVal()
00697
00698 {
00699 int i;
00700 char c, lowc;
00701
00702 for (i=0; i<NUMELE(aaVal); ++i)
00703 aaVal[i] = -1;
00704 for (i=0; i<NUMELE(aminoAcidTable); ++i)
00705 {
00706 c = aminoAcidTable[i].letter;
00707 lowc = tolower(c);
00708 aaVal[(int)c] = aaVal[(int)lowc] = i;
00709 aaChars[(int)c] = aaChars[(int)lowc] = c;
00710 valToAa[i] = c;
00711 }
00712 aaChars['x'] = aaChars['X'] = 'X';
00713 }
00714
00715
00716
00720 void seq_init()
00721 {
00722 static int opened = 0;
00723 if (!opened)
00724 {
00725 initNtVal();
00726 initAaVal();
00727 initNtChars();
00728 initNtMixedCaseChars();
00729 initNtCompTable();
00730 opened = 1;
00731 }
00732 }
00733