1 /** 2 Some additional string functions. 3 4 Copyright: © 2019 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dalicious..string; 10 11 import std.algorithm : 12 countUntil, 13 equal, 14 joiner, 15 min, 16 map; 17 import std.array : 18 appender, 19 array, 20 minimallyInitializedArray; 21 import std.conv : to; 22 import std.exception : basicExceptionCtors, enforce; 23 import std.functional : binaryFun; 24 import std.math : 25 ceil, 26 floor, 27 isInfinity, 28 isNaN, 29 round, 30 sgn, 31 sqrt; 32 import std.range : 33 chain, 34 chunks, 35 cycle, 36 only, 37 take, 38 zip; 39 import std.range.primitives : hasSlicing; 40 import std.string : lineSplitter, tr; 41 import std.traits : 42 isFloatingPoint, 43 isSomeString; 44 import std.typecons : 45 Flag, 46 No, 47 tuple, 48 Yes; 49 import transforms : 50 snakeCase, 51 snakeCaseCT; 52 53 54 /// Convert a `camelCase` string to `dash-case`. 55 string dashCase(string camelCase) 56 { 57 return camelCase.snakeCase.tr("_", "-"); 58 } 59 60 /// ditto 61 enum dashCaseCT(string camelCase) = camelCase.snakeCaseCT.tr("_", "-"); 62 63 /** 64 Adds one level of indentation for a multi-line string. Adds indentSize 65 spaces to each non-empty line. 66 67 Returns: indented string 68 */ 69 S indent(S)(S str, in size_t indentSize = 4) if (isSomeString!S) 70 { 71 enum lineSep = "\n"; 72 alias indentLine = (line) => chain(" ".cycle.take(line.length == 0 ? 0 : indentSize), line); 73 74 return str[] 75 .lineSplitter 76 .map!indentLine 77 .joiner(lineSep) 78 .chain(str[$ - lineSep.length .. $] == lineSep ? lineSep : "") 79 .array 80 .to!S; 81 } 82 83 /// 84 unittest 85 { 86 assert("a\nb".indent == " a\n b"); 87 assert("a\nb\n\n".indent == " a\n b\n\n"); 88 assert("a\nb\n".indent(2) == " a\n b\n"); 89 } 90 91 class AlignmentException : Exception 92 { 93 /// 94 mixin basicExceptionCtors; 95 } 96 97 /// One edit operation of the Needleman-Wunsch algorithm. 98 enum EditOp: byte 99 { 100 substitution, 101 deletetion, 102 insertion, 103 } 104 105 alias score_t = uint; 106 107 static enum Strip : byte 108 { 109 none = 0b00, 110 back = 0b01, 111 front = 0b10, 112 both = 0b11, 113 } 114 115 /// Represents an alignment of two sequences. 116 struct SequenceAlignment(S, alias scoreFun = "a == b ? 0 : 1") 117 { 118 alias getScore = binaryFun!scoreFun; 119 120 score_t score; 121 EditOp[] editPath; 122 S reference; 123 S query; 124 score_t indelPenalty; 125 Flag!"freeShift" freeShift; 126 127 /// Compute alignment score. 128 score_t computeScore() const pure 129 { 130 auto walkResult = walkEditOps(); 131 enforce!AlignmentException( 132 walkResult.i == reference.length && walkResult.j == query.length, 133 "invalid alignment", 134 ); 135 136 return walkResult.computedScore; 137 } 138 139 bool isValid() const pure nothrow 140 { 141 auto walkResult = walkEditOps(); 142 143 return isValid(walkResult); 144 } 145 146 private bool isValid(in WalkResult walkResult) const pure nothrow 147 { 148 return walkResult.i == reference.length && 149 walkResult.j == query.length && 150 walkResult.computedScore == score; 151 } 152 153 private static struct WalkResult 154 { 155 score_t computedScore; 156 size_t i, j; 157 } 158 159 private auto walkEditOps() const pure nothrow 160 { 161 WalkResult result; 162 foreach (k, editOp; editPath) 163 { 164 if (result.i > reference.length || result.j > query.length) 165 return result; 166 167 final switch (editOp) 168 { 169 case EditOp.substitution: 170 if (result.i == reference.length || result.j == query.length) 171 return result; 172 result.computedScore += getScore(reference[result.i], query[result.j]); 173 ++result.i; 174 ++result.j; 175 break; 176 case EditOp.deletetion: 177 if (result.i == reference.length) 178 return result; 179 result.computedScore += indelPenalty; 180 ++result.i; 181 break; 182 case EditOp.insertion: 183 if (result.j == query.length) 184 return result; 185 result.computedScore += indelPenalty; 186 ++result.j; 187 break; 188 } 189 } 190 if (freeShift) 191 { 192 auto firstSubstitution = editPath.countUntil(EditOp.substitution); 193 result.computedScore -= firstSubstitution >= 0 194 ? firstSubstitution 195 : editPath.length; 196 } 197 198 return result; 199 } 200 201 /// Strip leading/trailing insertions. 202 auto stripInsertions(Strip strip) inout pure nothrow 203 { 204 return partial(0, reference.length, strip); 205 } 206 207 /** 208 Get a partial alignment with respect to `reference`. 209 */ 210 auto partial(in size_t begin, in size_t end, Strip stripInsertions = Strip.none) inout pure nothrow 211 in 212 { 213 assert(this.isValid(), "Attempting to get a partial alignment of an invalid alignment"); 214 } 215 out (partialAlignment) 216 { 217 assert(partialAlignment.isValid(), "Partial alignment is invalid"); 218 } 219 do 220 { 221 assert(0 <= begin && begin <= end && end <= reference.length, "index out of bounds"); 222 223 if (end == begin) 224 return typeof(this)(0, [], reference[0 .. 0], query[0 .. 0], indelPenalty); 225 226 bool hasStarted; 227 score_t newScore; 228 size_t editBegin, editEnd; 229 size_t queryBegin, queryEnd; 230 size_t i, j; 231 foreach (k, editOp; editPath) 232 { 233 if (!hasStarted && i == begin) 234 { 235 hasStarted = !(stripInsertions & Strip.front); 236 newScore = 0; 237 editBegin = k; 238 queryBegin = j; 239 } 240 241 if (freeShift && i == begin) 242 newScore = 0; 243 244 if (i == end && ((stripInsertions & Strip.back) || editOp != EditOp.insertion)) 245 break; 246 247 final switch (editOp) 248 { 249 case EditOp.substitution: 250 newScore += getScore(reference[i], query[j]); 251 ++i; 252 ++j; 253 break; 254 case EditOp.deletetion: 255 newScore += indelPenalty; 256 ++i; 257 break; 258 case EditOp.insertion: 259 newScore += indelPenalty; 260 ++j; 261 break; 262 } 263 264 if (i == end) 265 { 266 editEnd = k + 1; 267 queryEnd = j; 268 } 269 } 270 271 if ( 272 !(stripInsertions & Strip.back) && 273 editEnd < editPath.length && 274 editPath[editEnd] == EditOp.insertion 275 ) 276 { 277 ++editEnd; 278 queryEnd = j; 279 } 280 281 auto partialAlignment = typeof(this)( 282 newScore, 283 editPath[editBegin .. editEnd], 284 reference[begin .. end], 285 query[queryBegin .. queryEnd], 286 indelPenalty, 287 freeShift, 288 ); 289 290 return partialAlignment; 291 } 292 293 auto opDollar() const pure nothrow 294 { 295 return reference.length; 296 } 297 298 /// ditto 299 auto opIndex(in size_t[2] slice) inout pure nothrow 300 { 301 return partial(slice[0], slice[1]); 302 } 303 304 /// 305 unittest 306 { 307 enum indelPenalty = 1; 308 auto alignment = findAlignment("GCATGCT", "GATTACA", indelPenalty); 309 310 assert(alignment.score == 4); 311 assert(alignment.toString == 312 "GCAT-GCT\n" ~ 313 "| || *|*\n" ~ 314 "G-ATTACA"); 315 316 auto partialAlignment = alignment[1 .. 5]; 317 318 assert(partialAlignment.score == 3); 319 assert(partialAlignment.toString == 320 "CAT-G\n" ~ 321 " || *\n" ~ 322 "-ATTA"); 323 } 324 325 size_t[2] opSlice(size_t dim)(in size_t begin, in size_t end) const pure nothrow 326 { 327 return [begin, end]; 328 } 329 330 /// Get a string representation of this alignment. Visual alignment breaks 331 /// unless elements of the sequences convert to single chars via `to!string`. 332 string toString( 333 alias matchSymbol = '|', 334 alias substitutionSymbol = '*', 335 alias indelSymbol = ' ', 336 alias gapSymbol = '-', 337 )(in size_t width = 0) const pure 338 { 339 auto referenceLine = appender!string; 340 referenceLine.reserve(editPath.length); 341 auto compareLine = appender!string; 342 compareLine.reserve(editPath.length); 343 auto queryLine = appender!string; 344 queryLine.reserve(editPath.length); 345 346 size_t i, j; 347 foreach (editOp; editPath) 348 { 349 final switch (editOp) 350 { 351 case EditOp.substitution: 352 referenceLine ~= reference[i].to!string; 353 compareLine ~= reference[i] == query[j] 354 ? matchSymbol 355 : substitutionSymbol; 356 queryLine ~= query[j].to!string; 357 ++i; 358 ++j; 359 break; 360 case EditOp.deletetion: 361 referenceLine ~= reference[i].to!string; 362 compareLine ~= indelSymbol; 363 queryLine ~= gapSymbol; 364 ++i; 365 break; 366 case EditOp.insertion: 367 referenceLine ~= gapSymbol; 368 compareLine ~= indelSymbol; 369 queryLine ~= query[j].to!string; 370 ++j; 371 break; 372 } 373 } 374 375 if (width == 0) 376 return only( 377 referenceLine.data.to!string, 378 compareLine.data.to!string, 379 queryLine.data.to!string, 380 ) 381 .joiner("\n") 382 .to!string; 383 else 384 return zip( 385 referenceLine.data.to!string.chunks(width), 386 compareLine.data.to!string.chunks(width), 387 queryLine.data.to!string.chunks(width), 388 ) 389 .map!(lineTriple => only(lineTriple.expand) 390 .joiner("\n")) 391 .joiner("\n\n") 392 .to!string; 393 } 394 395 /// 396 unittest 397 { 398 auto alignment = findAlignment("ACGTC", "AGTC", 1); 399 assert(alignment.toString == 400 "ACGTC\n" ~ 401 "| |||\n" ~ 402 "A-GTC"); 403 } 404 405 /// 406 unittest 407 { 408 auto alignment = findAlignment("GCATGCT", "GATTACA", 1); 409 410 assert(alignment.toString == 411 "GCAT-GCT\n" ~ 412 "| || *|*\n" ~ 413 "G-ATTACA"); 414 } 415 } 416 417 /** 418 Compute an alignment of `query` against `reference` using the 419 Needleman-Wunsch algorithm with non-negative scores and constant 420 indel penalty. Optionally, the `freeShift` mode may be activated 421 as to allow large indels at the beginning and end of the alignment. 422 423 **Implementation Notes:** The current implementation needs 424 `O(reference.length * query.length)` in time and memory. As the 425 memory requirement easily exceeds available memory it can be 426 limited for now. This may change in future and an implementation 427 using `O(max(reference.length, query.length))` memory will be 428 silently selected for large inputs. 429 430 Params: 431 scoreFun = calculate score for a 'substitution' at `i, j` using 432 `scoreFun(reference[i], reference[j])` 433 reference = Sequence to compare `query` against 434 query = Sequence to compare against `reference` 435 indelPenalty = Penalize each indel with this value 436 freeShift = Allow indels at the beginning and end of the alignment 437 memoryLimit = throw an error if the calculation would require more 438 than `memoryLimit` bytes. 439 Throws: AlignmentException if the calculation would require more than 440 `memoryLimit` bytes. 441 442 See_Also: http://en.wikipedia.org/wiki/Needleman-Wunsch_algorithm 443 */ 444 SequenceAlignment!(const(S), scoreFun) findAlignment( 445 alias scoreFun = "a == b ? 0 : 1", 446 S, 447 )( 448 in S reference, 449 in S query, 450 in score_t indelPenalty, 451 Flag!"freeShift" freeShift = No.freeShift, 452 size_t memoryLimit = 2^^20, 453 ) 454 { 455 alias score = binaryFun!scoreFun; 456 enforce!AlignmentException( 457 memoryRequired(reference, query) <= memoryLimit, 458 "memory limit exceeded; use short reference and/or query", 459 ); 460 461 auto F = DPMatrix!score_t(reference.length + 1, query.length + 1); 462 463 // Initialize scoring matrix 464 foreach (i; 0 .. reference.length + 1) 465 F[i, 0] = freeShift ? 0 : cast(score_t) i * indelPenalty; 466 foreach (j; 0 .. query.length + 1) 467 F[0, j] = freeShift ? 0 : cast(score_t) j * indelPenalty; 468 469 // Compute scoring matrix by rows in a cache friendly manner 470 foreach (i; 1 .. reference.length + 1) 471 foreach (j; 1 .. query.length + 1) 472 F[i, j] = min( 473 F[i - 1, j - 1] + score(reference[i - 1], query[j - 1]), 474 F[i - 1, j ] + indelPenalty, 475 F[i , j - 1] + indelPenalty, 476 ); 477 478 return typeof(return)( 479 F[$ - 1, $ - 1], 480 tracebackScoringMatrix(F), 481 reference, 482 query, 483 indelPenalty, 484 freeShift, 485 ); 486 } 487 488 /// 489 unittest 490 { 491 auto alignment = findAlignment("ACGTC", "AGTC", 1); 492 493 // - A G T C 494 // - 0 1 2 3 4 495 // A 1 0 1 2 3 496 // C 2 1 1 2 2 497 // G 3 2 1 2 3 498 // T 4 3 2 1 2 499 // C 5 4 3 2 1 500 // 501 // ACGTC 502 // | ||| 503 // A-GTC 504 505 assert(alignment.score == 1); 506 assert(alignment.editPath == [ 507 EditOp.substitution, 508 EditOp.deletetion, 509 EditOp.substitution, 510 EditOp.substitution, 511 EditOp.substitution, 512 ]); 513 } 514 515 /// 516 unittest 517 { 518 auto alignment = findAlignment("GCATGCT", "GATTACA", 1); 519 520 // - G A T T A C A 521 // - 0 1 2 3 4 5 6 7 522 // G 1 0 1 2 3 4 5 6 523 // C 2 1 2 3 4 5 4 5 524 // A 3 2 1 2 3 4 5 4 525 // T 4 3 2 1 2 3 6 5 526 // G 5 4 3 2 3 4 5 6 527 // C 6 5 4 3 4 5 4 5 528 // T 7 6 5 4 3 6 5 6 529 // 530 // GCAT-GCT 531 // | || *|* 532 // G-ATTACA 533 assert(alignment.score == 4); 534 assert(alignment.editPath == [ 535 EditOp.substitution, 536 EditOp.deletetion, 537 EditOp.substitution, 538 EditOp.substitution, 539 EditOp.insertion, 540 EditOp.substitution, 541 EditOp.substitution, 542 EditOp.substitution, 543 ]); 544 } 545 546 /// 547 unittest 548 { 549 auto alignment = findAlignment!"a == b ? 0 : 1"( 550 "tgaggacagaagggtcataggtttaattctggtcacaggcacattcctgg" ~ 551 "gttgcaggtttgatctccacctggtcggggcacatgca", 552 "tgaggacagaagggtcatggtttaattctggtcacaggcacattcctggg" ~ 553 "ttgtaggctcaattcccacccggtcggggccacatgca", 554 1, 555 ); 556 557 assert(alignment.toString(50) == 558 "tgaggacagaagggtcataggtttaattctggtcacaggcacattcctgg\n" ~ 559 "|||||||||||||||||| |||||||||||||||||||||||||||||||\n" ~ 560 "tgaggacagaagggtcat-ggtttaattctggtcacaggcacattcctgg\n" ~ 561 "\n" ~ 562 "gttgcaggtttgatctcc-acctggtcggggc-acatgca\n" ~ 563 "||||*|||*|**|| ||| |||*||||||||| |||||||\n" ~ 564 "gttgtaggctcaat-tcccacccggtcggggccacatgca"); 565 } 566 567 /// 568 unittest 569 { 570 auto alignment = findAlignment( 571 "tgaggacagaagggtcataggtttaattctggtcacaggcacattcctgg" ~ 572 "gttgcaggtttgatctccacctggtcggggcacatgca", 573 "aatgaacagctgaggacagaagggtcatggtttaattctggtcacaggca" ~ 574 "cattcctgggttgtaggctcaattcccacccggtcggggccacatgca", 575 1, 576 Yes.freeShift, 577 ); 578 579 assert(alignment.toString(50) == 580 "----------tgaggacagaagggtcataggtttaattctggtcacaggc\n" ~ 581 " |||||||||||||||||| |||||||||||||||||||||\n" ~ 582 "aatgaacagctgaggacagaagggtcat-ggtttaattctggtcacaggc\n" ~ 583 "\n" ~ 584 "acattcctgggttgcaggtttgatctcc-acctggtcggggc-acatgca\n" ~ 585 "||||||||||||||*|||*|**|| ||| |||*||||||||| |||||||\n" ~ 586 "acattcctgggttgtaggctcaat-tcccacccggtcggggccacatgca"); 587 } 588 589 /// 590 unittest 591 { 592 auto alignment = findAlignment( 593 "tatcctcaggtgaggactaacaacaaatatatatatatttatatctaaca" ~ 594 "acatatgatttctaaaatttcaaaaatcttaaggctgaattaat", 595 "tatcctcaggtgaggcttaacaacaaatatatatatactgtaatatctaa" ~ 596 "caacatatgattctaaaatttcaaaatgcttaaaggtctga", 597 1, 598 Yes.freeShift, 599 ); 600 601 assert(alignment.toString(50) == 602 "tatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatcta\n" ~ 603 "||||||||||||||| || ||||||||||||||||||| |*|| ||||||\n" ~ 604 "tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatcta\n" ~ 605 "\n" ~ 606 "acaacatatgatttctaaaatttcaaaaatcttaa-gg-ctgaattaat\n" ~ 607 "||||||||||||| ||||||||||||||**||||| || |||| \n" ~ 608 "acaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga------"); 609 } 610 611 unittest 612 { 613 enum reference = "tatcctcaggtgaggcttaacaacaaatatatatatactgtaatatctaa" ~ 614 "caacatatgattctaaaatttcaaaatgcttaaaggtctga"; 615 enum query = "atatcctcaggtgaggactaacaacaaatatatatatatttatatctaac" ~ 616 "aacatatgatttctaaaatttcaaaaatcttaaggctgaattaat"; 617 auto alignment = findAlignment( 618 reference, 619 query, 620 1 621 ); 622 623 assert(alignment.partial(0, reference.length, Strip.none).toString(50) == 624 "-tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatct\n" ~ 625 " ||||||||||||||| || ||||||||||||||||||| |*|| |||||\n" ~ 626 "atatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatct\n" ~ 627 "\n" ~ 628 "aacaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga------\n" ~ 629 "|||||||||||||| ||||||||||||||**||||| || |||| \n" ~ 630 "aacaacatatgatttctaaaatttcaaaaatcttaa-gg-ctgaattaat"); 631 assert(alignment.partial(0, reference.length, Strip.front).toString(50) == 632 "tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatcta\n" ~ 633 "||||||||||||||| || ||||||||||||||||||| |*|| ||||||\n" ~ 634 "tatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatcta\n" ~ 635 "\n" ~ 636 "acaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga------\n" ~ 637 "||||||||||||| ||||||||||||||**||||| || |||| \n" ~ 638 "acaacatatgatttctaaaatttcaaaaatcttaa-gg-ctgaattaat"); 639 assert(alignment.partial(0, reference.length, Strip.back).toString(50) == 640 "-tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatct\n" ~ 641 " ||||||||||||||| || ||||||||||||||||||| |*|| |||||\n" ~ 642 "atatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatct\n" ~ 643 "\n" ~ 644 "aacaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga\n" ~ 645 "|||||||||||||| ||||||||||||||**||||| || ||||\n" ~ 646 "aacaacatatgatttctaaaatttcaaaaatcttaa-gg-ctga"); 647 assert(alignment.partial(0, reference.length, Strip.both).toString(50) == 648 "tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatcta\n" ~ 649 "||||||||||||||| || ||||||||||||||||||| |*|| ||||||\n" ~ 650 "tatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatcta\n" ~ 651 "\n" ~ 652 "acaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga\n" ~ 653 "||||||||||||| ||||||||||||||**||||| || ||||\n" ~ 654 "acaacatatgatttctaaaatttcaaaaatcttaa-gg-ctga"); 655 } 656 657 unittest 658 { 659 enum reference = "tatcctcaggtgaggcttaacaacaaatatatatatactgtaatatctaa"; 660 enum query = "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"; 661 auto alignment = findAlignment!"1 - (a == b || a == 'n' || b == 'n')"( 662 reference, 663 query, 664 1 665 ); 666 667 import std.stdio; 668 assert(alignment.partial(0, reference.length, Strip.front).toString(50) == 669 "tatcctcaggtgaggcttaacaacaaatatatatatactgtaatatctaa\n" ~ 670 "**************************************************\n" ~ 671 "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn"); 672 } 673 674 unittest 675 { 676 auto alignment = findAlignment( 677 "tgaggacagaagggtcataggtttaattctggtcacaggcacattcctgg" ~ 678 "gttgcaggtttgatctccacctggtcggggcacatgcaggaggcaaccaa" ~ 679 "tcaatgtgtctctttctcagtgatgtttcttctctctgtctctctccccc" ~ 680 "ctctcttctactctctctgaaaaatagatggaaaaaatatcctcaggtga" ~ 681 "ggactaacaacaaatatatatatatttatatctaacaacatatgatttct" ~ 682 "aaaatttcaaaaatcttaaggctgaattaat", 683 "aatgaacagctgaggacagaagggtcatggtttaattctggtcacaggca" ~ 684 "cattcctgggttgtaggctcaattcccacccggtcggggccacatgcaga" ~ 685 "aggcaaccatcaatgtgtctctttcaagtgatgtttcttctctctgtcta" ~ 686 "ctccccctccctatctactctctgggaaacttatggaaaaaatatcctca" ~ 687 "ggtgaggcttaacaacaaatatatatatactgtaatatctaacaacatat" ~ 688 "gattctaaaatttcaaaatgcttaaaggtctga", 689 1, 690 Yes.freeShift, 691 ); 692 693 assert(alignment.toString(50) == 694 "----------tgaggacagaagggtcataggtttaattctggtcacaggc\n" ~ 695 " |||||||||||||||||| |||||||||||||||||||||\n" ~ 696 "aatgaacagctgaggacagaagggtcat-ggtttaattctggtcacaggc\n" ~ 697 "\n" ~ 698 "acattcctgggttgcaggtttgatctcc-acctggtcggggc-acatgca\n" ~ 699 "||||||||||||||*|||*|**|| ||| |||*||||||||| |||||||\n" ~ 700 "acattcctgggttgtaggctcaat-tcccacccggtcggggccacatgca\n" ~ 701 "\n" ~ 702 "ggaggcaaccaatcaatgtgtctctttctcagtgatgtttcttctctctg\n" ~ 703 "|*||||||||| |||||||||||||||| *||||||||||||||||||||\n" ~ 704 "gaaggcaacca-tcaatgtgtctctttc-aagtgatgtttcttctctctg\n" ~ 705 "\n" ~ 706 "tctctctcccccctctct-tctactctctctgaaaaatagatggaaaaaa\n" ~ 707 "||| *||||||| ||*|| ||||||||||**||||**|| |||||||||\n" ~ 708 "tct-actccccc-tccctatctactctctgggaaactta--tggaaaaaa\n" ~ 709 "\n" ~ 710 "tatcctcaggtgaggact-aacaacaaatatatatata-ttta-tatcta\n" ~ 711 "||||||||||||||| || ||||||||||||||||||| |*|| ||||||\n" ~ 712 "tatcctcaggtgagg-cttaacaacaaatatatatatactgtaatatcta\n" ~ 713 "\n" ~ 714 "acaacatatgatttctaaaatttcaaaaatcttaa-gg-ctgaattaat\n" ~ 715 "||||||||||||| ||||||||||||||**||||| || |||| \n" ~ 716 "acaacatatgatt-ctaaaatttcaaaatgcttaaaggtctga------"); 717 } 718 719 // Returns the amount of memory required to compute an alignment between reference and query. 720 size_t memoryRequired(S)(in S reference, in S query) 721 { 722 return score_t.sizeof * ( 723 // Space for the scoring matrix F 724 ((reference.length + 1) * (query.length + 1)) + 725 // Space for the edit path 726 (reference.length + query.length) 727 ); 728 } 729 730 /// Returns longest query and refernce length possible with memoryLimit. 731 size_t longestInputsLength(size_t memoryLimit) pure 732 { 733 return round(sqrt((memoryLimit / score_t.sizeof + 3).to!double) - 2).to!size_t; 734 } 735 736 // Find edit path of the best alignment 737 private EditOp[] tracebackScoringMatrix(in DPMatrix!score_t F) 738 { 739 auto editPath = minimallyInitializedArray!(EditOp[])(F.size[0] + F.size[1]); 740 size_t i = F.size[0] - 1; 741 size_t j = F.size[1] - 1; 742 size_t k = editPath.length; 743 744 while (0 < i && 0 < j) 745 { 746 auto matchScore = F[i - 1, j - 1]; 747 auto insertionScore = F[i , j - 1]; 748 auto deletionScore = F[i - 1, j ]; 749 auto nextScore = min( 750 matchScore, 751 deletionScore, 752 insertionScore, 753 ); 754 755 assert(k > 0); 756 if (nextScore == matchScore) 757 { 758 editPath[--k] = EditOp.substitution; 759 --i; 760 --j; 761 } 762 else if (nextScore == insertionScore) 763 { 764 editPath[--k] = EditOp.insertion; 765 --j; 766 } 767 else if (nextScore == deletionScore) 768 { 769 editPath[--k] = EditOp.deletetion; 770 --i; 771 } 772 else 773 { 774 assert(0, "corrupted scoring matrix"); 775 } 776 } 777 778 while (0 < i) 779 { 780 assert(k > 0); 781 editPath[--k] = EditOp.deletetion; 782 --i; 783 } 784 785 while (0 < j) 786 { 787 assert(k > 0); 788 editPath[--k] = EditOp.insertion; 789 --j; 790 } 791 792 return editPath[k .. $]; 793 } 794 795 unittest 796 { 797 auto F = DPMatrix!score_t(6, 5); 798 F.elements = [ 799 // - A G T C 800 0, 1, 2, 3, 4, // - 801 1, 0, 1, 2, 3, // A 802 2, 1, 1, 2, 2, // C 803 3, 2, 1, 2, 3, // G 804 4, 3, 2, 1, 2, // T 805 5, 4, 3, 2, 1, // C 806 ]; 807 808 assert(F.tracebackScoringMatrix == [ 809 EditOp.substitution, 810 EditOp.deletetion, 811 EditOp.substitution, 812 EditOp.substitution, 813 EditOp.substitution, 814 ]); 815 } 816 817 private struct DPMatrix(T) 818 { 819 size_t[2] size; 820 T[] elements; 821 822 this(in size_t n, in size_t m) 823 { 824 this.size[0] = n; 825 this.size[1] = m; 826 this.elements = minimallyInitializedArray!(T[])(n * m); 827 } 828 829 this(in size_t n, in size_t m, ref T[] buffer) 830 { 831 this.size[0] = n; 832 this.size[1] = m; 833 auto numElements = n * m; 834 assert(buffer.length <= numElements); 835 this.elements = buffer; 836 } 837 838 unittest 839 { 840 auto M = DPMatrix!int(2, 3); 841 842 assert(M.size[0] == 2); 843 assert(M.size[1] == 3); 844 845 foreach (i; 0 .. M.size[0]) 846 foreach (j; 0 .. M.size[1]) 847 M[i, j] = cast(int) (i + j); 848 849 assert(M.elements == [ 850 0, 1, 2, 851 1, 2, 3, 852 ]); 853 } 854 855 size_t opDollar(size_t dim)() const pure nothrow if (dim < 2) 856 { 857 return size[dim]; 858 } 859 860 auto ref T opIndex(size_t i, size_t j) pure nothrow 861 { 862 assert(i < size[0] && j < size[1], "index out of bounds"); 863 864 return elements[i * size[1] + j]; 865 } 866 867 const(T) opIndex(size_t i, size_t j) const pure nothrow 868 { 869 assert(i < size[0] && j < size[1], "index out of bounds"); 870 871 return elements[i * size[1] + j]; 872 } 873 874 unittest 875 { 876 auto M = DPMatrix!int(2, 3); 877 M.elements = [ 878 0, 1, 2, 879 1, 2, 3, 880 ]; 881 882 assert(M[0, 0] == 0 + 0); 883 assert(M[0, 1] == 0 + 1); 884 assert(M[0, 2] == 0 + 2); 885 assert(M[1, 0] == 1 + 0); 886 assert(M[1, 1] == 1 + 1); 887 assert(M[1, 2] == 1 + 2); 888 } 889 890 int opApply(scope int delegate(size_t i, size_t j, ref T) yield) 891 { 892 mixin(opApplyImpl!(No.reverse)); 893 } 894 895 int opApplyReverse(scope int delegate(size_t i, size_t j, ref T) yield) 896 { 897 mixin(opApplyImpl!(Yes.reverse)); 898 } 899 900 unittest 901 { 902 auto M = DPMatrix!int(2, 3); 903 904 foreach (i, j, ref elem; M) 905 elem = cast(int) (i + j); 906 907 assert(M.elements == [ 908 0, 1, 2, 909 1, 2, 3, 910 ]); 911 } 912 913 int opApply(scope int delegate(size_t i, size_t j, in T) yield) const 914 { 915 mixin(opApplyImpl!(No.reverse)); 916 } 917 918 int opApplyReverse(scope int delegate(size_t i, size_t j, in T) yield) const 919 { 920 mixin(opApplyImpl!(Yes.reverse)); 921 } 922 923 unittest 924 { 925 auto M = DPMatrix!int(2, 3); 926 M.elements = [ 927 0, 1, 2, 928 1, 2, 3, 929 ]; 930 931 foreach (i, j, elem; cast(const(typeof(M))) M) 932 { 933 assert(M[i, j] == elem); 934 assert(elem == i + j); 935 } 936 } 937 938 private static enum opApplyImpl(Flag!"reverse" reverse) = ` 939 int result = 0; 940 941 foreach` ~ (reverse ? `_reverse` : ``) ~ ` (i, ref element; elements) 942 { 943 result = yield(i / size[1], i % size[1], element); 944 945 if (result) 946 break; 947 } 948 949 return result; 950 `; 951 } 952 953 /// Convert a floating point number to a base-10 string at compile time. 954 /// This function is very crude and will not work in many cases! 955 string toString(Float)(in Float value, in uint precision) pure nothrow 956 if (isFloatingPoint!Float) 957 { 958 if (value.isNaN) 959 return "nan"; 960 else if (value.isInfinity) 961 return value > 0 ? "inf" : "-inf"; 962 963 if (precision == 0) 964 { 965 auto intPart = cast(long) round(value); 966 967 return intPart.to!string; 968 } 969 else 970 { 971 auto intPart = cast(long) (value > 0 ? floor(value) : ceil(value)); 972 auto fraction = sgn(value) * (value - intPart); 973 assert(fraction >= 0, "fractional part of value should be non-negative"); 974 auto fracPart = cast(ulong) round(10^^precision * fraction); 975 976 return intPart.to!string ~ "." ~ fracPart.to!string; 977 } 978 } 979 980 /// 981 unittest 982 { 983 enum x = 42.0; 984 enum y = -13.37f; 985 enum z = 0.9; 986 987 static assert(float.nan.toString(0) == "nan"); 988 static assert(double.infinity.toString(0) == "inf"); 989 static assert((-double.infinity).toString(0) == "-inf"); 990 static assert(x.toString(0) == "42"); 991 static assert(x.toString(1) == "42.0"); 992 static assert(y.toString(2) == "-13.37"); 993 static assert(y.toString(1) == "-13.4"); 994 static assert(y.toString(0) == "-13"); 995 static assert(z.toString(1) == "0.9"); 996 static assert(z.toString(0) == "1"); 997 } 998 999 1000 private struct SuffixPrefixMatches(alias pred, C) 1001 { 1002 C[] stringA; 1003 C[] stringB; 1004 size_t shift; 1005 1006 1007 this(C[] stringA, C[] stringB) 1008 { 1009 this.stringA = stringA; 1010 this.stringB = stringB; 1011 this.shift = stringA.length > stringB.length 1012 ? stringA.length - stringB.length 1013 : 0; 1014 1015 findNextMatch(); 1016 } 1017 1018 1019 void popFront() 1020 { 1021 assert(!empty, "Attempting to popFront an empty " ~ typeof(this).stringof); 1022 1023 ++shift; 1024 findNextMatch(); 1025 } 1026 1027 1028 @property auto front() 1029 { 1030 assert(!empty, "Attempting to fetch the front of an empty " ~ typeof(this).stringof); 1031 1032 return suffixA; 1033 } 1034 1035 1036 @property bool empty() const pure nothrow @safe @nogc 1037 { 1038 return shift >= stringA.length; 1039 } 1040 1041 1042 @property SuffixPrefixMatches!(pred, C) save() pure nothrow @safe @nogc 1043 { 1044 return this; 1045 } 1046 1047 1048 private: 1049 1050 1051 void findNextMatch() 1052 { 1053 while (shift < stringA.length && !equal!pred(suffixA, prefixB)) 1054 ++shift; 1055 } 1056 1057 1058 @property C[] suffixA() pure nothrow @safe @nogc 1059 { 1060 return stringA[shift .. $]; 1061 } 1062 1063 1064 @property C[] prefixB() pure nothrow @safe @nogc 1065 { 1066 return stringB[0 .. stringA.length - shift]; 1067 } 1068 } 1069 1070 1071 /// Find all suffixes of stringA that are a prefix of stringB. The matches are 1072 /// computed lazily, decreasing length of the match. 1073 /// 1074 /// Returns: lazily computed range of all suffixes of stringA that are a 1075 /// prefix of stringB. 1076 auto suffixPrefixMatches(alias pred = "a == b", C)(C[] stringA, C[] stringB) 1077 { 1078 return SuffixPrefixMatches!(pred, C)(stringA, stringB); 1079 } 1080 1081 /// 1082 unittest 1083 { 1084 auto matches = suffixPrefixMatches("abcdef", "cdefghij"); 1085 1086 assert(equal(matches, ["cdef"])); 1087 } 1088 1089 /// 1090 unittest 1091 { 1092 auto matches = suffixPrefixMatches("abcdefcdef", "cdefcdefghij"); 1093 1094 assert(equal(matches, ["cdefcdef", "cdef"])); 1095 } 1096 1097 unittest 1098 { 1099 auto matches = suffixPrefixMatches("abcdefcdefc", "cdefcdefcghij"); 1100 1101 assert(equal(matches, ["cdefcdefc", "cdefc", "c"])); 1102 } 1103 1104 unittest 1105 { 1106 struct ModulusChar 1107 { 1108 int value; 1109 alias value this; 1110 1111 bool opEquals(const ModulusChar other) const pure nothrow @safe @nogc 1112 { 1113 return this.value % 10 == other.value % 10; 1114 } 1115 } 1116 1117 alias m = ModulusChar; 1118 const modulusStringA = [m(1), m(2), m(3), m(1), m(2), m(3), m(1), m(2)]; 1119 const modulusStringB = [ m(12), m(13), m(11), m(12)]; 1120 1121 auto matches = suffixPrefixMatches(modulusStringA, modulusStringB); 1122 1123 assert(equal(matches, [ 1124 modulusStringA[4 .. $], 1125 modulusStringA[7 .. $], 1126 ])); 1127 } 1128 1129 unittest 1130 { 1131 const modulusStringA = [1, 2, 3, 1, 2, 3, 1, 2]; 1132 const modulusStringB = [ 12, 13, 11, 12]; 1133 1134 auto matches = suffixPrefixMatches!"a % 10 == b % 10"(modulusStringA, modulusStringB); 1135 1136 assert(equal(matches, [ 1137 modulusStringA[4 .. $], 1138 modulusStringA[7 .. $], 1139 ])); 1140 }