1 /** 2 Some functions to work with FASTA data. 3 4 Copyright: © 2019 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dalicious.genome; 10 11 import std.algorithm : 12 count, 13 equal, 14 find, 15 joiner, 16 startsWith; 17 import std.ascii : newline; 18 import std.array : 19 appender, array; 20 import std.conv : to; 21 import std.exception : enforce; 22 import std.format : 23 format, 24 formattedRead; 25 import std.range : 26 chain, 27 chunks, 28 drop, 29 ElementType, 30 empty, 31 front, 32 isBidirectionalRange, 33 only, 34 popBack, 35 popFront, 36 take, 37 walkLength; 38 import std.stdio : File; 39 import std..string : indexOf, lineSplitter, outdent; 40 import std.traits : isSomeChar, isSomeString; 41 import std.typecons : tuple, Tuple; 42 43 44 /// Lines starting with this character designate the beginning of a FASTA record. 45 enum headerIndicator = '>'; 46 47 48 /** 49 Calculate the sequence length of the first record in fastaFile. Returns 50 the length of the next record in fastaFile if it is a File object. 51 */ 52 size_t getFastaLength(in string fastaFile) 53 { 54 alias isHeaderLine = (line) => line.length > 0 && line[0] == headerIndicator; 55 56 return File(fastaFile).getFastaLength(); 57 } 58 59 /// ditto 60 size_t getFastaLength(File fastaFile) 61 { 62 alias isHeaderLine = (line) => line.length > 0 && line[0] == headerIndicator; 63 64 auto fastaLines = fastaFile 65 .byLine 66 .find!isHeaderLine; 67 68 enforce(!fastaLines.empty, "cannot determine FASTA length: file has no records"); 69 70 // ignore header 71 fastaLines.popFront(); 72 73 char peek() 74 { 75 import core.stdc.stdio : getc, ungetc; 76 import std.exception : errnoEnforce; 77 78 auto c = getc(fastaFile.getFP()); 79 ungetc(c, fastaFile.getFP()); 80 81 errnoEnforce(!fastaFile.error); 82 83 return cast(char) c; 84 } 85 86 // sum length of all sequence lines up to next record 87 size_t length; 88 89 if (peek() == headerIndicator) 90 return 0; 91 foreach (line; fastaLines) 92 { 93 length += line.length; 94 95 if (peek() == headerIndicator) 96 break; 97 } 98 99 return length; 100 } 101 102 /// 103 unittest 104 { 105 import std.process : pipe; 106 107 string fastaRecordData = q"EOF 108 >sequence1 109 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 110 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 111 EOF".outdent; 112 auto fastaFile = pipe(); 113 fastaFile.writeEnd.write(fastaRecordData); 114 fastaFile.writeEnd.close(); 115 116 auto fastaLength = getFastaLength(fastaFile.readEnd); 117 118 assert(fastaLength == 100); 119 } 120 121 /// 122 unittest 123 { 124 import std.process : pipe; 125 126 string fastaRecordData = q"EOF 127 >sequence1 128 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 129 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 130 >sequence2 131 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 132 EOF".outdent; 133 auto fastaFile = pipe(); 134 fastaFile.writeEnd.write(fastaRecordData); 135 fastaFile.writeEnd.close(); 136 137 auto fastaLength1 = getFastaLength(fastaFile.readEnd); 138 auto fastaLength2 = getFastaLength(fastaFile.readEnd); 139 140 assert(fastaLength1 == 100); 141 assert(fastaLength2 == 50); 142 } 143 144 unittest 145 { 146 import std.exception : assertThrown; 147 import std.process : pipe; 148 149 auto fastaFile = pipe(); 150 fastaFile.writeEnd.close(); 151 152 assertThrown(getFastaLength(fastaFile.readEnd)); 153 } 154 155 template PacBioHeader(T) if (isSomeString!T) 156 { 157 struct PacBioHeader 158 { 159 static enum headerFormat = ">%s/%d/%d_%d %s"; 160 161 T name; 162 size_t well; 163 size_t qualityRegionBegin; 164 size_t qualityRegionEnd; 165 string additionalInformation; 166 167 /// Construct a `PacBioHeader!T` from `header`. 168 this(T header) 169 { 170 this.parse(header); 171 } 172 173 /// Assign new `header` data. 174 void opAssign(T header) 175 { 176 this.parse(header); 177 } 178 179 /// Builds the header string. 180 S to(S : T)() const 181 { 182 return buildHeader(); 183 } 184 185 private T buildHeader() const 186 { 187 return format!headerFormat( 188 name, 189 well, 190 qualityRegionBegin, 191 qualityRegionEnd, 192 additionalInformation, 193 ); 194 } 195 196 private void parse(in T header) 197 { 198 auto numMatches = header[].formattedRead!headerFormat( 199 name, 200 well, 201 qualityRegionBegin, 202 qualityRegionEnd, 203 additionalInformation, 204 ); 205 206 assert(numMatches == 5); 207 } 208 } 209 } 210 211 /// 212 unittest 213 { 214 string header = ">name/1/0_1337 RQ=0.75"; 215 auto pbHeader1 = PacBioHeader!string(header); 216 217 assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75"); 218 assert(pbHeader1.name == "name"); 219 assert(pbHeader1.well == 1); 220 assert(pbHeader1.qualityRegionBegin == 0); 221 assert(pbHeader1.qualityRegionEnd == 1337); 222 assert(pbHeader1.additionalInformation == "RQ=0.75"); 223 224 PacBioHeader!string pbHeader2 = header; 225 226 assert(pbHeader2 == pbHeader1); 227 } 228 229 /// Convenience wrapper around `PacBioHeader!T(T header)`. 230 PacBioHeader!T parsePacBioHeader(T)(T header) 231 { 232 return typeof(return)(header); 233 } 234 235 /// 236 unittest 237 { 238 string header = ">name/1/0_1337 RQ=0.75"; 239 auto pbHeader1 = header.parsePacBioHeader(); 240 241 assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75"); 242 assert(pbHeader1.name == "name"); 243 assert(pbHeader1.well == 1); 244 assert(pbHeader1.qualityRegionBegin == 0); 245 assert(pbHeader1.qualityRegionEnd == 1337); 246 assert(pbHeader1.additionalInformation == "RQ=0.75"); 247 } 248 249 /** 250 Get the complement of a DNA base. Only bases A, T, C, G will be translated; 251 all other characters are left as is. Replacement preserves casing of 252 the characters. 253 */ 254 C complement(C)(C base) if (isSomeChar!C) 255 { 256 import std.range : zip; 257 258 enum from = `AGTCagtc`; 259 enum to = `TCAGtcag`; 260 261 switch (base) 262 { 263 static foreach (conv; zip(from, to)) 264 { 265 case conv[0]: 266 return conv[1]; 267 } 268 default: 269 return base; 270 } 271 } 272 273 /** 274 Compute the reverse complement of a DNA sequence. Only bases A, T, C, G 275 will be translated; all other characters are left as is. Replacement 276 preserves casing of the characters. 277 */ 278 auto reverseComplementer(Range)(Range sequence) 279 if (isBidirectionalRange!Range && isSomeChar!(ElementType!Range)) 280 { 281 import std.algorithm : map; 282 import std.range : retro; 283 284 return sequence 285 .retro 286 .map!complement; 287 } 288 289 /// ditto 290 T reverseComplement(T)(in T sequence) if (isSomeString!T) 291 { 292 import std.array : array; 293 294 return sequence[].reverseComplementer.array.to!T; 295 } 296 297 FastaRecord!T reverseComplement(T)(in FastaRecord!T fastaRecord) if (isSomeString!T) 298 { 299 enum lineSep = FastaRecord!T.lineSep; 300 auto header = fastaRecord.header; 301 auto sequence = fastaRecord[].array.reverseComplement; 302 auto builder = appender!T; 303 304 builder.reserve(header.length + sequence.length + 2 * lineSep.length); 305 306 builder ~= header; 307 builder ~= lineSep; 308 builder ~= sequence; 309 builder ~= lineSep; 310 311 return typeof(return)(builder.data); 312 } 313 314 /// 315 unittest 316 { 317 auto seq = "GGTTGTAAATTGACTGTTGTCTGCT\ngccaatctactggtgggggagagat"; 318 auto revComp = "atctctcccccaccagtagattggc\nAGCAGACAACAGTCAATTTACAACC"; 319 320 assert(seq.reverseComplement == revComp); 321 assert(seq.reverseComplementer.equal(revComp)); 322 323 auto fastaRecord1 = q"EOF 324 >sequence1 325 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 326 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 327 EOF".outdent.parseFastaRecord; 328 auto fastaRecord1RevComp = fastaRecord1.reverseComplement; 329 330 assert(fastaRecord1RevComp.header == ">sequence1"); 331 assert(fastaRecord1RevComp[].equal("GGGTTAGGGTTAGGGTTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG")); 332 assert(fastaRecord1RevComp[0 .. 5].equal("GGGTT")); 333 assert(fastaRecord1RevComp.toFasta(13).equal(q"EOF 334 >sequence1 335 GGGTTAGGGTTAG 336 GGTTGTTAGGGTT 337 AGGGTTAGGGTTA 338 GGGTTAGGGTTAG 339 GGTTAGGGTTAGG 340 GTTAGGGTTAGGG 341 TTAGGGTTAGGGT 342 TAGGGTTAG 343 EOF".outdent)); 344 }