1 /**
2     Some functions to work with FASTA data.
3 
4     Copyright: © 2019 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dalicious.genome;
10 
11 import std.algorithm :
12     count,
13     equal,
14     find,
15     joiner,
16     startsWith;
17 import std.ascii : newline;
18 import std.array :
19     appender, array;
20 import std.conv : to;
21 import std.exception : enforce;
22 import std.format :
23     format,
24     formattedRead;
25 import std.range :
26     chain,
27     chunks,
28     drop,
29     ElementType,
30     empty,
31     front,
32     isBidirectionalRange,
33     only,
34     popBack,
35     popFront,
36     take,
37     walkLength;
38 import std.stdio : File;
39 import std..string : indexOf, lineSplitter, outdent;
40 import std.traits : isSomeChar, isSomeString;
41 import std.typecons : tuple, Tuple;
42 
43 
44 /// Lines starting with this character designate the beginning of a FASTA record.
45 enum headerIndicator = '>';
46 
47 
48 /**
49     Calculate the sequence length of the first record in fastaFile. Returns
50     the length of the next record in fastaFile if it is a File object.
51 */
52 size_t getFastaLength(in string fastaFile)
53 {
54     alias isHeaderLine = (line) => line.length > 0 && line[0] == headerIndicator;
55 
56     return File(fastaFile).getFastaLength();
57 }
58 
59 /// ditto
60 size_t getFastaLength(File fastaFile)
61 {
62     alias isHeaderLine = (line) => line.length > 0 && line[0] == headerIndicator;
63 
64     auto fastaLines = fastaFile
65         .byLine
66         .find!isHeaderLine;
67 
68     enforce(!fastaLines.empty, "cannot determine FASTA length: file has no records");
69 
70     // ignore header
71     fastaLines.popFront();
72 
73     char peek()
74     {
75         import core.stdc.stdio : getc, ungetc;
76         import std.exception : errnoEnforce;
77 
78         auto c = getc(fastaFile.getFP());
79         ungetc(c, fastaFile.getFP());
80 
81         errnoEnforce(!fastaFile.error);
82 
83         return cast(char) c;
84     }
85 
86     // sum length of all sequence lines up to next record
87     size_t length;
88 
89     if (peek() == headerIndicator)
90         return 0;
91     foreach (line; fastaLines)
92     {
93         length += line.length;
94 
95         if (peek() == headerIndicator)
96             break;
97     }
98 
99     return length;
100 }
101 
102 ///
103 unittest
104 {
105     import std.process : pipe;
106 
107     string fastaRecordData = q"EOF
108         >sequence1
109         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
110         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
111 EOF".outdent;
112     auto fastaFile = pipe();
113     fastaFile.writeEnd.write(fastaRecordData);
114     fastaFile.writeEnd.close();
115 
116     auto fastaLength = getFastaLength(fastaFile.readEnd);
117 
118     assert(fastaLength == 100);
119 }
120 
121 ///
122 unittest
123 {
124     import std.process : pipe;
125 
126     string fastaRecordData = q"EOF
127         >sequence1
128         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
129         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
130         >sequence2
131         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
132 EOF".outdent;
133     auto fastaFile = pipe();
134     fastaFile.writeEnd.write(fastaRecordData);
135     fastaFile.writeEnd.close();
136 
137     auto fastaLength1 = getFastaLength(fastaFile.readEnd);
138     auto fastaLength2 = getFastaLength(fastaFile.readEnd);
139 
140     assert(fastaLength1 == 100);
141     assert(fastaLength2 == 50);
142 }
143 
144 unittest
145 {
146     import std.exception : assertThrown;
147     import std.process : pipe;
148 
149     auto fastaFile = pipe();
150     fastaFile.writeEnd.close();
151 
152     assertThrown(getFastaLength(fastaFile.readEnd));
153 }
154 
155 template PacBioHeader(T) if (isSomeString!T)
156 {
157     struct PacBioHeader
158     {
159         static enum headerFormat = ">%s/%d/%d_%d %s";
160 
161         T name;
162         size_t well;
163         size_t qualityRegionBegin;
164         size_t qualityRegionEnd;
165         string additionalInformation;
166 
167         /// Construct a `PacBioHeader!T` from `header`.
168         this(T header)
169         {
170             this.parse(header);
171         }
172 
173         /// Assign new `header` data.
174         void opAssign(T header)
175         {
176             this.parse(header);
177         }
178 
179         /// Builds the header string.
180         S to(S : T)() const
181         {
182             return buildHeader();
183         }
184 
185         private T buildHeader() const
186         {
187             return format!headerFormat(
188                 name,
189                 well,
190                 qualityRegionBegin,
191                 qualityRegionEnd,
192                 additionalInformation,
193             );
194         }
195 
196         private void parse(in T header)
197         {
198             auto numMatches = header[].formattedRead!headerFormat(
199                 name,
200                 well,
201                 qualityRegionBegin,
202                 qualityRegionEnd,
203                 additionalInformation,
204             );
205 
206             assert(numMatches == 5);
207         }
208     }
209 }
210 
211 ///
212 unittest
213 {
214     string header = ">name/1/0_1337 RQ=0.75";
215     auto pbHeader1 = PacBioHeader!string(header);
216 
217     assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75");
218     assert(pbHeader1.name == "name");
219     assert(pbHeader1.well == 1);
220     assert(pbHeader1.qualityRegionBegin == 0);
221     assert(pbHeader1.qualityRegionEnd == 1337);
222     assert(pbHeader1.additionalInformation == "RQ=0.75");
223 
224     PacBioHeader!string pbHeader2 = header;
225 
226     assert(pbHeader2 == pbHeader1);
227 }
228 
229 /// Convenience wrapper around `PacBioHeader!T(T header)`.
230 PacBioHeader!T parsePacBioHeader(T)(T header)
231 {
232     return typeof(return)(header);
233 }
234 
235 ///
236 unittest
237 {
238     string header = ">name/1/0_1337 RQ=0.75";
239     auto pbHeader1 = header.parsePacBioHeader();
240 
241     assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75");
242     assert(pbHeader1.name == "name");
243     assert(pbHeader1.well == 1);
244     assert(pbHeader1.qualityRegionBegin == 0);
245     assert(pbHeader1.qualityRegionEnd == 1337);
246     assert(pbHeader1.additionalInformation == "RQ=0.75");
247 }
248 
249 /**
250     Get the complement of a DNA base. Only bases A, T, C, G will be translated;
251     all other characters are left as is. Replacement preserves casing of
252     the characters.
253 */
254 C complement(C)(C base) if (isSomeChar!C)
255 {
256     import std.range : zip;
257 
258     enum from = `AGTCagtc`;
259     enum to = `TCAGtcag`;
260 
261     switch (base)
262     {
263         static foreach (conv; zip(from, to))
264         {
265             case conv[0]:
266                 return conv[1];
267         }
268         default:
269             return base;
270     }
271 }
272 
273 /**
274     Compute the reverse complement of a DNA sequence. Only bases A, T, C, G
275     will be translated; all other characters are left as is. Replacement
276     preserves casing of the characters.
277 */
278 auto reverseComplementer(Range)(Range sequence)
279         if (isBidirectionalRange!Range && isSomeChar!(ElementType!Range))
280 {
281     import std.algorithm : map;
282     import std.range : retro;
283 
284     return sequence
285         .retro
286         .map!complement;
287 }
288 
289 /// ditto
290 T reverseComplement(T)(in T sequence) if (isSomeString!T)
291 {
292     import std.array : array;
293 
294     return sequence[].reverseComplementer.array.to!T;
295 }
296 
297 FastaRecord!T reverseComplement(T)(in FastaRecord!T fastaRecord) if (isSomeString!T)
298 {
299     enum lineSep = FastaRecord!T.lineSep;
300     auto header = fastaRecord.header;
301     auto sequence = fastaRecord[].array.reverseComplement;
302     auto builder = appender!T;
303 
304     builder.reserve(header.length + sequence.length + 2 * lineSep.length);
305 
306     builder ~= header;
307     builder ~= lineSep;
308     builder ~= sequence;
309     builder ~= lineSep;
310 
311     return typeof(return)(builder.data);
312 }
313 
314 ///
315 unittest
316 {
317     auto seq = "GGTTGTAAATTGACTGTTGTCTGCT\ngccaatctactggtgggggagagat";
318     auto revComp = "atctctcccccaccagtagattggc\nAGCAGACAACAGTCAATTTACAACC";
319 
320     assert(seq.reverseComplement == revComp);
321     assert(seq.reverseComplementer.equal(revComp));
322 
323     auto fastaRecord1 = q"EOF
324         >sequence1
325         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
326         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
327 EOF".outdent.parseFastaRecord;
328     auto fastaRecord1RevComp = fastaRecord1.reverseComplement;
329 
330     assert(fastaRecord1RevComp.header == ">sequence1");
331     assert(fastaRecord1RevComp[].equal("GGGTTAGGGTTAGGGTTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG"));
332     assert(fastaRecord1RevComp[0 .. 5].equal("GGGTT"));
333     assert(fastaRecord1RevComp.toFasta(13).equal(q"EOF
334         >sequence1
335         GGGTTAGGGTTAG
336         GGTTGTTAGGGTT
337         AGGGTTAGGGTTA
338         GGGTTAGGGTTAG
339         GGTTAGGGTTAGG
340         GTTAGGGTTAGGG
341         TTAGGGTTAGGGT
342         TAGGGTTAG
343 EOF".outdent));
344 }