1 
2 //          Copyright Ferdinand Majerech 2011.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 /**
8  * Class that processes YAML mappings, sequences and scalars into nodes.
9  * This can be used to add custom data types. A tutorial can be found
10  * $(LINK2 https://dlang-community.github.io/D-YAML/, here).
11  */
12 module dyaml.constructor;
13 
14 import mir.timestamp;
15 import std.algorithm;
16 import std.array;
17 import std.base64;
18 import std.container;
19 import std.conv;
20 import std.exception;
21 import std.regex;
22 import std..string;
23 import std.utf;
24 
25 import dyaml.node;
26 import dyaml.exception;
27 import dyaml.style;
28 
29 package:
30 
// Exception thrown when a YAML value cannot be constructed into a node.
class ConstructorException : YAMLException
{
    /// Construct a ConstructorException.
    ///
    /// The textual forms of both marks are appended to the message, each on
    /// its own line ("start: ..." and "end: ...").
    ///
    /// Params:  msg   = Error message.
    ///          start = Start position of the error context.
    ///          end   = End position of the error context.
    ///          file  = Source file reported by the exception.
    ///          line  = Source line reported by the exception.
    this(string msg, Mark start, Mark end, string file = __FILE__, size_t line = __LINE__)
        @safe pure nothrow
    {
        string context = "\nstart: " ~ start.toString() ~ "\nend: " ~ end.toString();
        super(msg ~ context, file, line);
    }
}
46 
47 /** Constructs YAML values.
48  *
49  * Each YAML scalar, sequence or mapping has a tag specifying its data type.
 * Constructor uses user-specifiable functions to create a node of desired
51  * data type from a scalar, sequence or mapping.
52  *
53  *
54  * Each of these functions is associated with a tag, and can process either
55  * a scalar, a sequence, or a mapping. The constructor passes each value to
56  * the function with corresponding tag, which then returns the resulting value
57  * that can be stored in a node.
58  *
 * If a tag has no known constructor function, the value is stored in the node
 * unchanged, together with its tag.
60  */
/*
 * Construct a node from a scalar, sequence or mapping.
 *
 * Standard YAML tags are dispatched to the matching construct* function.
 * A value whose kind does not fit its tag (for example a sequence tagged as
 * bool) is rejected. Any other tag stores the value in the node unchanged.
 *
 * Params:  start = Start position of the node; stored in the returned node.
 *          end   = End position of the node; used for error reporting.
 *          tag   = Tag (data type) of the node.
 *          value = Value to construct node from (string, nodes or pairs).
 *
 * Returns: Constructed node.
 *
 * Throws:  ConstructorException if construction of the value fails.
 */
Node constructNode(T)(const Mark start, const Mark end, const string tag,
                T value) @safe
    if((is(T : string) || is(T == Node[]) || is(T == Node.Pair[])))
{
    // Kind of YAML value this instantiation was given.
    enum bool isScalar   = is(T == string);
    enum bool isSequence = is(T == Node[]);
    enum bool isMapping  = is(T == Node.Pair[]);

    Node result;
    try
    {
        switch(tag)
        {
            case "tag:yaml.org,2002:null":
                result = Node(null, tag);
                break;
            case "tag:yaml.org,2002:bool":
                static if(!isScalar) throw new Exception("Only scalars can be bools");
                else
                {
                    result = Node(constructBool(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:int":
                static if(!isScalar) throw new Exception("Only scalars can be ints");
                else
                {
                    result = Node(constructLong(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:float":
                static if(!isScalar) throw new Exception("Only scalars can be floats");
                else
                {
                    result = Node(constructReal(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:binary":
                static if(!isScalar) throw new Exception("Only scalars can be binary data");
                else
                {
                    result = Node(constructBinary(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:timestamp":
                static if(!isScalar) throw new Exception("Only scalars can be timestamps");
                else
                {
                    result = Node(constructTimestamp(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:str":
                static if(!isScalar) throw new Exception("Only scalars can be strings");
                else
                {
                    result = Node(constructString(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:value":
                static if(!isScalar) throw new Exception("Only scalars can be values");
                else
                {
                    result = Node(constructString(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:omap":
                static if(!isSequence) throw new Exception("Only sequences can be ordered maps");
                else
                {
                    result = Node(constructOrderedMap(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:pairs":
                static if(!isSequence) throw new Exception("Only sequences can be pairs");
                else
                {
                    result = Node(constructPairs(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:set":
                static if(!isMapping) throw new Exception("Only mappings can be sets");
                else
                {
                    result = Node(constructSet(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:seq":
                static if(!isSequence) throw new Exception("Only sequences can be sequences");
                else
                {
                    result = Node(constructSequence(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:map":
                static if(!isMapping) throw new Exception("Only mappings can be maps");
                else
                {
                    result = Node(constructMap(value), tag);
                    break;
                }
            case "tag:yaml.org,2002:merge":
                result = Node(YAMLMerge(), tag);
                break;
            default:
                // Not one of the tags handled above: keep the raw value and its tag.
                result = Node(value, tag);
                break;
        }
    }
    catch(Exception e)
    {
        // Re-throw with the node's position attached.
        string message = "Error constructing " ~ typeid(T).toString()
                         ~ ":\n" ~ e.msg;
        throw new ConstructorException(message, start, end);
    }

    result.startMark_ = start;

    return result;
}
186 
187 private:
// Construct a boolean _node.
//
// The text is compared case-insensitively: "yes", "true" and "on" give true,
// "no", "false" and "off" give false. Anything else throws an Exception whose
// message contains the lower-cased text.
bool constructBool(const string str) @safe
{
    string lowered = str.toLower();
    switch(lowered)
    {
        case "yes", "true", "on":
            return true;
        case "no", "false", "off":
            return false;
        default:
            throw new Exception("Unable to parse boolean value: " ~ lowered);
    }
}
196 
// Construct an integer (long) _node.
//
// Underscores are ignored and one leading '+' or '-' is accepted. Supported
// notations: decimal, binary ("0b" prefix), hexadecimal ("0x" prefix), octal
// (leading "0") and sexagesimal (colon-separated base-60 groups).
//
// Throws: Exception if the text is empty (after removing underscores and the
//         sign) or is not a valid integer in one of the notations above.
long constructLong(const string str) @safe
{
    string value = str.replace("_", "");

    // Must be checked before indexing: an empty or underscore-only scalar is a
    // parse error, not an out-of-bounds access.
    enforce(value != "", new Exception("Unable to parse integer value: " ~ str));

    const char c = value[0];
    const long sign = c != '-' ? 1 : -1;
    if(c == '-' || c == '+')
    {
        value = value[1 .. $];
    }

    // A lone sign has nothing left to parse.
    enforce(value != "", new Exception("Unable to parse integer value: " ~ str));

    long result;
    try
    {
        //Zero.
        if(value == "0")               {result = cast(long)0;}
        // The radix forms are parsed as long (not int) so that every value
        // representable in the result type is accepted.
        //Binary.
        else if(value.startsWith("0b")){result = sign * to!long(value[2 .. $], 2);}
        //Hexadecimal.
        else if(value.startsWith("0x")){result = sign * to!long(value[2 .. $], 16);}
        //Octal.
        else if(value[0] == '0')       {result = sign * to!long(value, 8);}
        //Sexagesimal.
        else if(value.canFind(":"))
        {
            long val;
            long base = 1;
            // Rightmost group is the least significant one.
            foreach_reverse(digit; value.split(":"))
            {
                val += to!long(digit) * base;
                base *= 60;
            }
            result = sign * val;
        }
        //Decimal.
        else{result = sign * to!long(value);}
    }
    catch(ConvException e)
    {
        throw new Exception("Unable to parse integer value: " ~ value);
    }

    return result;
}
@safe unittest
{
    string canonical   = "685230";
    string decimal     = "+685_230";
    string octal       = "02472256";
    string hexadecimal = "0x_0A_74_AE";
    string binary      = "0b1010_0111_0100_1010_1110";
    string sexagesimal = "190:20:30";

    assert(685230 == constructLong(canonical));
    assert(685230 == constructLong(decimal));
    assert(685230 == constructLong(octal));
    assert(685230 == constructLong(hexadecimal));
    assert(685230 == constructLong(binary));
    assert(685230 == constructLong(sexagesimal));

    // Radix notations are not limited to the int range.
    assert(4294967295L == constructLong("0xFFFF_FFFF"));
    assert(-4294967296L == constructLong("-040000000000"));

    // Empty input and a lone sign are reported as parse errors.
    assertThrown!Exception(constructLong(""));
    assertThrown!Exception(constructLong("_"));
    assertThrown!Exception(constructLong("-"));
}
259 
// Construct a floating point (double) _node.
//
// Underscores are ignored, letter case is ignored and one leading '+' or '-'
// is accepted. Supported forms: ".inf", ".nan", sexagesimal (colon-separated
// base-60 groups) and plain floating point text.
//
// Throws: Exception if the text is empty, is a bare "nan"/"inf" without the
//         leading dot, or cannot be converted to a double.
double constructReal(const string str) @safe
{
    import mir.conv: to;
    string value = str.replace("_", "").toLower();

    // Must be checked before indexing: an empty or underscore-only scalar is a
    // parse error, not an out-of-bounds access.
    enforce(value != "",
            new Exception("Unable to parse float value: \"" ~ str ~ "\""));

    const char c = value[0];
    const double sign = c != '-' ? 1.0 : -1.0;
    if(c == '-' || c == '+')
    {
        value = value[1 .. $];
    }

    // Reject a lone sign and the bare words: the special values handled below
    // are spelled with a leading dot (".nan", ".inf"). Any one of these
    // conditions is enough to reject the value.
    if (value == "" || value == "nan" || value == "inf" || value == "-inf")
        throw new Exception("Unable to parse float value: " ~ value);

    double result;
    try
    {
        //Infinity.
        if     (value == ".inf"){result = sign * double.infinity;}
        //Not a Number.
        else if(value == ".nan"){result = double.nan;}
        //Sexagesimal.
        else if(value.canFind(":"))
        {
            double val = 0.0;
            double base = 1.0;
            // Rightmost group is the least significant one.
            foreach_reverse(digit; value.split(":"))
            {
                val += to!double(digit) * base;
                base *= 60.0;
            }
            result = sign * val;
        }
        //Plain floating point.
        else{result = sign * to!double(value);}
    }
    catch(Exception e)
    {
        throw new Exception("Unable to parse float value: \"" ~ value ~ "\"");
    }

    return result;
}
@safe unittest
{
    bool eq(double a, double b, double epsilon = 0.2) @safe
    {
        return a >= (b - epsilon) && a <= (b + epsilon);
    }

    string canonical   = "6.8523015e+5";
    string exponential = "685.230_15e+03";
    string fixed       = "685_230.15";
    string sexagesimal = "190:20:30.15";
    string negativeInf = "-.inf";
    string NaN         = ".NaN";

    assert(eq(685230.15, constructReal(canonical)));
    assert(eq(685230.15, constructReal(exponential)));
    assert(eq(685230.15, constructReal(fixed)));
    assert(eq(685230.15, constructReal(sexagesimal)));
    assert(eq(-double.infinity, constructReal(negativeInf)));
    assert(to!string(constructReal(NaN)) == "nan");

    // Empty input, a lone sign and the dot-less special words are rejected.
    assertThrown!Exception(constructReal(""));
    assertThrown!Exception(constructReal("+"));
    assertThrown!Exception(constructReal("nan"));
    assertThrown!Exception(constructReal("inf"));
}
325 
// Construct a binary (base64) _node.
//
// All ASCII white space (spaces, tabs, carriage returns, line feeds, ...) is
// removed before decoding, independently of the platform's line ending, so
// wrapped or folded base64 text decodes the same everywhere.
//
// Throws: Exception if the remaining text is not valid base64.
ubyte[] constructBinary(const string value) @safe
{
    import std.ascii : isWhite;
    import std.array : array;

    // For an unknown reason, this must be nested to work (compiler bug?).
    try
    {
        return Base64.decode(value.representation.filter!(c => !isWhite(c)).array);
    }
    catch(Base64Exception e)
    {
        throw new Exception("Unable to decode base64 value: " ~ e.msg);
    }
}

@safe unittest
{
    auto test = "The Answer: 42".representation;
    char[] buffer;
    buffer.length = 256;
    string input = Base64.encode(test, buffer).idup;
    const value = constructBinary(input);
    assert(value == test);
    assert(value == [84, 104, 101, 32, 65, 110, 115, 119, 101, 114, 58, 32, 52, 50]);

    // White space of any kind inside the encoded text is ignored.
    assert(constructBinary("VGhl IEFu\r\nc3dl\tcjog\nNDI=") == test);
}
353 
// Construct a timestamp _node.
//
// The text is consumed in up to three steps: a mandatory year-month-day part,
// an optional hour:minute:second part with optional fraction, and an optional
// time zone ("Z" or a signed hour[:minute] offset).
//
// Throws: Exception if the date part is missing or any step fails.
Timestamp constructTimestamp(const string str) @safe
{
    import mir.conv: to;

    auto dateRegex = regex("^([0-9][0-9][0-9][0-9])-([0-9][0-9]?)-([0-9][0-9]?)");
    auto timeRegex = regex("^[Tt \t]+([0-9][0-9]?):([0-9][0-9]):([0-9][0-9])(\\.[0-9]*)?");
    auto zoneRegex = regex("^[ \t]*Z|([-+][0-9][0-9]?)(:[0-9][0-9])?");

    // Text that is still to be matched; this is also what the error message
    // of the catch block below reports.
    string rest = str;

    try
    {
        // Step 1: year, month and day (mandatory).
        auto dateMatch = match(rest, dateRegex);
        if(dateMatch.empty)
        {
            throw new Exception("Unable to parse timestamp value: " ~ rest);
        }

        auto date = dateMatch.front.captures;
        const year  = to!short(date[1]);
        const month = to!ubyte(date[2]);
        const day   = to!ubyte(date[3]);

        // Step 2: hour, minute, second and fraction, if present.
        rest = dateMatch.front.post;
        auto timeMatch = match(rest, timeRegex);
        if(timeMatch.empty)
        {
            return Timestamp(year, month, day);
        }

        auto time = timeMatch.front.captures;
        const hour   = to!byte(time[1]);
        const minute = to!byte(time[2]);
        const second = to!byte(time[3]);

        // Either empty or a '.' followed by zero or more digits.
        auto fractionText = time[4];
        Timestamp ret;
        if(fractionText.length > 1)
        {
            // Negated number of digits after the '.'.
            long fraction = 1 - fractionText.length;
            auto fractionCoefficient = fractionText[1 .. $].to!ulong;
            ret = Timestamp(year, month, day, hour, minute, second, cast(byte) fraction, fractionCoefficient);
        }
        else
        {
            ret = Timestamp(year, month, day, hour, minute, second);
        }

        // Step 3: time zone, if present.
        rest = timeMatch.front.post;
        auto zoneMatch = match(rest, zoneRegex);
        if(zoneMatch.empty || zoneMatch.front.captures[0] == "Z")
        {
            // No timezone.
            return ret;
        }

        // We have a timezone, so parse it.
        auto zone = zoneMatch.front.captures;
        int sign = 1;
        int tzHours;
        if(!zone[1].empty)
        {
            // zone[1] is the sign character followed by the hour digits.
            if(zone[1][0] == '-') {sign = -1;}
            tzHours = to!ubyte(zone[1][1 .. $]);
        }
        // zone[2], when present, is ':' followed by the minute digits.
        const tzMinutes = (!zone[2].empty) ? to!ubyte(zone[2][1 .. $]) : 0;
        const tzOffset  = sign * (60 * tzHours + tzMinutes);
        ret.offset = cast(short)tzOffset;
        ret.addMinutes(cast(short)-tzOffset);
        return ret;
    }
    catch(Exception e)
    {
        throw new Exception("Unable to parse timestamp value " ~ rest ~ " : " ~ e.msg);
    }
}
@safe unittest
{
    string timestamp(string value)
    {
        return constructTimestamp(value).toISOString();
    }

    // Each entry is a YAML timestamp and the ISO string expected for it.
    static immutable string[2][] cases = [
        // canonical
        ["2001-12-15T02:59:43.1Z",       "20011215T025943.1Z"],
        // ISO 8601 (avoiding float conversion errors)
        ["2001-12-14t21:59:43.10-05:00", "20011214T215943.10-05"],
        // space separated
        ["2001-12-14 21:59:43.10 -5",    "20011214T215943.10-05"],
        // no time zone
        ["2001-12-15 2:59:43.10",        "20011215T025943.10Z"],
        // no fraction
        ["2001-12-15 2:59:43",           "20011215T025943Z"],
        // date only
        ["2002-12-14",                   "20021214"],
    ];

    foreach(testCase; cases)
    {
        const actual = timestamp(testCase[0]);
        assert(actual == testCase[1], actual);
    }
}
448 
// Construct a string _node.
//
// The scalar text is returned as is.
string constructString(const string str) @safe
{
    string result = str;
    return result;
}
454 
// Convert a sequence of single-element mappings into a sequence of pairs.
//
// Params:  type  = Name of the data type being constructed; used in the error message.
//          nodes = Nodes to convert; each must be a mapping of length 1.
//
// Throws:  Exception if a node is not a mapping with exactly one element.
Node.Pair[] getPairs(string type, const Node[] nodes) @safe
{
    Node.Pair[] result;
    result.reserve(nodes.length);
    foreach(node; nodes)
    {
        const bool isSingleMapping = node.nodeID == NodeID.mapping && node.length == 1;
        if(!isSingleMapping)
        {
            throw new Exception("While constructing " ~ type ~
                                ", expected a mapping with single element");
        }

        result ~= node.as!(Node.Pair[]);
    }

    return result;
}
471 
// Construct an ordered map (ordered sequence of key:value pairs without duplicates) _node.
//
// Throws: Exception if a node is not a single-element mapping or a key occurs twice.
Node.Pair[] constructOrderedMap(const Node[] nodes) @safe
{
    auto result = getPairs("ordered map", nodes);

    // Keys seen so far, used to detect duplicates.
    //TODO this should be replaced by something with deterministic memory allocation.
    auto seen = redBlackTree!Node();
    foreach(ref entry; result)
    {
        if(entry.key in seen)
        {
            throw new Exception("Duplicate entry in an ordered map: "
                                ~ entry.key.debugString());
        }
        seen.insert(entry.key);
    }
    return result;
}
@safe unittest
{
    // Single-pair mappings whose keys alternate between integer and string.
    Node[] alternateTypes(uint length) @safe
    {
        Node[] mappings;
        foreach(long i; 0 .. length)
        {
            auto pair = (i % 2) ? Node.Pair(i.to!string, i) : Node.Pair(i, i.to!string);
            mappings ~= Node([pair]);
        }
        return mappings;
    }

    // Single-pair mappings whose keys are all strings.
    Node[] sameType(uint length) @safe
    {
        Node[] mappings;
        foreach(long i; 0 .. length)
        {
            auto pair = Node.Pair(i.to!string, i);
            mappings ~= Node([pair]);
        }
        return mappings;
    }

    assertThrown(constructOrderedMap(alternateTypes(8) ~ alternateTypes(2)));
    assertNotThrown(constructOrderedMap(alternateTypes(8)));
    assertThrown(constructOrderedMap(sameType(64) ~ sameType(16)));
    assertThrown(constructOrderedMap(alternateTypes(64) ~ alternateTypes(16)));
    assertNotThrown(constructOrderedMap(sameType(64)));
    assertNotThrown(constructOrderedMap(alternateTypes(64)));
}
520 
// Construct a pairs (ordered sequence of key: value pairs allowing duplicates) _node.
//
// Throws: Exception if a node is not a single-element mapping.
Node.Pair[] constructPairs(const Node[] nodes) @safe
{
    auto result = getPairs("pairs", nodes);
    return result;
}
526 
// Construct a set _node.
//
// Returns the keys of the given pairs, in their original order.
//
// Throws: Exception if a key occurs more than once.
Node[] constructSet(const Node.Pair[] pairs) @safe
{
    // In future, the map here should be replaced with something with deterministic
    // memory allocation if possible.
    // Only the keys of this map matter; it records which keys were already seen.
    ubyte[Node] seen;
    Node[] keys;
    keys.reserve(pairs.length);
    foreach(pair; pairs)
    {
        if((pair.key in seen) !is null)
        {
            throw new Exception("Duplicate entry in a set");
        }
        seen[pair.key] = 0;
        keys ~= pair.key;
    }

    return keys;
}
@safe unittest
{
    // Pairs with string keys "0" .. "length - 1" and null values.
    Node.Pair[] set(uint length) @safe
    {
        Node.Pair[] pairs;
        foreach(long i; 0 .. length)
        {
            pairs ~= Node.Pair(i.to!string, null);
        }

        return pairs;
    }

    auto DuplicatesShort   = set(8) ~ set(2);
    auto noDuplicatesShort = set(8);
    auto DuplicatesLong    = set(64) ~ set(4);
    auto noDuplicatesLong  = set(64);

    bool eq(Node.Pair[] a, Node[] b)
    {
        if(a.length != b.length){return false;}
        foreach(i; 0 .. a.length)
        {
            if(a[i].key != b[i])
            {
                return false;
            }
        }
        return true;
    }

    auto nodeDuplicatesShort   = DuplicatesShort.dup;
    auto nodeNoDuplicatesShort = noDuplicatesShort.dup;
    auto nodeDuplicatesLong    = DuplicatesLong.dup;
    auto nodeNoDuplicatesLong  = noDuplicatesLong.dup;

    assertThrown(constructSet(nodeDuplicatesShort));
    assertNotThrown(constructSet(nodeNoDuplicatesShort));
    assertThrown(constructSet(nodeDuplicatesLong));
    assertNotThrown(constructSet(nodeNoDuplicatesLong));
}
586 
// Construct a sequence (array) _node.
//
// The given nodes are returned as is.
Node[] constructSequence(Node[] nodes) @safe
{
    Node[] result = nodes;
    return result;
}
592 
// Construct an unordered map (unordered set of key:value _pairs without duplicates) _node.
//
// Returns the given pairs unchanged once every key has been checked.
//
// Throws: Exception if a key occurs more than once.
Node.Pair[] constructMap(Node.Pair[] pairs) @safe
{
    // Keys seen so far, used to detect duplicates.
    //TODO this should be replaced by something with deterministic memory allocation.
    auto seen = redBlackTree!Node();
    foreach(ref entry; pairs)
    {
        if(entry.key in seen)
        {
            throw new Exception("Duplicate entry in a map: " ~ entry.key.debugString());
        }
        seen.insert(entry.key);
    }
    return pairs;
}