dyaml.reader source code

1 
2 //          Copyright Ferdinand Majerech 2011-2014.
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 
7 module dyaml.reader;
8 
9 
10 import core.stdc.stdlib;
11 import core.stdc..string;
12 import core.thread;
13 
14 import std.algorithm;
15 import std.array;
16 import std.conv;
17 import std.exception;
18 import std.range;
19 import std..string;
20 import std.system;
21 import std.typecons;
22 import std.utf;
23 
24 import tinyendian;
25 
26 import dyaml.encoding;
27 import dyaml.exception;
28 
/// Checks whether a character is one of the line break characters '\n',
/// '\u0085', '\u2028' or '\u2029'. Note that '\r' is not in this list; callers
/// that care about carriage returns test for it separately.
///
/// This is an instantiation of std.algorithm.among, so the result is 0 when the
/// character matches none of the values and the 1-based position of the matching
/// value otherwise; it can be used directly as a condition.
alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');
30 
31 package:
32 
33 
/// Exception thrown when the Reader encounters an error.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params:  msg  = Description of the problem; it is prefixed with
    ///                 "Reader error: " before being passed on.
    ///          file = Forwarded to the YAMLException constructor; defaults to the
    ///                 file of the call site.
    ///          line = Forwarded to the YAMLException constructor; defaults to the
    ///                 line of the call site.
    this(string msg, string file = __FILE__, size_t line = __LINE__) @safe pure nothrow
    {
        const description = "Reader error: " ~ msg;
        super(description, file, line);
    }
}
43 
/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
///
/// The input is converted to UTF-8 once, in the constructor. After that the Reader
/// keeps two positions in sync: a byte position (bufferOffset_) and a code point
/// position (charIndex_). Runs of ASCII are tracked in upcomingASCII_ so that they can
/// be read without UTF-8 decoding.
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_;

        // Index of the current character in the buffer.
        size_t charIndex_;
        // Number of characters (code points) in buffer_.
        size_t characterCount_;

        // File name
        string name_;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding.
        size_t upcomingASCII_;

        // Index to buffer_ where the last decoded character starts.
        size_t lastDecodedBufferOffset_;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///          name   = File name if the buffer is the contents of a file or
        ///                   `"<unknown>"` if the buffer is the contents of a string.
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer, string name = "<unknown>") @safe pure
        {
            name_ = name;
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0)            { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            // index counts bytes, so it must be checked against the number of bytes
            // left, not the number of code points left (the two differ as soon as the
            // remaining data contains a multi-byte sequence). Written as a subtraction
            // so that a huge index cannot overflow the comparison.
            const inRange = bufferOffset_ < buffer_.length &&
                            index < buffer_.length - bufferOffset_;
            return inRange ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return bufferOffset_ < buffer_.length ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params: length = Number bytes (not code points) to get. May NOT reach past
        ///                  the end of the buffer; should be used with peek() to avoid
        ///                  this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        in(length == 0 || bufferOffset_ + length <= buffer_.length, "prefixBytes out of bounds")
        {
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_   = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Must not be called at the end of the buffer: peek() reports the end by
        /// returning '\0', but forward() has no character left to move over.
        ///
        /// Returns: Next character.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line and column counters on the way: '\n', a '\r' that is not
        /// followed by '\n', and the non-ASCII breaks listed in isBreak start a new
        /// line; a byte order mark ('\uFEFF') does not advance the column.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_     += asciiToTake;
                length         -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    // A '\r' directly followed by '\n' is not counted; the '\n' is.
                    if(c == '\n' || (c == '\r' && !nextByteIsLF()))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. c is non-ASCII here, so only the Unicode breaks can match.
                if(c.isBreak)
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        ///
        /// Line and column counters are updated the same way as in forward(size_t).
        void forward() @safe pure
        {
            ++charIndex_;
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                // A '\r' directly followed by '\n' is not counted; the '\n' is.
                if(c == '\n' || (c == '\r' && !nextByteIsLF()))
                {
                    ++line_;
                    column_ = 0;
                    return;
                }
                ++column_;
                return;
            }

            // UTF-8
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            const c = decode(buffer_, bufferOffset_);

            // New line. c is non-ASCII here, so only the Unicode breaks can match.
            if(c.isBreak)
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }

            checkASCII();
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a string describing current buffer position, used for error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(name_, line_, column_); }

        /// Get file name.
        string name() const @safe pure nothrow @nogc { return name_; }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence).
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Is there a byte at bufferOffset_, and is it '\n'?
        //
        // Used right after consuming a '\r' to tell "\r\n" from a lone '\r'. When the
        // '\r' was the last byte of the buffer there is nothing to look at, and the
        // answer is false instead of an out-of-bounds read.
        bool nextByteIsLF() const @safe pure nothrow @nogc
        {
            return bufferOffset_ < buffer_.length && buffer_[bufferOffset_] == '\n';
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decode(buffer_, lastDecodedBufferOffset_);
        }
}
447 
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
private:
    // No copying by the user.
    @disable this(this);
    @disable void opAssign(ref SliceBuilder);

    // Reader this builder works in.
    Reader reader_;

    // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t start_ = size_t.max;
    // End of the slice in reader_.buffer_ (size_t.max while no slice is being built)
    size_t end_   = size_t.max;

    // Stack of slice ends to revert to (see Transaction)
    //
    // Very few levels as we don't want arbitrarily nested transactions.
    size_t[4] endStack_;
    // The number of elements currently in endStack_.
    size_t endStackUsed_;

    // While a slice is being built it must lie entirely before the Reader's current
    // buffer position.
    @safe const pure nothrow @nogc invariant()
    {
        if(!inProgress) { return; }
        assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
        assert(start_ <= end_, "Slice start after slice end");
    }

    // Is a slice currently being built?
    bool inProgress() @safe const pure nothrow @nogc
    in(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max, "start_/end_ are not consistent")
    {
        return start_ != size_t.max;
    }

public:
    /// Begin building a slice.
    ///
    /// Only one slice can be built at any given time; before beginning a new slice,
    /// finish the previous one (if any).
    ///
    /// The slice starts at the current position in the Reader buffer. It can only be
    /// extended up to the current position in the buffer; Reader methods get() and
    /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
    /// a string just returned by get() - but not one returned by prefix() unless the
    /// position has changed since the prefix() call.
    void begin() @safe pure nothrow @nogc
    in(!inProgress, "Beginning a slice while another slice is being built")
    in(endStackUsed_ == 0, "Slice stack not empty at slice begin")
    {

        start_ = reader_.bufferOffset_;
        end_   = reader_.bufferOffset_;
    }

    /// Finish building a slice and return it.
    ///
    /// Any Transactions on the slice must be committed or destroyed before the slice
    /// is finished.
    ///
    /// Returns a string; once a slice is finished it is definitive that its contents
    /// will not be changed.
    char[] finish() @safe pure nothrow @nogc
    in(inProgress, "finish called without begin")
    in(endStackUsed_ == 0, "Finishing a slice with running transactions.")
    {

        auto result = reader_.buffer_[start_ .. end_];
        start_ = end_ = size_t.max;
        return result;
    }

    /// Write a string to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// If str is a string returned by a Reader method, and str starts right after the
    /// end of the slice being built, the slice is extended (trivial operation).
    ///
    /// See_Also: begin
    void write(scope char[] str) @safe pure nothrow @nogc
    {
        assert(inProgress, "write called without begin");
        assert(end_ <= reader_.bufferOffset_,
               "AT START: Slice ends after buffer position");

        // Nothing? Already done.
        if (str.length == 0) { return; }
        // If str starts at the end of the slice (is a string returned by a Reader
        // method), just extend the slice to contain str.
        if(&str[0] == &reader_.buffer_[end_])
        {
            end_ += str.length;
        }
        // Even if str does not start at the end of the slice, it still may be returned
        // by a Reader method and point to buffer, so source and target may overlap.
        else
        {
            copy(str, reader_.buffer_[end_ .. end_ + str.length]);
            end_ += str.length;
        }
    }

    /// Write a character to the slice being built.
    ///
    /// Data can only be written up to the current position in the Reader buffer.
    ///
    /// See_Also: begin
    void write(dchar c) @safe pure
    in(inProgress, "write called without begin")
    {
        if(c < 0x80)
        {
            reader_.buffer_[end_++] = cast(char)c;
            return;
        }

        // We need to encode a non-ASCII dchar into UTF-8
        char[4] encodeBuf;
        const bytes = encode(encodeBuf, c);
        reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Insert a character to a specified position in the slice.
    ///
    /// Enlarges the slice by the UTF-8 length of the character (1 to 4 chars). Note
    /// that the slice can only extend up to the current position in the Reader buffer.
    ///
    /// Params:
    ///
    /// c        = The character to insert.
    /// position = Position to insert the character at in code units, not code points.
    ///            Must not be greater than slice length(); a previously returned
    ///            length() can be used. Inserting at length() appends.
    void insert(const dchar c, const size_t position) @safe pure
    in(inProgress, "insert called without begin")
    in(start_ + position <= end_, "Trying to insert after the end of the slice")
    {

        const point       = start_ + position;
        const movedLength = end_ - point;

        // Encode c into UTF-8
        char[4] encodeBuf;
        if(c < 0x80) { encodeBuf[0] = cast(char)c; }
        const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

        // Shift the tail of the slice to make room for the encoded character.
        if(movedLength > 0)
        {
            copy(reader_.buffer_[point .. point + movedLength],
                    reader_.buffer_[point + bytes .. point + bytes + movedLength]);
        }
        reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
        end_ += bytes;
    }

    /// Get the current length of the slice.
    size_t length() @safe const pure nothrow @nogc
    {
        return end_ - start_;
    }

    /// A slice building transaction.
    ///
    /// Can be used to save and revert back to slice state.
    struct Transaction
    {
    private:
        // The slice builder affected by the transaction.
        SliceBuilder* builder_;
        // Index of the return point of the transaction in SliceBuilder.endStack_.
        size_t stackLevel_;
        // True after commit() has been called.
        bool committed_;

    public:
        /// Begins a transaction on a SliceBuilder object.
        ///
        /// The transaction must end $(B after) any transactions created within the
        /// transaction but $(B before) the slice is finish()-ed. A transaction can be
        /// ended either by commit()-ing or by reverting it with end().
        ///
        /// Saves the current state of a slice.
        this(SliceBuilder* builder) @safe pure nothrow @nogc
        {
            builder_ = builder;
            stackLevel_ = builder_.endStackUsed_;
            builder_.push();
        }

        /// Commit changes to the slice.
        ///
        /// Ends the transaction - can only be called once, and removes the possibility
        /// to revert slice state.
        ///
        /// Does nothing for a default-initialized transaction (the transaction has not
        /// been started yet).
        void commit() @safe pure nothrow @nogc
        in(!committed_, "Can't commit a transaction more than once")
        {

            if(builder_ is null) { return; }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.apply();
            committed_ = true;
        }

        /// Destroy the transaction and revert it if it hasn't been committed yet.
        ///
        /// Does nothing (apart from detaching from the builder) if the transaction
        /// has already been committed, has already been ended, or was
        /// default-initialized: in all of those cases this transaction no longer owns
        /// an entry on the builder's stack, so there is nothing to revert.
        void end() @safe pure nothrow @nogc
        {
            if(builder_ is null || committed_)
            {
                builder_ = null;
                return;
            }
            assert(builder_.endStackUsed_ == stackLevel_ + 1,
                   "Parent transactions don't fully contain child transactions");
            builder_.pop();
            builder_ = null;
        }

    }

private:
    // Push the current end of the slice so we can revert to it if needed.
    //
    // Used by Transaction.
    void push() @safe pure nothrow @nogc
    in(inProgress, "push called without begin")
    in(endStackUsed_ < endStack_.length, "Slice stack overflow")
    {
        endStack_[endStackUsed_++] = end_;
    }

    // Pop the current end of endStack_ and set the end of the slice to the popped
    // value, reverting changes since the old end was pushed.
    //
    // Used by Transaction.
    void pop() @safe pure nothrow @nogc
    in(inProgress, "pop called without begin")
    in(endStackUsed_ > 0, "Trying to pop an empty slice stack")
    {
        end_ = endStack_[--endStackUsed_];
    }

    // Pop the current end of endStack_, but keep the current end of the slice, applying
    // changes made since pushing the old end.
    //
    // Used by Transaction.
    void apply() @safe pure nothrow @nogc
    in(inProgress, "apply called without begin")
    in(endStackUsed_ > 0, "Trying to apply an empty slice stack")
    {
        --endStackUsed_;
    }
}
707 
708 
709 private:
710 
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// UTF-8 input is validated and returned as is, UTF-32 input is re-encoded inside the
// input buffer, and UTF-16 input is converted into a newly allocated array.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and the character count to.
    //
    // Throws on invalid input; the caller turns that into result.errorMessage.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes, so the write position never overtakes the read position.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Write position in utf8; size_t so it can index a buffer of any size.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                // std.utf.encode returns the number of code units it wrote.
                const bytes = std.utf.encode(encodeBuf, c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    // Conversion problems are reported through errorMessage, keeping this nothrow.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
807 
/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// A code point is accepted when it is a valid dchar and is either whitespace or not
/// a control character. The check stops at the first code point that is rejected.
///
/// Params:  chars = UTF-8 text to check; it is decoded code point by code point.
///
/// Returns: true if every code point is accepted (also for an empty string),
///          false otherwise.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    import std.uni : isControl, isWhite;

    foreach (dchar codePoint; chars)
    {
        const acceptable = codePoint.isValidDchar &&
                           (codePoint.isWhite || !codePoint.isControl);
        if (!acceptable) { return false; }
    }
    return true;
}
821 
/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
///
/// Params:  buffer = UTF-8 data to scan from its start.
///
/// Returns: Number of leading code units that are not greater than 0x7F; this is
///          buffer.length when the whole buffer is ASCII.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    size_t asciiCount = 0;
    // Iterating with a char loop variable walks code units; nothing is decoded.
    foreach (const char unit; buffer)
    {
        if (unit > 0x7F) { break; }
        ++asciiCount;
    }
    return asciiCount;
}
829 // Unittests.
830 
// Checks that the reader type R detects both the encoding and the byte order of
// UTF-16 input from its byte order mark.
void testEndian(R)()
{
    // Builds a reader over input and compares what it detected with expectations.
    void expectDetected(ubyte[] input, Encoding wantedEncoding, Endian wantedOrder)
    {
        auto reader = new R(input);
        assert(reader.encoding == wantedEncoding);
        assert(reader.endian_ == wantedOrder);
    }

    // A byte order mark followed by the letter 'z', once per byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    expectDetected(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    expectDetected(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
844 
// Exercises peek(), prefix() and forward() of the reader type R on the text "data"
// preceded by a UTF-8 byte order mark.
void testPeekPrefixForward(R)()
{
    import std.encoding : BOM, bomTable;

    ubyte[] input = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(input);

    // The byte order mark is not visible; reading starts at the text itself.
    assert(reader.peek() == 'd');
    foreach(offset, char expected; "ata")
    {
        assert(reader.peek(offset + 1) == expected);
    }
    // One past the last character yields '\0' instead of throwing.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    // After skipping "da" the remaining text is "ta".
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}
861 
// Feeds the text "data" to the reader type R as UTF-8, UTF-16 and UTF-32 (the last
// two in the byte order of the machine running the test, announced by a matching
// byte order mark) and checks that it reads back the same characters.
void testUTF(R)()
{
    import std.encoding : BOM, bomTable;

    // Prefixes the raw bytes of text with the given byte order mark and checks what
    // a reader constructed from the result peeks.
    void expectData(T)(T[] text, BOM mark)
    {
        auto payload = (cast(ubyte[])text)[0 .. text.length * T.sizeof];
        ubyte[] bytes = bomTable[mark].sequence ~ payload;
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    dchar[] source = cast(dchar[])"data";
    const bigEndianHost = endian == Endian.bigEndian;

    expectData!char(to!(char[])(source), BOM.utf8);
    expectData!wchar(to!(wchar[])(source), bigEndianHost ? BOM.utf16be : BOM.utf16le);
    expectData(source, bigEndianHost ? BOM.utf32be : BOM.utf32le);
}
880 
// Checks the reader type R on the smallest non-empty input: the single byte 'a'.
void test1Byte(R)()
{
    ubyte[] singleByte = [97];
    auto reader = new R(singleByte);

    assert(reader.peek() == 'a');
    // Anything past the only character reads as '\0'.
    assert(reader.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}
890 
// Runs every generic reader test above against the Reader class.
@system unittest
{
    alias Tested = Reader;

    testEndian!Tested();
    testPeekPrefixForward!Tested();
    testUTF!Tested();
    test1Byte!Tested();
}
// Issue 257 - https://github.com/dlang-community/D-YAML/issues/257
//
// A scalar followed by a trailing space must load as a string.
@safe unittest
{
    import dyaml.loader : Loader;

    auto root = Loader.fromString("hello ").load();
    assert(root._is!string);
}