tinyendian source code

1 //          Copyright Ferdinand Majerech 2014.
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 /// A minimal library providing functionality for changing the endianness of data.
7 module tinyendian;
8 
9 
10 import core.stdc.string;
11 
12 import std.algorithm;
13 import std.system;
14 import std.utf;
15 
// Compatibility shim: on front-ends reporting a version below 2066, declare a
// private placeholder symbol named `nogc` so that `@nogc` annotations in this
// module resolve to a (do-nothing) user-defined attribute.
static if(__VERSION__ < 2066)
{
    private enum nogc;
}
18 
/// Unicode UTF encodings, identified by the size of their code unit.
enum UTFEncoding : ubyte
{
    /// 1-byte code units.
    UTF_8  = 0,
    /// 2-byte code units.
    UTF_16 = 1,
    /// 4-byte code units.
    UTF_32 = 2
}
26 
unittest
{
    // --- 4-byte integers -------------------------------------------------
    const ints = [314, -101];
    int[2] intsSwapBuffer = ints;
    swapByteOrder(intsSwapBuffer[]);
    // A round-trip check alone would also pass for a no-op implementation, so
    // verify the result of a single swap as well.
    // 314 == 0x0000013A and -101 == 0xFFFFFF9B.
    assert(intsSwapBuffer[0] == 0x3A010000 &&
           intsSwapBuffer[1] == cast(int)0x9BFFFFFF,
           "4-byte swap did not reverse the byte order");
    swapByteOrder(intsSwapBuffer[]);
    assert(ints == intsSwapBuffer, "Lost information when swapping byte order");

    // --- 4-byte floats ---------------------------------------------------
    const floats = [3.14f, 10.1f];
    float[2] floatsSwapBuffer = floats;
    swapByteOrder(floatsSwapBuffer[]);
    swapByteOrder(floatsSwapBuffer[]);
    assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");

    // --- 2-byte items (the other supported item size) ---------------------
    ushort[2] shortsSwapBuffer = [0x1234, 0xABCD];
    swapByteOrder(shortsSwapBuffer[]);
    assert(shortsSwapBuffer[0] == 0x3412 && shortsSwapBuffer[1] == 0xCDAB,
           "2-byte swap did not reverse the byte order");
    swapByteOrder(shortsSwapBuffer[]);
    assert(shortsSwapBuffer[0] == 0x1234 && shortsSwapBuffer[1] == 0xABCD,
           "Lost information when swapping byte order");

    // --- Empty input must be accepted and left untouched -------------------
    int[] empty;
    swapByteOrder(empty);
    assert(empty.length == 0);
}
41 
42 @nogc @system pure nothrow:
43 
/** Reverse the byte order of every item of an array, in place.
 *
 * Only 2-byte and 4-byte item types are accepted (enforced by the template
 * constraint). An empty array is left untouched.
 *
 * Params:
 *
 * T     = Item type. Must be either 2 or 4 bytes long.
 * array = Buffer whose items get their bytes reversed.
 */
void swapByteOrder(T)(T[] array)
    if(T.sizeof == 2 || T.sizeof == 4)
{
    import core.bitop;

    foreach(ref element; array)
    {
        static if(T.sizeof == 4)
        {
            // Reinterpret the item as a 32-bit word, reverse it with bswap and
            // store the reversed bits back as a T.
            const reversed = bswap(*cast(uint*)&element);
            element = *cast(const(T)*)&reversed;
        }
        else static if(T.sizeof == 2)
        {
            // Only two bytes: exchange them directly.
            ubyte* bytes = cast(ubyte*)&element;
            const ubyte first = bytes[0];
            bytes[0] = bytes[1];
            bytes[1] = first;
        }
        else static assert(false, "Unsupported T: " ~ T.stringof);
    }
}
70 
/** Convert byte order of an array encoded in UTF(8/16/32) to system endianness in place.
 *
 * Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
 * at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
 * BOM, if any, will be removed from the buffer.
 *
 * If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
 * for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
 * 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
 *
 * Note that this function does $(B not) check if the array is a valid UTF string. It
 * only works with the BOM and 1,2 or 4-byte items.
 *
 * Params:
 *
 * array = The array with UTF-data. Its contents are modified: data following a BOM
 *         is moved to the start of the buffer and may be byte-swapped.
 *
 * Returns:
 *
 * A struct with the following members:
 *
 * $(D ubyte[] array)            A slice of the input array containing data in correct
 *                               byte order, without BOM and in case of UTF-16/UTF-32,
 *                               without stripped bytes, if any.
 * $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
 * $(D std.system.Endian endian) Endianness of the original array. When no BOM is
 *                               present this is $(D Endian.init); when a UTF-8 BOM
 *                               is present it is the system endianness.
 * $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array, if
 *                               any. This is non-zero only if array.length was not
 *                               divisible by 2 or 4 for UTF-16 and UTF-32, respectively.
 *
 * Complexity: $(BIGOH array.length)
 */
auto fixUTFByteOrder(ubyte[] array)
{
    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // These 2 are from std.stream
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    static immutable Endian[5] bomEndian = [ std.system.endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    // Documented in function ddoc.
    struct Result
    {
        ubyte[] array;
        UTFEncoding encoding;
        Endian endian;
        uint bytesStripped = 0;
    }
    Result result;

    // Detect BOM, if any, at the start of the array. BOM.None means no BOM.
    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
    BOM bomId = BOM.None;
    foreach(i, bom; byteOrderMarks) if(array.startsWith(bom))
    {
        bomId = cast(BOM)i;
    }

    // Without a BOM the data is treated as UTF-8 below, for which no byte swapping
    // is ever done, so Endian.init only serves as a placeholder here.
    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;

    // Start of UTF data (after BOM, if any)
    size_t start = 0;
    // Determine the encoding, the BOM length and how many trailing bytes do not form
    // a complete code unit.
    with(BOM) final switch(bomId)
    {
        case None: result.encoding = UTFEncoding.UTF_8; break;
        case UTF_8:
            start = 3;
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_16_LE, UTF_16_BE:
            result.bytesStripped = array.length % 2;
            start = 2;
            result.encoding = UTFEncoding.UTF_16;
            break;
        case UTF_32_LE, UTF_32_BE:
            result.bytesStripped = array.length % 4;
            start = 4;
            result.encoding = UTFEncoding.UTF_32;
            break;
    }

    // Drop the incomplete trailing code unit, if any. A matched BOM is itself a whole
    // number of code units, so at least `start` bytes always remain.
    array = array[0 .. $ - result.bytesStripped];
    // If there's a BOM, we need to move data back to ensure it starts at array[0]
    if(start != 0)
    {
        // memmove (not memcpy): source and destination overlap.
        core.stdc.string.memmove(array.ptr, array.ptr + start, array.length - start);
        array = array[0 .. $ - start];
    }

    // We enforce above that array.length is divisible by 2/4 for UTF-16/32
    if(std.system.endian != result.endian)
    {
        if(result.encoding == UTFEncoding.UTF_16)      { swapByteOrder(cast(wchar[])array); }
        else if(result.encoding == UTFEncoding.UTF_32) { swapByteOrder(cast(dchar[])array); }
    }

    result.array = array;
    return result;
}