tinyendian source code

1 //          Copyright Ferdinand Majerech 2014.
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 /// A minimal library providing functionality for changing the endianness of data.
7 module tinyendian;
8 
9 
10 import core.stdc.string;
11 
12 import std.algorithm;
13 import std.system;
14 import std.utf;
15 
16 
/// The Unicode transformation formats (UTF encodings) distinguished by this module.
///
/// Stored in a single byte.
enum UTFEncoding : ubyte
{
    /// 8-bit code units.
    UTF_8  = 0,
    /// 16-bit code units.
    UTF_16 = 1,
    /// 32-bit code units.
    UTF_32 = 2
}
24 
// Tests swapByteOrder(): a single swap must really reverse the bytes of each item,
// and swapping twice must restore the original data.
unittest
{
    // Single swap with known bit patterns. A round-trip check alone would also pass
    // for an implementation that does nothing at all.
    ushort[2] shorts = [0x1234, 0xABCD];
    const ushort[2] expectedShorts = [0x3412, 0xCDAB];
    swapByteOrder(shorts[]);
    assert(shorts == expectedShorts, "2-byte items were not byte-swapped");

    uint[2] words = [0x11223344, 0x000000FF];
    const uint[2] expectedWords = [0x44332211, 0xFF000000];
    swapByteOrder(words[]);
    assert(words == expectedWords, "4-byte items were not byte-swapped");

    // An empty slice must be accepted and left empty.
    int[] nothing;
    swapByteOrder(nothing);
    assert(nothing.length == 0, "Empty array changed by swapping byte order");

    // Round trips: swapping twice must give back the original values.
    const ints = [314, -101];
    int[2] intsSwapBuffer = ints;
    swapByteOrder(intsSwapBuffer[]);
    swapByteOrder(intsSwapBuffer[]);
    assert(ints == intsSwapBuffer, "Lost information when swapping byte order");

    const floats = [3.14f, 10.1f];
    float[2] floatsSwapBuffer = floats;
    swapByteOrder(floatsSwapBuffer[]);
    swapByteOrder(floatsSwapBuffer[]);
    assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");
}
39 
40 @system pure nothrow @nogc:
41 
/// Reverse the byte order of every item of an array, modifying the array in place.
///
/// Params:
///
/// T     = Type of the items. Its size must be exactly 2 or 4 bytes.
/// array = Items whose bytes are to be reversed.
void swapByteOrder(T)(T[] array)
    if([2, 4].canFind(T.sizeof))
{
    import core.bitop : bswap;

    static if(T.sizeof == 2)
    {
        // Two bytes per item: exchange them directly.
        foreach(ref element; array)
        {
            ubyte* bytes = cast(ubyte*)&element;
            const first = bytes[0];
            bytes[0] = bytes[1];
            bytes[1] = first;
        }
    }
    else static if(T.sizeof == 4)
    {
        // Four bytes per item: reinterpret the item as a uint, reverse it with
        // bswap and store the reversed bits back through the item's own type.
        foreach(ref element; array)
        {
            const uint reversed = bswap(*cast(uint*)&element);
            element = *cast(const(T)*)&reversed;
        }
    }
    else static assert(false, "Unsupported T: " ~ T.stringof);
}
67 
/// Convert byte order of an array encoded in UTF(8/16/32) to system endianness in
/// place.
///
/// Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
/// at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
/// BOM, if any, will be removed from the buffer.
///
/// If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
/// for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
/// 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
///
/// Note that this function does $(B not) check if the array is a valid UTF string. It
/// only works with the BOM and 1,2 or 4-byte items.
///
/// Params:
///
/// array = The array with UTF-data. Its contents are modified: data following a BOM
///         is moved to the start of the buffer and, if needed, byte-swapped.
///
/// Returns:
///
/// A struct with the following members:
///
/// $(D ubyte[] array)            A slice of the input array containing data in correct
///                               byte order, without BOM and in case of UTF-16/UTF-32,
///                               without stripped bytes, if any.
/// $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
/// $(D std.system.Endian endian) Endianness of the original array as indicated by its
///                               BOM. A UTF-8 BOM reports the system endianness; if
///                               there is no BOM at all, this is $(D Endian.init).
/// $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array,
///                               if any. This is non-zero only if array.length was not
///                               divisible by 2 or 4 for UTF-16 and UTF-32,
///                               respectively.
///
/// Complexity: O(array.length)
auto fixUTFByteOrder(ubyte[] array)
{
    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // These 2 are from std.stream
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    static immutable Endian[5] bomEndian = [ std.system.endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    // Documented in function ddoc.
    struct Result
    {
        ubyte[] array;
        UTFEncoding encoding;
        Endian endian;
        uint bytesStripped = 0;
    }
    Result result;

    // Detect BOM, if any, at the start of the array. BOM.None means no BOM.
    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
    BOM bomId = BOM.None;
    foreach(i, bom; byteOrderMarks) if(array.startsWith(bom))
    {
        bomId = cast(BOM)i;
    }

    // Without a BOM no endianness can be looked up; Endian.init is reported then.
    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;

    // Start of UTF data (after BOM, if any)
    size_t start = 0;
    // Determine the encoding, the BOM length and the number of trailing bytes that
    // do not form a whole code unit.
    with(BOM) final switch(bomId)
    {
        case None: result.encoding = UTFEncoding.UTF_8; break;
        case UTF_8:
            start = 3;
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_16_LE, UTF_16_BE:
            result.bytesStripped = array.length % 2;
            start = 2;
            result.encoding = UTFEncoding.UTF_16;
            break;
        case UTF_32_LE, UTF_32_BE:
            result.bytesStripped = array.length % 4;
            start = 4;
            result.encoding = UTFEncoding.UTF_32;
            break;
    }

    // Drop the incomplete trailing code unit, if any.
    array = array[0 .. $ - result.bytesStripped];
    // If there's a BOM, we need to move data back to ensure it starts at array[0]
    if(start != 0)
    {
        // memmove (not memcpy): source and destination overlap within the same buffer.
        core.stdc.string.memmove(array.ptr, array.ptr + start, array.length - start);
        array = array[0 .. $ - start];
    }

    // We enforce above that array.length is divisible by 2/4 for UTF-16/32
    if(std.system.endian != result.endian)
    {
        if(result.encoding == UTFEncoding.UTF_16)      { swapByteOrder(cast(wchar[])array); }
        else if(result.encoding == UTFEncoding.UTF_32) { swapByteOrder(cast(dchar[])array); }
    }

    result.array = array;
    return result;
}