//          Copyright Ferdinand Majerech 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// A minimal library providing functionality for changing the endianness of data.
module tinyendian;

import std.system : Endian, endian;

/// Unicode UTF encodings.
enum UTFEncoding : ubyte
{
    UTF_8,
    UTF_16,
    UTF_32
}
///
@safe unittest
{
    // Swapping twice must give back the original values, for integers...
    const ints = [314, -101];
    int[2] intsSwapBuffer = ints;
    swapByteOrder(intsSwapBuffer[]);
    swapByteOrder(intsSwapBuffer[]);
    assert(ints == intsSwapBuffer, "Lost information when swapping byte order");

    // ...and for 4-byte floating point values.
    const floats = [3.14f, 10.1f];
    float[2] floatsSwapBuffer = floats;
    swapByteOrder(floatsSwapBuffer[]);
    swapByteOrder(floatsSwapBuffer[]);
    assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");
}

/** Swap byte order of items in an array in place.
 *
 * Every element is reversed byte-wise independently of the others; an empty array
 * is left untouched.
 *
 * Params:
 *
 * T     = Item type. Must be either 2 or 4 bytes long.
 * array = Buffer with values to fix byte order of.
 */
void swapByteOrder(T)(T[] array) @trusted @nogc pure nothrow
if (T.sizeof == 2 || T.sizeof == 4)
{
    // NOTE(review): the 2-byte path writes through a ubyte* cast, so it would also
    // compile for immutable element types - confirm callers never pass those.
    foreach (ref item; array)
    {
        static if (T.sizeof == 2)
        {
            import std.algorithm.mutation : swap;
            // Exchange the two bytes that make up the item.
            auto bytes = cast(ubyte*)&item;
            swap(bytes[0], bytes[1]);
        }
        else static if (T.sizeof == 4)
        {
            import core.bitop : bswap;
            // Reinterpret the item as a uint, reverse its bytes, then store the
            // result back as a T.
            const swapped = bswap(*cast(uint*)&item);
            item = *cast(const(T)*)&swapped;
        }
        else static assert(false, "Unsupported T: " ~ T.stringof);
    }
}

/// Result of fixUTFByteOrder. See fixUTFByteOrder for details.
struct FixUTFByteOrderResult
{
    /// Slice of the input holding the data without BOM and without stripped bytes.
    ubyte[] array;
    /// Encoding determined from the BOM (UTF-8 if there was no BOM).
    UTFEncoding encoding;
    /// Endianness associated with the detected BOM.
    Endian endian;
    /// Number of trailing bytes dropped because they did not form a full code unit.
    uint bytesStripped = 0;
}

/** Convert byte order of an array encoded in UTF(8/16/32) to system endianness in place.
 *
 * Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
 * at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
 * BOM, if any, will be removed from the buffer.
 *
 * If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
 * for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
 * 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
 *
 * Note that this function does $(B not) check if the array is a valid UTF string. It
 * only works with the BOM and 1, 2 or 4-byte items.
 *
 * Params:
 *
 * array = The array with UTF-data.
 *
 * Returns:
 *
 * A struct with the following members:
 *
 * $(D ubyte[] array)            A slice of the input array containing data in correct
 *                               byte order, without BOM and in case of UTF-16/UTF-32,
 *                               without stripped bytes, if any.
 * $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
 * $(D std.system.Endian endian) Endianness of the original array as indicated by its
 *                               BOM. For a UTF-8 BOM this is the system endianness;
 *                               without any BOM it is $(D Endian.init).
 * $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array, if
 *                               any. This is non-zero only if array.length was not
 *                               divisible by 2 or 4 for UTF-16 and UTF-32, respectively.
 *
 * Complexity: $(BIGOH array.length)
 */
auto fixUTFByteOrder(ubyte[] array) @safe @nogc pure nothrow
{
    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // Byte sequences of the BOMs, indexed by BOM.
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    // Endianness implied by each BOM, indexed by BOM. UTF-8 has no byte order, so
    // it is mapped to the system endianness.
    static immutable Endian[5] bomEndian = [ endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    FixUTFByteOrderResult result;

    // Detect BOM, if any, at the start of the array. BOM.None means no BOM.
    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
    import std.algorithm.searching : startsWith;
    BOM bomId = BOM.None;
    foreach (i, bom; byteOrderMarks)
    {
        if (array.startsWith(bom))
            bomId = cast(BOM)i;
    }

    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;

    // Start of UTF data (after BOM, if any)
    size_t start = 0;
    // Determine the encoding, the BOM length and how many trailing bytes do not
    // form a complete code unit.
    with(BOM) final switch(bomId)
    {
        case None:
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_8:
            start = 3;
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_16_LE, UTF_16_BE:
            result.bytesStripped = array.length % 2;
            start = 2;
            result.encoding = UTFEncoding.UTF_16;
            break;
        case UTF_32_LE, UTF_32_BE:
            result.bytesStripped = array.length % 4;
            start = 4;
            result.encoding = UTFEncoding.UTF_32;
            break;
    }

    // Drop the BOM and any incomplete trailing code unit by re-slicing; no data is
    // moved. Without a BOM both start and bytesStripped are 0, so this is a no-op.
    array = array[start .. $ - result.bytesStripped];

    // The slice length is now divisible by 2/4 for UTF-16/32, so the casts below
    // reinterpret whole code units only.
    if (endian != result.endian)
    {
        if (result.encoding == UTFEncoding.UTF_16)
            swapByteOrder(cast(wchar[])array);
        else if (result.encoding == UTFEncoding.UTF_32)
            swapByteOrder(cast(dchar[])array);
    }

    result.array = array;
    return result;
}
///
@safe unittest
{
    {
        ubyte[] s = [0xEF, 0xBB, 0xBF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        // A UTF-8 BOM reports the endianness of the system running the code.
        assert(r.endian == endian);
    }

    {
        ubyte[] s = ['a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        // No BOM: endian is left at Endian.init.
        assert(r.endian == Endian.bigEndian);
    }

    {
        // strip 'a' b/c not complete unit
        ubyte[] s = [0xFE, 0xFF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_16);
        assert(r.array.length == 0);
        assert(r.endian == Endian.bigEndian);
    }
}