//          Copyright Ferdinand Majerech 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// A minimal library providing functionality for changing the endianness of data.
module tinyendian;


import core.stdc.string;

import std.algorithm;
import std.system;
import std.utf;


/// Unicode UTF encodings.
enum UTFEncoding : ubyte
{
    UTF_8,
    UTF_16,
    UTF_32
}

// Swapping the byte order twice must restore the original values.
unittest
{
    const ints = [314, -101];
    int[2] intsSwapBuffer = ints;
    swapByteOrder(intsSwapBuffer[]);
    swapByteOrder(intsSwapBuffer[]);
    assert(ints == intsSwapBuffer, "Lost information when swapping byte order");

    const floats = [3.14f, 10.1f];
    float[2] floatsSwapBuffer = floats;
    swapByteOrder(floatsSwapBuffer[]);
    swapByteOrder(floatsSwapBuffer[]);
    assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");
}

// fixUTFByteOrder must detect the BOM, remove it, strip incomplete trailing code
// units and convert the remaining code units to system byte order.
// (Kept above the attribute label below so that it may allocate its test buffers.)
unittest
{
    // UTF-16 big endian: BOM, 'A', 'B' and one dangling byte.
    ubyte[] utf16 = [0xFE, 0xFF, 0x00, 0x41, 0x00, 0x42, 0x7F];
    auto fixed16 = fixUTFByteOrder(utf16);
    assert(fixed16.encoding == UTFEncoding.UTF_16);
    assert(fixed16.endian == Endian.bigEndian);
    assert(fixed16.bytesStripped == 1);
    assert(cast(wchar[])fixed16.array == "AB"w);

    // UTF-32 little endian: BOM and 'A'.
    ubyte[] utf32 = [0xFF, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00];
    auto fixed32 = fixUTFByteOrder(utf32);
    assert(fixed32.encoding == UTFEncoding.UTF_32);
    assert(fixed32.endian == Endian.littleEndian);
    assert(fixed32.bytesStripped == 0);
    assert(cast(dchar[])fixed32.array == "A"d);

    // UTF-8 with BOM: only the BOM is removed.
    ubyte[] utf8 = [0xEF, 0xBB, 0xBF, 'a', 'b'];
    auto fixed8 = fixUTFByteOrder(utf8);
    assert(fixed8.encoding == UTFEncoding.UTF_8);
    assert(fixed8.array == cast(ubyte[])"ab");

    // No BOM: treated as UTF-8, data left untouched.
    ubyte[] plain = ['h', 'i'];
    auto fixedPlain = fixUTFByteOrder(plain);
    assert(fixedPlain.encoding == UTFEncoding.UTF_8);
    assert(fixedPlain.bytesStripped == 0);
    assert(fixedPlain.array == cast(ubyte[])"hi");
}

@system pure nothrow @nogc:

/// Swap byte order of items in an array in place.
///
/// Params:
///
/// T     = Item type. Must be either 2 or 4 bytes long.
/// array = Buffer with values to fix byte order of.
void swapByteOrder(T)(T[] array)
    if([2, 4].canFind(T.sizeof))
{
    import core.bitop;
    // Swap the byte order of every item, reinterpreting it as raw bytes.
    foreach(ref item; array)
    {
        static if(T.sizeof == 2)
        {
            // Two bytes: exchange the low and the high byte.
            swap(*cast(ubyte*)&item, *(cast(ubyte*)&item + 1));
        }
        else static if(T.sizeof == 4)
        {
            // Four bytes: reverse them through a 32-bit integer view of the item.
            const swapped = bswap(*cast(uint*)&item);
            item = *cast(const(T)*)&swapped;
        }
        else static assert(false, "Unsupported T: " ~ T.stringof);
    }
}

/// Convert byte order of an array encoded in UTF(8/16/32) to system endianness in
/// place.
///
/// Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
/// at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
/// BOM, if any, will be removed from the buffer.
///
/// If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
/// for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
/// 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
///
/// Note that this function does $(B not) check if the array is a valid UTF string. It
/// only works with the BOM and 1, 2 or 4-byte items.
///
/// Params:
///
/// array = The array with UTF-data. Its contents are modified in place.
///
/// Returns:
///
/// A struct with the following members:
///
/// $(D ubyte[] array)            A slice of the input array containing data in correct
///                               byte order, without BOM and in case of UTF-16/UTF-32,
///                               without stripped bytes, if any.
/// $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
/// $(D std.system.Endian endian) Endianness of the original array. System endianness
///                               for a UTF-8 BOM; $(D Endian.init) if there is no BOM.
/// $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array,
///                               if any. This is non-zero only if array.length was not
///                               divisible by 2 or 4 for UTF-16 and UTF-32,
///                               respectively.
///
/// Complexity: O(array.length)
auto fixUTFByteOrder(ubyte[] array)
{
    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // These 2 are from std.stream
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    static immutable Endian[5] bomEndian = [ std.system.endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    // Documented in function ddoc.
    struct Result
    {
        ubyte[] array;
        UTFEncoding encoding;
        Endian endian;
        uint bytesStripped = 0;
    }
    Result result;

    // Detect BOM, if any, in the bytes we've read. BOM.None means no BOM.
    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
    BOM bomId = BOM.None;
    foreach(i, bom; byteOrderMarks) if(array.startsWith(bom))
    {
        bomId = cast(BOM)i;
    }

    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;

    // Start of UTF data (after BOM, if any)
    size_t start = 0;
    // Determine the encoding, the BOM size and the number of dangling bytes at the end
    // that do not form a whole code unit.
    with(BOM) final switch(bomId)
    {
        case None: result.encoding = UTFEncoding.UTF_8; break;
        case UTF_8:
            start = 3;
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_16_LE, UTF_16_BE:
            result.bytesStripped = array.length % 2;
            start = 2;
            result.encoding = UTFEncoding.UTF_16;
            break;
        case UTF_32_LE, UTF_32_BE:
            result.bytesStripped = array.length % 4;
            start = 4;
            result.encoding = UTFEncoding.UTF_32;
            break;
    }

    // Drop the incomplete trailing code unit, if any. A detected BOM guarantees that
    // at least `start` bytes remain after this.
    array = array[0 .. $ - result.bytesStripped];
    // If there's a BOM, we need to move data back to ensure it starts at array[0]
    if(start != 0)
    {
        // memmove (not memcpy): source and destination overlap.
        core.stdc.string.memmove(array.ptr, array.ptr + start, array.length - start);
        array = array[0 .. $ - start];
    }

    // We enforce above that array.length is divisible by 2/4 for UTF-16/32
    if(std.system.endian != result.endian)
    {
        if(result.encoding == UTFEncoding.UTF_16)      { swapByteOrder(cast(wchar[])array); }
        else if(result.encoding == UTFEncoding.UTF_32) { swapByteOrder(cast(dchar[])array); }
    }

    result.array = array;
    return result;
}