//          Copyright Ferdinand Majerech 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// A minimal library providing functionality for changing the endianness of data.
module tinyendian;


import core.stdc.string;

import std.algorithm;
import std.system;
import std.utf;


/// Unicode UTF encodings.
enum UTFEncoding : ubyte
{
    UTF_8,
    UTF_16,
    UTF_32
}

@system pure nothrow @nogc:

/// Swap byte order of items in an array in place.
///
/// Items are treated as opaque 2- or 4-byte blobs: the bytes of each item are
/// reversed in memory, regardless of whether T is an integer, a character, a
/// floating-point type or a small struct.
///
/// Params:
///
/// T     = Item type. Must be either 2 or 4 bytes long.
/// array = Buffer with values to fix byte order of.
void swapByteOrder(T)(T[] array)
    if(T.sizeof == 2 || T.sizeof == 4)
{
    foreach(ref item; array)
    {
        static if(T.sizeof == 2)
        {
            // Exchange the two bytes that make up the item.
            swap(*cast(ubyte*)&item, *(cast(ubyte*)&item + 1));
        }
        else static if(T.sizeof == 4)
        {
            import core.bitop : bswap;
            // Reinterpret the item's storage as a uint instead of converting its
            // value: a value cast would be wrong for non-integral T (e.g. float)
            // and would not compile for 4-byte aggregates.
            const uint swapped = bswap(*cast(const(uint)*)&item);
            item = *cast(const(T)*)&swapped;
        }
        else static assert(false, "Unsupported T: " ~ T.stringof);
    }
}

/// Convert byte order of an array encoded in UTF(8/16/32) to system endianness in
/// place.
///
/// Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
/// at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
/// BOM, if any, will be removed from the buffer.
///
/// If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
/// for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
/// 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
///
/// Note that this function does $(B not) check if the array is a valid UTF string. It
/// only works with the BOM and 1,2 or 4-byte items.
///
/// Params:
///
/// array = The array with UTF-data.
///
/// Returns:
///
/// A struct with the following members:
///
/// $(D ubyte[] array)            A slice of the input array containing data in correct
///                               byte order, without BOM and in case of UTF-16/UTF-32,
///                               without stripped bytes, if any.
/// $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
/// $(D std.system.Endian endian) Endianness of the original array. If no BOM was
///                               found this is $(D Endian.init).
/// $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array,
///                               if any. This is non-zero only if array.length was not
///                               divisible by 2 or 4 for UTF-16 and UTF-32,
///                               respectively.
///
/// Complexity: $(BIGOH array.length)
auto fixUTFByteOrder(ubyte[] array)
{
    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // These 2 are from std.stream
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    static immutable Endian[5] bomEndian = [ std.system.endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    // Documented in function ddoc.
    struct Result
    {
        ubyte[] array;
        UTFEncoding encoding;
        Endian endian;
        uint bytesStripped = 0;
    }
    Result result;

    // Detect BOM, if any, at the start of the array. BOM.None means no BOM.
    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
    BOM bomId = BOM.None;
    foreach(i, bom; byteOrderMarks) if(array.startsWith(bom))
    {
        bomId = cast(BOM)i;
    }

    // Without a BOM the data is treated as UTF-8, where byte order is irrelevant;
    // Endian.init is reported in that case.
    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;

    // Start of UTF data (after BOM, if any)
    size_t start = 0;
    // Determine the encoding, the BOM length and the number of trailing bytes that
    // do not form a whole code unit.
    with(BOM) final switch(bomId)
    {
        case None: result.encoding = UTFEncoding.UTF_8; break;
        case UTF_8:
            start = 3;
            result.encoding = UTFEncoding.UTF_8;
            break;
        case UTF_16_LE, UTF_16_BE:
            result.bytesStripped = array.length % 2;
            start = 2;
            result.encoding = UTFEncoding.UTF_16;
            break;
        case UTF_32_LE, UTF_32_BE:
            result.bytesStripped = array.length % 4;
            start = 4;
            result.encoding = UTFEncoding.UTF_32;
            break;
    }

    // Drop the incomplete trailing code unit, if any.
    array = array[0 .. $ - result.bytesStripped];
    // If there's a BOM, we need to move data back to ensure it starts at array[0]
    if(start != 0)
    {
        // memmove (not memcpy): source and destination overlap.
        core.stdc.string.memmove(array.ptr, array.ptr + start, array.length - start);
        array = array[0 .. $ - start];
    }

    // We enforce above that array.length is divisible by 2/4 for UTF-16/32
    if(std.system.endian != result.endian)
    {
        if(result.encoding == UTFEncoding.UTF_16)      { swapByteOrder(cast(wchar[])array); }
        else if(result.encoding == UTFEncoding.UTF_32) { swapByteOrder(cast(dchar[])array); }
    }

    result.array = array;
    return result;
}