chenjunfu2-nbt-cpp v2.1.0
一个基于CPP20的NBT(Named Binary Tag)库
载入中...
搜索中...
未找到
MUTF8_Tool.hpp
浏览该文件的文档.
1#pragma once
2
3#include <string>
4#include <type_traits>
5#include <assert.h>
6#include <stdint.h>
7#include <stddef.h>//size_t
8#include <array>
9#include <algorithm>
10
13
14//来个static string包装类,使得模板能接受字符串字面量
15//必须放在外面,否则NTTP推导主类模板会失败,
16//导致此并不依赖主类模板的模板也推导失败
17
21{
26 template<typename T, size_t N>
27 class StringLiteral : public std::array<T, N>
28 {
29 public:
31 using Super = std::array<T, N>;
32
33 public:
34
38 constexpr StringLiteral(const T(&_tStr)[N]) noexcept : Super(std::to_array(_tStr))
39 {}
40
43 constexpr ~StringLiteral(void) = default;
44 };
45
50 template<auto ArrayData, typename View_Type>
51 consteval View_Type ToStringView(void) noexcept
52 {
53 return { ArrayData.data(), ArrayData.size() };
54 }
55}
56
62template<typename MU8T = uint8_t, typename U16T = char16_t, typename U8T = char8_t>
63class MUTF8_Tool
64{
 //The three character types must have the code-unit width of their encoding
65 static_assert(sizeof(MU8T) == 1, "MU8T size must be at 1 byte");
66 static_assert(sizeof(U16T) == 2, "U16T size must be at 2 bytes");
67 static_assert(sizeof(U8T) == 1, "U8T size must be at 1 bytes");
68
 //Construction and destruction are deleted: the class is only used through its static members
69 MUTF8_Tool(void) = delete;
70 ~MUTF8_Tool(void) = delete;
71
72 //Replacement character emitted on malformed input: 0xFFFD as a UTF-16 unit, 0xEF 0xBF 0xBD in MUTF-8 and UTF-8 form
73 static inline constexpr MU8T mu8FailChar[3]{ (MU8T)0xEF, (MU8T)0xBF, (MU8T)0xBD };
74 static inline constexpr U16T u16FailChar = (U16T)0xFFFD;
75 static inline constexpr U8T u8FailChar[3]{ (U8T)0xEF, (U8T)0xBF, (U8T)0xBD };
76
77public:
 //Public aliases of the three template character types
79 using MU8_T = MU8T;
81 using U16_T = U16T;
83 using U8_T = U8T;
84
85private:
//Counting stand-in for a basic_string. It offers the append / append_cvt / push_back
//calls the converters use, discards the data and only adds up how many elements would
//have been written. Running a converter with this type therefore yields the exact
//output length from the very same routine, with no separate length implementation.
template<typename T>
class FakeStringCounter
{
private:
	size_t szCounter = 0;//number of elements "written" so far

public:
	//Resets the tally to zero.
	constexpr void clear(void) noexcept
	{
		szCounter = 0;
	}

	//Counts szSize elements; the pointer is ignored.
	constexpr FakeStringCounter &append(const T *const, size_t szSize) noexcept
	{
		szCounter = szCounter + szSize;
		return *this;
	}

	//Same as append for a source of another element type; counting does not
	//depend on compile-time versus run-time evaluation, so there is no special case.
	template<typename U>
	constexpr FakeStringCounter &append_cvt(const U *const, size_t szSize) noexcept
	{
		szCounter = szCounter + szSize;
		return *this;
	}

	//Counts a single element; the value is ignored.
	constexpr void push_back(const T &) noexcept
	{
		++szCounter;
	}

	//Returns the number of elements counted so far.
	constexpr const size_t &GetData(void) const noexcept
	{
		return szCounter;
	}
};
125
//Fixed-capacity stand-in for a basic_string, used to build a converted string inside
//a std::array<T, N>. Writes that would not fit are dropped as a whole; nothing is
//ever written past the end of the array.
template<typename T, size_t N>
class StaticString
{
public:
	using ARRAY_TYPE = std::array<T, N>;

private:
	ARRAY_TYPE arrData{};//storage, value-initialised
	size_t szIndex = 0;//number of elements written so far

	//True when szLength more elements fit. Written as a subtraction so the check
	//itself cannot overflow (szIndex never exceeds the array size).
	constexpr bool HasRoom(size_t szLength) const noexcept
	{
		return szLength <= arrData.size() - szIndex;
	}

public:
	//Rewinds the write position; the array contents are left as they are.
	constexpr void clear(void) noexcept
	{
		szIndex = 0;
	}

	//Copies szLength elements from pData to the current write position.
	//Does nothing when szLength is 0 or the data does not fit completely.
	constexpr StaticString &append(const T *const pData, size_t szLength) noexcept
	{
		if (szLength == 0 || !HasRoom(szLength))
		{
			return *this;
		}

		std::copy_n(pData, szLength, arrData.data() + szIndex);
		szIndex += szLength;
		return *this;
	}

	//Like append, for a source of another element type. Every element is converted
	//by value, at compile time and at run time alike, so the source is never read
	//through a pointer of a different type and differing element sizes are handled.
	template<typename U>
	constexpr StaticString &append_cvt(const U *const pData, size_t szLength) noexcept
	{
		if (szLength == 0 || !HasRoom(szLength))
		{
			return *this;
		}

		std::transform(pData, pData + szLength, arrData.data() + szIndex,
			[](const U &u) -> T
			{
				return static_cast<T>(u);
			});
		szIndex += szLength;
		return *this;
	}

	//Appends one element; ignored when the array is already full.
	constexpr void push_back(const T &tData) noexcept
	{
		if (!HasRoom(1))
		{
			return;
		}

		arrData[szIndex] = tData;
		szIndex += 1;
	}

	//Returns the whole backing array (all N elements, not only the written part).
	constexpr const ARRAY_TYPE &GetData(void) const noexcept
	{
		return arrData;
	}
};
200
//Adapter that gives a real string type the append_cvt / GetData calls the converters
//expect. It derives publicly from String, so every ordinary string member stays
//available and the object converts implicitly to its base.
//No destructor is declared on purpose: declaring one would suppress the implicit move
//operations and turn every by-value return of this type into a deep copy.
template<typename String>
class DynamicString : public String
{
public:
	//Creates an empty string and reserves room for szReserve elements.
	//Deliberately not explicit: callers pass `{ szReserve }` for a by-value parameter.
	DynamicString(size_t szReserve = 0) : String()
	{
		String::reserve(szReserve);
	}

	//Appends szLength elements of another same-sized character type by reinterpreting
	//the pointer as the string's own element type.
	template<typename U>
	DynamicString &append_cvt(const U *const pData, size_t szLength)
	{
		using ValueType = typename String::value_type;
		static_assert(sizeof(U) == sizeof(ValueType), "append_cvt requires source and destination elements of equal size");

		String::append(reinterpret_cast<const ValueType *>(pData), szLength);
		return *this;
	}

	//Lvalue access: a reference to the underlying string.
	constexpr const String &GetData(void) const & noexcept
	{
		return *this;
	}

	//Rvalue access: moves the underlying string out, so
	//`Convert(...).GetData()` hands over the buffer instead of copying it.
	constexpr String GetData(void) && noexcept(std::is_nothrow_move_constructible_v<String>)
	{
		return static_cast<String &&>(*this);
	}
};
224
225private:
226 template<size_t szBytes>
227 static constexpr void EncodeMUTF8Bmp(const U16T u16Char, MU8T(&mu8CharArr)[szBytes])
228 {
229 if constexpr (szBytes == 1)
230 {
231 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0111'1111) >> 0) | (uint16_t)0b0000'0000);//0 + 6-0 7bit
232 }
233 else if constexpr (szBytes == 2)
234 {
235 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0111'1100'0000) >> 6) | (uint16_t)0b1100'0000);//110 + 10-6 5bit
236 mu8CharArr[1] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0011'1111) >> 0) | (uint16_t)0b1000'0000);//10 + 5-0 6bit
237 }
238 else if constexpr (szBytes == 3)
239 {
240 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b1111'0000'0000'0000) >> 12) | (uint16_t)0b1110'0000);//1110 + 15-12 4bit
241 mu8CharArr[1] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'1111'1100'0000) >> 6) | (uint16_t)0b1000'0000);//10 + 11-6 6bit
242 mu8CharArr[2] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0011'1111) >> 0) | (uint16_t)0b1000'0000);//10 + 5-0 6bit
243 }
244 else
245 {
246 static_assert(false, "Error szBytes Size");//大小错误
247 }
248 }
249
250 static constexpr void EncodeMUTF8Supplementary(const U16T u16HighSurrogate, const U16T u16LowSurrogate, MU8T(&mu8CharArr)[6])
251 {
252 //取出代理对数据并组合 范围:1'0000 ~ 10'FFFF 通过u16代理对组成:高代理10位低代理10位,
253 //最后加上0x1'0000得到此范围,也就是从utf16组成的0x0'0000 ~ 0xF'FFFF + 0x1'0000得到
254 uint32_t u32RawChar = ((uint32_t)((uint16_t)u16HighSurrogate & (uint16_t)0b0000'0011'1111'1111)) << 10 |//10bit
255 ((uint32_t)((uint16_t)u16LowSurrogate & (uint16_t)0b0000'0011'1111'1111)) << 0; //10bit
256
257 //因为mutf8直接存储utf16的代理对位,不进行+0x1'0000运算
258 //u32RawChar += (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为加法而非位运算
259
260 //高代理
261 mu8CharArr[0] = (uint8_t)0b1110'1101;//固定字节
262 mu8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'1111'0000'0000'0000'0000) >> 16) | (uint32_t)0b1010'0000);//1010 + 19-16 4bit
263 mu8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'1111'1100'0000'0000) >> 10) | (uint32_t)0b1000'0000);//10 + 15-10 6bit
264 //低代理
265 mu8CharArr[3] = (uint8_t)0b1110'1101;//固定字节
266 mu8CharArr[4] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1100'0000) >> 6) | (uint32_t)0b1011'0000);//1011 + 9-6 4bit
267 mu8CharArr[5] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
268 }
269
270 template<size_t szBytes>
271 static constexpr void DecodeMUTF8Bmp(const MU8T(&mu8CharArr)[szBytes], U16T &u16Char)
272 {
273 if constexpr (szBytes == 1)
274 {
275 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0111'1111)) << 0;//7bit
276 }
277 else if constexpr (szBytes == 2)
278 {
279 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0001'1111)) << 6 |//5bit
280 ((uint16_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0011'1111)) << 0; //6bit
281 }
282 else if constexpr (szBytes == 3)
283 {
284 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0000'1111)) << 12 |//4bit
285 ((uint16_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0011'1111)) << 6 |//6bit
286 ((uint16_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 0; //6bit
287 }
288 else
289 {
290 static_assert(false, "Error szBytes Size");//大小错误
291 }
292 }
293
294 static constexpr void DecodeMUTF8Supplementary(const MU8T(&mu8CharArr)[6], U16T &u16HighSurrogate, U16T &u16LowSurrogate)
295 {
296 uint32_t u32RawChar = //mu8CharArr[0] ignore 固定字节忽略
297 ((uint32_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0000'1111)) << 16 |//4bit
298 ((uint32_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 10 |//6bit
299 //mu8CharArr[3] ignore 固定字节忽略
300 ((uint32_t)((uint8_t)mu8CharArr[4] & (uint8_t)0b0000'1111)) << 6 |//4bit
301 ((uint32_t)((uint8_t)mu8CharArr[5] & (uint8_t)0b0011'1111)) << 0 ;//6bit
302
303 //因为mutf8直接存储utf16的代理对位,不进行-0x1'0000运算
304 //u32RawChar -= (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为减法而非位运算
305
306 //解析到高低代理
307 //范围1'0000-10'FFFF
308 u16HighSurrogate = (uint32_t)((u32RawChar & (uint32_t)0b0000'0000'0000'1111'1111'1100'0000'0000) >> 10 | (uint32_t)0b1101'1000'0000'0000);//1101'10 + 19-10 10bit
309 u16LowSurrogate = (uint32_t)((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1111'1111) >> 0 | (uint32_t)0b1101'1100'0000'0000);//1101'11 + 9-0 10bit
310 }
311
312 /*
313 4bytes utf-8 bit distribution:(utf8直接存储utf32的映射,需要两次转换到mutf8)
314 000u'uuuu zzzz'yyyy yyxx'xxxx - 1111'0uuu 10uu'zzzz 10yy'yyyy 10xx'xxxx
315
316 6bytes modified utf-8 bit distribution:(mutf8直接存储了utf16的映射,不需要增减0x1'0000,所以要比utf8小一位)
317 0000'uuuu zzzz'yyyy yyxx'xxxx - 1110'1101 1010'uuuu 10zz'zzyy 1110'1101 1011'yyyy 10xx'xxxx
318 */
319
320 static constexpr void UTF8SupplementaryToMUTF8(const U8T(&u8CharArr)[4], MU8T(&mu8CharArr)[6])
321 {
322 //先把utf8映射到utf32,减去0x1'0000,得到utf16的中间形式,再进行下一步转换
323 uint32_t u32RawChar = ((uint32_t)((uint8_t)u8CharArr[0] & (uint8_t)0b0000'0111)) << 18 |//3bit
324 ((uint32_t)((uint8_t)u8CharArr[1] & (uint8_t)0b0011'1111)) << 12 |//6bit
325 ((uint32_t)((uint8_t)u8CharArr[2] & (uint8_t)0b0011'1111)) << 6 |//6bit
326 ((uint32_t)((uint8_t)u8CharArr[3] & (uint8_t)0b0011'1111)) << 0; //6bit
327
328 //删除位
329 u32RawChar -= (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为减法而非位运算
330
331 //高代理
332 mu8CharArr[0] = (uint8_t)0b1110'1101;//固定字节
333 mu8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'1111'0000'0000'0000'0000) >> 16) | (uint32_t)0b1010'0000);//1010 + 19-16 4bit
334 mu8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'1111'1100'0000'0000) >> 10) | (uint32_t)0b1000'0000);//10 + 15-10 6bit
335 //低代理
336 mu8CharArr[3] = (uint8_t)0b1110'1101;//固定字节
337 mu8CharArr[4] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1100'0000) >> 6) | (uint32_t)0b1011'0000);//1011 + 9-6 4bit
338 mu8CharArr[5] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
339 }
340
341 static constexpr void MUTF8SupplementaryToUTF8(const MU8T(&mu8CharArr)[6], U8T(&u8CharArr)[4])
342 {
343 //先把mutf8映射到utf16组成的中间形式的utf32,再加上0x1'0000得到utf8的utf32表示形式
344 uint32_t u32RawChar = //mu8CharArr[0] ignore 固定字节忽略
345 ((uint32_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0000'1111)) << 16 |//4bit
346 ((uint32_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 10 |//6bit
347 //mu8CharArr[3] ignore 固定字节忽略
348 ((uint32_t)((uint8_t)mu8CharArr[4] & (uint8_t)0b0000'1111)) << 6 |//4bit
349 ((uint32_t)((uint8_t)mu8CharArr[5] & (uint8_t)0b0011'1111)) << 0 ;//6bit
350
351 //恢复位
352 u32RawChar += (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为加法而非位运算
353
354 //转换到utf8
355 u8CharArr[0] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0001'1100'0000'0000'0000'0000) >> 18) | (uint32_t)0b1111'0000);//1111'0 + 20-18 3bit
356 u8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0011'1111'0000'0000'0000) >> 12) | (uint32_t)0b1000'0000);//10 + 17-12 6bit
357 u8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'1111'1100'0000) >> 6) | (uint32_t)0b1000'0000);//10 + 11-6 6bit
358 u8CharArr[3] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
359 }
360
361private:
363
 //Helper macros shared by the conversion routines below.
 //GET_NEXTCHAR relies on the names `it` and `end` of the enclosing for loop and contains a `break`,
 //so it may only be used directly inside that loop.
364//c=char d=do: advance the iterator and, if a byte is available, store it in c; otherwise run statement d and break out of the loop
365#define GET_NEXTCHAR(c,d) if (++it == end) { (d);break; } else { (c) = *it; }
366//v=value m=mask p=pattern t=test: HAS_BITMASK checks whether the masked bits of v equal p; IS_BITS checks whether v equals t exactly (both compare as uint8_t)
367#define HAS_BITMASK(v,m,p) (((uint8_t)(v) & (uint8_t)(m)) == (uint8_t)(p))
368#define IS_BITS(v,t) ((uint8_t)(v) == (uint8_t)(t))
369//v=value b=begin e=end: note that the range is inclusive on BOTH ends, not the usual half-open range (compares as uint16_t)
370#define IN_RANGE(v,b,e) (((uint16_t)(v) >= (uint16_t)(b)) && ((uint16_t)(v) <= (uint16_t)(e)))
371
373
 //Converts szStringLength UTF-16 code units starting at u16String to Modified UTF-8.
 //The bytes are appended to mu8String through append(const MU8T*, size_t) and that object is returned,
 //so T may be a real string, a counter or a fixed array wrapper.
 //A surrogate without its partner is replaced by the bytes of mu8FailChar (EF BF BD).
374 template<typename T = std::basic_string<MU8T>>
375 static constexpr T U16ToMU8Impl(const U16T *u16String, size_t szStringLength, T mu8String = {})
376 {
 //Appends the 3-byte replacement sequence; local to this function and removed again by the #undef at its end
378#define PUSH_FAIL_MU8CHAR mu8String.append(mu8FailChar, sizeof(mu8FailChar) / sizeof(MU8T))
380
381 //The input carries an explicit length, so the empty case needs no special handling: the for loop is not entered and mu8String is returned as it was passed in
382 //In MUTF-8 a terminating NUL is written as 0xC0 0x80 rather than 0x00
383
384 for (auto it = u16String, end = u16String + szStringLength; it != end; ++it)
385 {
386 U16T u16Char = *it;//1st read
387 if (IN_RANGE(u16Char, 0x0001, 0x007F))//one-byte code point
388 {
389 MU8T mu8Char[1]{};
390 EncodeMUTF8Bmp(u16Char, mu8Char);
391 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
392 }
393 else if (IN_RANGE(u16Char, 0x0080, 0x07FF) || u16Char == 0x0000)//two-byte code point; U+0000 is special-cased into this form
394 {
395 MU8T mu8Char[2]{};
396 EncodeMUTF8Bmp(u16Char, mu8Char);
397 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
398 }
399 else if (IN_RANGE(u16Char, 0x0800, 0xFFFF))//three-byte code point or part of a surrogate pair
400 {
401 if (IN_RANGE(u16Char, 0xD800, 0xDBFF))//found a high surrogate
402 {
403 U16T u16HighSurrogate = u16Char;//keep the high surrogate
404 U16T u16LowSurrogate{};//will receive the low surrogate
405 GET_NEXTCHAR(u16LowSurrogate, (PUSH_FAIL_MU8CHAR));//2nd read
406 //If the read above left the loop, nothing followed the high surrogate and the replacement character has been appended
407
408 //check the low-surrogate range
409 if (!IN_RANGE(u16LowSurrogate, 0xDC00, 0xDFFF))//error: the high surrogate is not followed by a low surrogate
410 {
411 --it;//step back one read so the unit that is not a low surrogate is examined again
412 PUSH_FAIL_MU8CHAR;//append the replacement character
413 continue;//retry: the for loop's ++it lands on that same unit again
414 }
415
416 //surrogate pairs are special: 6 bytes in total represent one supplementary code point
417 MU8T mu8Char[6]{};
418 EncodeMUTF8Supplementary(u16HighSurrogate, u16LowSurrogate, mu8Char);
419 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
420 }
421 else//a low surrogate with no preceding high surrogate, or any other valid three-byte character
422 {
423 if (IN_RANGE(u16Char, 0xDC00, 0xDFFF))//low surrogate met before any high surrogate
424 {
425 //no step back: the stray low surrogate is dropped
426 PUSH_FAIL_MU8CHAR;//error, append the replacement character
427 continue;//go on with the next unit
428 }
429
430 //convert the three-byte character
431 MU8T mu8Char[3]{};
432 EncodeMUTF8Bmp(u16Char, mu8Char);
433 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
434 }
435 }
436 else
437 {
438 assert(false);//not expected to be reached: the three ranges above cover every 16-bit value
439 }
440 }
441
442 return mu8String;
443
444#undef PUSH_FAIL_MU8CHAR
445 }
446
 //Converts szStringLength Modified UTF-8 bytes starting at mu8String to UTF-16.
 //The code units are appended to u16String through push_back and that object is returned,
 //so T may be a real string, a counter or a fixed array wrapper.
 //Malformed input is replaced by u16FailChar (0xFFFD); after a failed sequence the byte that
 //broke it is examined again as a possible start of the next sequence.
447 template<typename T = std::basic_string<U16T>>
448 static constexpr T MU8ToU16Impl(const MU8T *mu8String, size_t szStringLength, T u16String = {})
449 {
 //Appends one replacement code unit; local to this function and removed again by the #undef at its end
451#define PUSH_FAIL_U16CHAR u16String.push_back(u16FailChar)
453
454 //The input carries an explicit length, so the empty case needs no special handling: the for loop is not entered and u16String is returned as it was passed in
455 //A UTF-16 string ends with 0x0000
456
457 for (auto it = mu8String, end = mu8String + szStringLength; it != end; ++it)
458 {
459 MU8T mu8Char = *it;//1st read
460 //work out how many bytes this MUTF-8 sequence has
461 if (HAS_BITMASK(mu8Char, 0b1000'0000, 0b0000'0000))//top bit is 0: one-byte code point
462 {
463 //put it into an array
464 MU8T mu8CharArr[1] = { mu8Char };
465
466 //convert
467 U16T u16Char{};
468 DecodeMUTF8Bmp(mu8CharArr, u16Char);
469 u16String.push_back(u16Char);
470 }
471 else if (HAS_BITMASK(mu8Char, 0b1110'0000, 0b1100'0000))//top 3 bits are 110: two-byte code point
472 {
473 //store the first byte
474 MU8T mu8CharArr[2] = { mu8Char };//[0]=mu8Char
475 //try to fetch the next byte
476 GET_NEXTCHAR(mu8CharArr[1], (PUSH_FAIL_U16CHAR));//2nd read
477 //validate the byte
478 if (!HAS_BITMASK(mu8CharArr[1], 0b1100'0000, 0b1000'0000))//top 2 bits are not 10: error, skip
479 {
480 --it;//undo the read (so the loop increment does not skip the byte just fetched)
481 PUSH_FAIL_U16CHAR;//replace with the UTF-16 replacement character
482 continue;//retry: the lead byte may be the wrong one while the byte just fetched starts a valid sequence, so step back and examine it again
483 }
484
485 //convert
486 U16T u16Char{};
487 DecodeMUTF8Bmp(mu8CharArr, u16Char);
488 u16String.push_back(u16Char);
489 }
490 else if (HAS_BITMASK(mu8Char, 0b1111'0000, 0b1110'0000))//top 4 bits are 1110: three-byte code point or surrogate form
491 {
492 //fetch the next byte early; it decides whether this is a surrogate pair
493 MU8T mu8Next{};
494 GET_NEXTCHAR(mu8Next, (PUSH_FAIL_U16CHAR));//2nd read
495
496 //validation (tells surrogates apart from ordinary three-byte code points)
497 //A value starting with D800 is a high surrogate and is never produced as an ordinary three-byte code point, so a second byte of the form 0b1010'xxxx always means a surrogate pair
498 //In other words: when the low 4 bits of the lead byte are 1101 and the top 4 bits of the next byte are 1010, the two top payload bits of that 10xx'xxxx byte are 10,
499 //which together gives 1101'10xx, i.e. 0xD8, the start of a UTF-16 high surrogate; the encoder sends surrogate pairs down a separate path, so they cannot be confused with three-byte code points
500 if (IS_BITS(mu8Char, 0b1110'1101) && HAS_BITMASK(mu8Next, 0b1111'0000, 0b1010'0000))//surrogate pair; this test must come first, it matters!
501 {
502 //store in an array
503 MU8T mu8CharArr[6] = { mu8Char,mu8Next };//[0] = mu8Char, [1] = mu8Next
504
505 //read and validate the remaining 4 bytes
506
507 //next: the low 6 bits of the high surrogate
508 GET_NEXTCHAR(mu8CharArr[2],
509 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//3rd read
510 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))
511 {
512 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
513 --it;
514 //replace with two UTF-16 replacement characters
515 PUSH_FAIL_U16CHAR;
516 PUSH_FAIL_U16CHAR;
517 continue;
518 }
519
520 //next: the fixed byte
521 GET_NEXTCHAR(mu8CharArr[3],
522 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//4th read
523 if (!IS_BITS(mu8CharArr[3], 0b1110'1101))
524 {
525 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
526 --it;
527 //replace with three UTF-16 replacement characters
528 PUSH_FAIL_U16CHAR;
529 PUSH_FAIL_U16CHAR;
530 PUSH_FAIL_U16CHAR;
531 continue;
532 }
533
534 //next: the high 4 bits of the low surrogate
535 GET_NEXTCHAR(mu8CharArr[4],
536 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//5th read
537 if (!HAS_BITMASK(mu8CharArr[4], 0b1111'0000, 0b1011'0000))
538 {
539 //undo two reads: the previous byte was confirmed as 0b1110'1101, but a valid three-byte code point can start with 111, so it has to be examined again
540 --it;
541 --it;
542 //replace with three UTF-16 replacement characters: because two reads were undone, only 3 bytes are discarded instead of 4
543 PUSH_FAIL_U16CHAR;
544 PUSH_FAIL_U16CHAR;
545 PUSH_FAIL_U16CHAR;
546 continue;
547 }
548
549 //read the last byte: the low 6 bits of the low surrogate
550 GET_NEXTCHAR(mu8CharArr[5],
551 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//6th read
552 if (!HAS_BITMASK(mu8CharArr[5], 0b1100'0000, 0b1000'0000))
553 {
554 //undo one read only: the previous byte, confirmed to start with 101, cannot begin a valid code point, and the byte before it starts with 111, i.e. there is no three-byte form in which 111 is followed by 101; skip them
555 --it;
556 //replace with five UTF-16 replacement characters
557 PUSH_FAIL_U16CHAR;
558 PUSH_FAIL_U16CHAR;
559 PUSH_FAIL_U16CHAR;
560 PUSH_FAIL_U16CHAR;
561 PUSH_FAIL_U16CHAR;
562 continue;
563 }
564
565 //every check passed: convert the surrogate pair
566 U16T u16HighSurrogate{}, u16LowSurrogate{};
567 DecodeMUTF8Supplementary(mu8CharArr, u16HighSurrogate, u16LowSurrogate);
568 u16String.push_back(u16HighSurrogate);
569 u16String.push_back(u16LowSurrogate);
570 }
571 else if(HAS_BITMASK(mu8Next, 0b1100'0000, 0b1000'0000))//three-byte code point, the only possibility left once surrogate pairs are excluded; check that the byte is a 10xxxxxx trailing byte
572 {
573 //store
574 MU8T mu8CharArr[3] = { mu8Char,mu8Next };//[0] = mu8Char, [1] = mu8Next
575
576 //try to fetch the next byte
577 GET_NEXTCHAR(mu8CharArr[2],
578 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//3rd read
579 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))//error: the last byte of the three-byte code point is not a valid trailing byte
580 {
581 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
582 --it;
583 //replace with two UTF-16 replacement characters
584 PUSH_FAIL_U16CHAR;
585 PUSH_FAIL_U16CHAR;
586 continue;
587 }
588
589 //all 3 bytes are ready: convert
590 U16T u16Char{};
591 DecodeMUTF8Bmp(mu8CharArr, u16Char);
592 u16String.push_back(u16Char);
593 }
594 else
595 {
596 //Undo the read of mu8Next. mu8Char has already been examined and reaching this point
597 //proves it is invalid; stepping back as far as mu8Char would loop forever on the same error,
598 //so only step back to mu8Next: the for loop's ++it then examines that byte again
599 --it;
600 //replace with one UTF-16 replacement character
601 PUSH_FAIL_U16CHAR;
602 continue;
603 }
604 }
605 else
606 {
607 //unknown lead byte: skip and ignore until the next valid start byte is found
608 //replace with one UTF-16 replacement character
609 PUSH_FAIL_U16CHAR;
610 continue;
611 }
612 }
613
614 return u16String;
615
616#undef PUSH_FAIL_U16CHAR
617 }
618
619 /*
620 Modified UTF-8 differs from "standard" UTF-8 in two ways:
621 First, the null character (char)0 uses the two-byte form 0xC0 0x80 instead of the one-byte form 0x00,
622 so a Modified UTF-8 string never contains an embedded null;
623
624 Second, only the one-, two- and three-byte forms of standard UTF-8 are used.
625 The Java virtual machine does not recognise the four-byte form of standard UTF-8
626 and uses its own two-times-three-byte (6-byte surrogate pair) form instead.
627 */
628
 //Converts szStringLength UTF-8 bytes starting at u8String to Modified UTF-8.
 //Only two things are rewritten: 4-byte sequences (to the 6-byte surrogate form) and 0x00 (to 0xC0 0x80).
 //Every other byte is counted in szNormalLength and copied through unchanged in one append_cvt call per run.
 //A 4-byte sequence that is cut short or has a bad trailing byte yields one mu8FailChar per byte consumed.
 //NOTE(review): trailing bytes are only checked for the 10xxxxxx pattern; the decoded value itself is not
 //range-checked before it is handed to UTF8SupplementaryToMUTF8.
629 template<typename T = DynamicString<std::basic_string<MU8T>>>
630 static constexpr T U8ToMU8Impl(const U8T *u8String, size_t szStringLength, T mu8String = {})
631 {
 //PUSH_FAIL_MU8CHAR appends the 3-byte replacement sequence.
 //INSERT_NORMAL(p) appends the szNormalLength pending bytes that end just before p and resets the counter.
 //Both are local to this function and removed again by the #undef lines at its end
633#define PUSH_FAIL_MU8CHAR mu8String.append(mu8FailChar, sizeof(mu8FailChar) / sizeof(MU8T))
634#define INSERT_NORMAL(p) (mu8String.append_cvt((p) - szNormalLength, szNormalLength), szNormalLength = 0)
636
637 size_t szNormalLength = 0;//length of the pending run of ordinary bytes, used to insert them in one batch
638 for (auto it = u8String, end = u8String + szStringLength; it != end; ++it)
639 {
640 //UTF-8 to MUTF-8: handle the UTF-8 null character and turn 4-byte UTF-8 into 6-byte MUTF-8
641 U8T u8Char = *it;//1st read
642 if (HAS_BITMASK(u8Char, 0b1111'1000, 0b1111'0000))//top 5 bits are 11110: 4-byte UTF-8
643 {
644 INSERT_NORMAL(it);//before handling it, insert the ordinary bytes skipped so far
645 //convert the 4 UTF-8 bytes to 6 MUTF-8 bytes, handling errors
646
647 U8T u8CharArr[4]{ u8Char };//[0] = u8Char
648
649 GET_NEXTCHAR(u8CharArr[1], (PUSH_FAIL_MU8CHAR));//2nd read
650 if (!HAS_BITMASK(u8CharArr[1], 0b1100'0000, 0b1000'0000))//make sure the top 2 bits are 10
651 {
652 //emit one replacement character
653 PUSH_FAIL_MU8CHAR;
654 --it;//undo one read so the byte that does not start with 10 is processed again
655 continue;
656 }
657
658 GET_NEXTCHAR(u8CharArr[2],
659 (PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR));//3rd read
660 if (!HAS_BITMASK(u8CharArr[2], 0b1100'0000, 0b1000'0000))//make sure the top 2 bits are 10
661 {
662 //emit two replacement characters
663 PUSH_FAIL_MU8CHAR;
664 PUSH_FAIL_MU8CHAR;
665 --it;//undo one read so the byte that does not start with 10 is processed again
666 continue;
667 }
668
669 GET_NEXTCHAR(u8CharArr[3],
670 (PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR));//4th read
671 if (!HAS_BITMASK(u8CharArr[3], 0b1100'0000, 0b1000'0000))//make sure the top 2 bits are 10
672 {
673 //emit three replacement characters
674 PUSH_FAIL_MU8CHAR;
675 PUSH_FAIL_MU8CHAR;
676 PUSH_FAIL_MU8CHAR;
677 --it;//undo one read so the byte that does not start with 10 is processed again
678 continue;
679 }
680
681 //all four bytes were read successfully
682 MU8T mu8CharArr[6]{};
683 UTF8SupplementaryToMUTF8(u8CharArr, mu8CharArr);
684 mu8String.append(mu8CharArr, sizeof(mu8CharArr) / sizeof(MU8T));
685 }
686 else if (IS_BITS(u8Char, 0b0000'0000))//the \0 character
687 {
688 INSERT_NORMAL(it);//before handling it, insert the ordinary bytes skipped so far
689
690 MU8T mu8EmptyCharArr[2] = { (MU8T)0xC0,(MU8T)0x80 };//the fixed MUTF-8 encoding of the null character
691 mu8String.append(mu8EmptyCharArr, sizeof(mu8EmptyCharArr) / sizeof(MU8T));
692 }
693 else//neither of the above: extend the run of ordinary bytes; it is inserted when a special byte is met
694 {
695 ++szNormalLength;
696 }
697 }
698 //Insert once more after the loop, because the loop may never have entered one of the special branches,
699 //and since the loop ran to the end, the end of the input is used as the current position
700 INSERT_NORMAL(u8String + szStringLength);
701
702
703 return mu8String;
704
705#undef INSERT_NORMAL
706#undef PUSH_FAIL_MU8CHAR
707 }
708
 //Converts szStringLength Modified UTF-8 bytes starting at mu8String to UTF-8.
 //Only two things are rewritten: the 6-byte surrogate form (to 4-byte UTF-8) and 0xC0 0x80 (to 0x00).
 //Every other byte is counted in szNormalLength and copied through unchanged in one append_cvt call per run.
 //A surrogate form that is cut short or has a bad byte yields u8FailChar replacement sequences.
 //NOTE(review): a 1110xxxx or 0xC0 lead byte that does not start one of the two special forms is counted
 //as ordinary together with the byte after it, and that following byte is not inspected any further.
709 template<typename T = DynamicString<std::basic_string<U8T>>>
710 static constexpr T MU8ToU8Impl(const MU8T *mu8String, size_t szStringLength, T u8String = {})
711 {
 //PUSH_FAIL_U8CHAR appends the 3-byte replacement sequence.
 //INSERT_NORMAL(p) appends the szNormalLength pending bytes that end just before p and resets the counter.
 //Both are local to this function and removed again by the #undef lines at its end
713#define PUSH_FAIL_U8CHAR u8String.append(u8FailChar, sizeof(u8FailChar) / sizeof(U8T))
714#define INSERT_NORMAL(p) (u8String.append_cvt((p) - szNormalLength, szNormalLength), szNormalLength = 0)
716
717 size_t szNormalLength = 0;//length of the pending run of ordinary bytes, used to insert them in one batch
718 for (auto it = mu8String, end = mu8String + szStringLength; it != end; ++it)
719 {
720 MU8T mu8Char = *it;//1st read
721
722 if (HAS_BITMASK(mu8Char, 0b1111'0000, 0b1110'0000))//top 4 bits are 1110: three-byte MUTF-8 code point or surrogate form
723 {
724 //fetch the next byte early
725 MU8T mu8Next{};
726 if (++it == end)
727 {
728 //insert everything that is pending
729 INSERT_NORMAL(it - 1);//note the -1: this normally runs at the start of the block, but the iterator has already been advanced, so go back one to the current position
730 PUSH_FAIL_U8CHAR;//append the replacement character
731 break;
732 }
733 mu8Next = *it;//2nd read
734
735
736 //a 1110'1101 lead byte followed by a byte whose top 4 bits are 1010 is always a surrogate pair
737 if (!IS_BITS(mu8Char, 0b1110'1101) || !HAS_BITMASK(mu8Next, 0b1111'0000, 0b1010'0000))
738 {
739 szNormalLength += 2;//two bytes were consumed above, so count two
740 continue;//and go on with the loop
741 }
742
743 //confirmed surrogate pair: insert everything that is pending
744 INSERT_NORMAL(it - 1);//note the -1: this normally runs at the start of the block, but mu8Next has already been read ahead, so go back one to the current position
745
746 //read and validate the remaining 4 bytes
747 MU8T mu8CharArr[6] = { mu8Char, mu8Next };//[0] = mu8Char, [1] = mu8Next
748
749 //fetch the next byte
750 GET_NEXTCHAR(mu8CharArr[2],
751 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//3rd read
752 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))
753 {
754 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
755 --it;
756 //replace with two UTF-8 replacement characters
757 PUSH_FAIL_U8CHAR;
758 PUSH_FAIL_U8CHAR;
759 continue;
760 }
761
762 //fetch the next byte
763 GET_NEXTCHAR(mu8CharArr[3],
764 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//4th read
765 if (!IS_BITS(mu8CharArr[3], 0b1110'1101))
766 {
767 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
768 --it;
769 //replace with three UTF-8 replacement characters
770 PUSH_FAIL_U8CHAR;
771 PUSH_FAIL_U8CHAR;
772 PUSH_FAIL_U8CHAR;
773 continue;
774 }
775
776 //fetch the next byte
777 GET_NEXTCHAR(mu8CharArr[4],
778 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//5th read
779 if (!HAS_BITMASK(mu8CharArr[4], 0b1111'0000, 0b1011'0000))
780 {
781 //undo two reads: the previous byte was confirmed as 0b1110'1101, but a valid three-byte code point can start with 111, so it has to be examined again
782 --it;
783 --it;
784 //replace with three UTF-8 replacement characters: because two reads were undone, only 3 bytes are discarded instead of 4
785 PUSH_FAIL_U8CHAR;
786 PUSH_FAIL_U8CHAR;
787 PUSH_FAIL_U8CHAR;
788 continue;
789 }
790
791 //fetch the next byte
792 GET_NEXTCHAR(mu8CharArr[5],
793 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//6th read
794 if (!HAS_BITMASK(mu8CharArr[5], 0b1100'0000, 0b1000'0000))
795 {
796 //undo one read (why not two? the previous byte is already known to be a 10xxxxxx trailing byte, so it is skipped)
797 --it;
798 //replace with five UTF-8 replacement characters
799 PUSH_FAIL_U8CHAR;
800 PUSH_FAIL_U8CHAR;
801 PUSH_FAIL_U8CHAR;
802 PUSH_FAIL_U8CHAR;
803 PUSH_FAIL_U8CHAR;
804 continue;
805 }
806
807 //every check passed at this point: convert
808 U8T u8CharArr[4]{};
809 MUTF8SupplementaryToUTF8(mu8CharArr, u8CharArr);
810 u8String.append(u8CharArr, sizeof(u8CharArr) / sizeof(U8T));
811 }
812 else if (IS_BITS(mu8Char, 0xC0))//a byte 0xC0 always starts a two-byte code, so if no second byte follows it is an error
813 {
814 //fetch the next byte early
815 MU8T mu8Next{};
816 if (++it == end)
817 {
818 //insert everything that is pending
819 INSERT_NORMAL(it - 1);
820 PUSH_FAIL_U8CHAR;//append the replacement character
821 break;
822 }
823 mu8Next = *it;//2nd read
824
825 if (!IS_BITS(mu8Next, 0x80))//if it is not 0x80, this is some other byte pattern
826 {
827 szNormalLength += 2;//add 2 to the ordinary byte count and go on
828 continue;
829 }
830
831 //confirmed null character: insert everything that is pending
832 INSERT_NORMAL(it - 1);//note the -1: this normally runs at the start of the block, but mu8Next has already been read ahead, so go back one to the current position
833 u8String.push_back((U8T)0x00);//insert the null character
834 }
835 else
836 {
837 ++szNormalLength;//ordinary byte, count it
838 continue;//go on
839 }
840 }
841 //finally insert whatever the loop left pending; note that the position used here is the loop's end position
842 INSERT_NORMAL(mu8String + szStringLength);
843
844 return u8String;
845
846#undef INSERT_NORMAL
847#undef PUSH_FAIL_U8CHAR
848 }
849
850#undef IN_RANGE
851#undef IS_BITS
852#undef HAS_BITMASK
853#undef GET_NEXTCHAR
854
855private:
856 template<typename T>
857 static consteval size_t ContentLength(const T &tStr)//获取不包含末尾0的长度(如果有)
858 {
859 return tStr.size() > 0 && tStr[tStr.size() - 1] == (typename T::value_type)0x00
860 ? tStr.size() - 1
861 : tStr.size();
862 }
863
864public:
865 //---------------------------------------------------------------------------------------------//
866
871 static constexpr size_t U16ToMU8Length(const std::basic_string_view<U16T> &u16String)
872 {
873 return U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String.data(), u16String.size()).GetData();
874 }
875
881 static constexpr size_t U16ToMU8Length(const U16T *u16String, size_t szStringLength)
882 {
883 return U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String, szStringLength).GetData();
884 }
885
891 static std::basic_string<MU8T> U16ToMU8(const std::basic_string_view<U16T> &u16String, size_t szReserve = 0)
892 {
893 return U16ToMU8Impl<DynamicString<std::basic_string<MU8T>>>(u16String.data(), u16String.size(), { szReserve }).GetData();
894 }
895
902 static std::basic_string<MU8T> U16ToMU8(const U16T *u16String, size_t szStringLength, size_t szReserve = 0)
903 {
904 return U16ToMU8Impl<DynamicString<std::basic_string<MU8T>>>(u16String, szStringLength, { szReserve }).GetData();
905 }
906
912 template<MUTF8_Tool_Internal::StringLiteral u16String>
913 requires std::is_same_v<typename decltype(u16String)::value_type, U16T>//限定类型
914 static consteval std::basic_string_view<MU8T> U16ToMU8(void)
915 {
916 constexpr size_t szStringLength = ContentLength(u16String);
917 constexpr size_t szNewLength = U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String.data(), szStringLength).GetData();
918
920 <
921 U16ToMU8Impl<StaticString<MU8T, szNewLength>>(u16String.data(), szStringLength).GetData(),
922 std::basic_string_view<MU8T>
923 >();
924 }
925
926 //---------------------------------------------------------------------------------------------//
927
932 static constexpr size_t U8ToMU8Length(const std::basic_string_view<U8T> &u8String)
933 {
934 return U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String.data(), u8String.size()).GetData();
935 }
936
942 static constexpr size_t U8ToMU8Length(const U8T *u8String, size_t szStringLength)
943 {
944 return U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String, szStringLength).GetData();
945 }
946
952 static std::basic_string<MU8T> U8ToMU8(const std::basic_string_view<U8T> &u8String, size_t szReserve = 0)
953 {
954 return U8ToMU8Impl<DynamicString<std::basic_string<MU8T>>>(u8String.data(), u8String.size(), { szReserve }).GetData();
955 }
956
963 static std::basic_string<MU8T> U8ToMU8(const U8T *u8String, size_t szStringLength, size_t szReserve = 0)
964 {
965 return U8ToMU8Impl<DynamicString<std::basic_string<MU8T>>>(u8String, szStringLength, { szReserve }).GetData();
966 }
967
973 template<MUTF8_Tool_Internal::StringLiteral u8String>
974 requires std::is_same_v<typename decltype(u8String)::value_type, U8T>//限定类型
975 static consteval std::basic_string_view<MU8T> U8ToMU8(void)
976 {
977 constexpr size_t szStringLength = ContentLength(u8String);
978 constexpr size_t szNewLength = U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String.data(), szStringLength).GetData();
979
980 return MUTF8_Tool_Internal::ToStringView<U8ToMU8Impl
981 <
982 StaticString<MU8T, szNewLength>>(u8String.data(), szStringLength).GetData(),
983 std::basic_string_view<MU8T>
984 >();
985 }
986
987 //---------------------------------------------------------------------------------------------//
988 //---------------------------------------------------------------------------------------------//
989
994 static constexpr size_t MU8ToU16Length(const std::basic_string_view<MU8T> &mu8String)
995 {
996 return MU8ToU16Impl<FakeStringCounter<U16T>>(mu8String.data(), mu8String.size()).GetData();
997 }
998
1004 static constexpr size_t MU8ToU16Length(const MU8T *mu8String, size_t szStringLength)
1005 {
1006 return MU8ToU16Impl<FakeStringCounter<U16T>>(mu8String, szStringLength).GetData();
1007 }
1008
1014 static std::basic_string<U16T> MU8ToU16(const std::basic_string_view<MU8T> &mu8String, size_t szReserve = 0)
1015 {
1016 return MU8ToU16Impl<DynamicString<std::basic_string<U16T>>>(mu8String.data(), mu8String.size(), { szReserve }).GetData();
1017 }
1018
1025 static std::basic_string<U16T> MU8ToU16(const MU8T *mu8String, size_t szStringLength, size_t szReserve = 0)
1026 {
1027 return MU8ToU16Impl<DynamicString<std::basic_string<U16T>>>(mu8String, szStringLength, { szReserve }).GetData();
1028 }
1029
1030 //---------------------------------------------------------------------------------------------//
1031
1036 static constexpr size_t MU8ToU8Length(const std::basic_string_view<MU8T> &mu8String)
1037 {
1038 return MU8ToU8Impl<FakeStringCounter<U8T>>(mu8String.data(), mu8String.size()).GetData();
1039 }
1040
1046 static constexpr size_t MU8ToU8Length(const MU8T *mu8String, size_t szStringLength)
1047 {
1048 return MU8ToU8Impl<FakeStringCounter<U8T>>(mu8String, szStringLength).GetData();
1049 }
1050
1056 static std::basic_string<U8T> MU8ToU8(const std::basic_string_view<MU8T> &mu8String, size_t szReserve = 0)
1057 {
1058 return MU8ToU8Impl<DynamicString<std::basic_string<U8T>>>(mu8String.data(), mu8String.size(), { szReserve }).GetData();
1059 }
1060
1067 static std::basic_string<U8T> MU8ToU8(const MU8T *mu8String, size_t szStringLength, size_t szReserve = 0)
1068 {
1069 return MU8ToU8Impl<DynamicString<std::basic_string<U8T>>>(mu8String, szStringLength, { szReserve }).GetData();
1070 }
1071
1072 //---------------------------------------------------------------------------------------------//
1073};
1074
1075//--------------------------------------------convenience macros--------------------------------------------//
1076
1077//run-time conversion: each macro forwards its argument to the matching static member of MUTF8_Tool<> (default character types)
1078
 //UTF-16 -> MUTF-8
1082#define U16CV2MU8(u16String) MUTF8_Tool<>::U16ToMU8(u16String)
1083
 //MUTF-8 -> UTF-16
1087#define MU8CV2U16(mu8String) MUTF8_Tool<>::MU8ToU16(mu8String)
1088
 //UTF-8 -> MUTF-8
1092#define U8CV2MU8(u8String) MUTF8_Tool<>::U8ToMU8(u8String)
1093
 //MUTF-8 -> UTF-8
1097#define MU8CV2U8(mu8String) MUTF8_Tool<>::MU8ToU8(mu8String)
1098
1099//compile-time conversion: the argument must be a string literal, it is passed as a template argument
1100//In MUTF-8 every \0 is mapped to 0xC0 0x80 and the string is guaranteed to contain no \0, so to some extent it is compatible with a c-str (terminated by \0)
 //NOTE(review): the compile-time overloads pass the literal through ContentLength first, which drops one trailing \0 before converting — confirm the terminator handling callers expect
1101
 //UTF-16 literal -> MUTF-8 view
1106#define U16TOMU8STR(u16LiteralString) (MUTF8_Tool<>::U16ToMU8<u16LiteralString>())
1107
 //UTF-8 literal -> MUTF-8 view
1112#define U8TOMU8STR(u8LiteralString) (MUTF8_Tool<>::U8ToMU8<u8LiteralString>())
1113
1114//---------------------------------------------------------------------------------------------//
1115
1116//英文原文
1117/*
1118Modified UTF-8 Strings
1119The JNI uses modified UTF-8 strings to represent various string types. Modified UTF-8 strings are the same as those used by the Java VM. Modified UTF-8 strings are encoded so that character sequences that contain only non-null ASCII characters can be represented using only one byte per character, but all Unicode characters can be represented.
1120
1121All characters in the range \u0001 to \u007F are represented by a single byte, as follows:
1122
11230xxxxxxx
1124The seven bits of data in the byte give the value of the character represented.
1125
1126The null character ('\u0000') and characters in the range '\u0080' to '\u07FF' are represented by a pair of bytes x and y:
1127
1128x: 110xxxxx
1129y: 10yyyyyy
1130The bytes represent the character with the value ((x & 0x1f) << 6) + (y & 0x3f).
1131
1132Characters in the range '\u0800' to '\uFFFF' are represented by 3 bytes x, y, and z:
1133
1134x: 1110xxxx
1135y: 10yyyyyy
1136z: 10zzzzzz
1137The character with the value ((x & 0xf) << 12) + ((y & 0x3f) << 6) + (z & 0x3f) is represented by the bytes.
1138
1139Characters with code points above U+FFFF (so-called supplementary characters) are represented by separately encoding the two surrogate code units of their UTF-16 representation. Each of the surrogate code units is represented by three bytes. This means, supplementary characters are represented by six bytes, u, v, w, x, y, and z:
1140
1141u: 11101101
1142v: 1010vvvv
1143w: 10wwwwww
1144x: 11101101
1145y: 1011yyyy
1146z: 10zzzzzz
1147The character with the value 0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) is represented by the six bytes.
1148
1149The bytes of multibyte characters are stored in the class file in big-endian (high byte first) order.
1150
1151There are two differences between this format and the standard UTF-8 format. First, the null character (char)0 is encoded using the two-byte format rather than the one-byte format. This means that modified UTF-8 strings never have embedded nulls. Second, only the one-byte, two-byte, and three-byte formats of standard UTF-8 are used. The Java VM does not recognize the four-byte format of standard UTF-8; it uses its own two-times-three-byte format instead.
1152
1153For more information regarding the standard UTF-8 format, see section 3.9 Unicode Encoding Forms of The Unicode Standard, Version 4.0.
1154*/
1155
1156//中文翻译
1157/*
1158修改后的 UTF-8 字符串
1159JNI 使用修改后的 UTF-8 字符串来表示各种字符串类型。修改后的 UTF-8 字符串与 Java VM 所使用的字符串相同。修改后的 UTF-8 字符串经过编码,使得仅包含非空 ASCII 字符的字符序列可以每个字符仅使用一个字节来表示,但所有 Unicode 字符都可以被表示。
1160
1161范围在 \u0001 到 \u007F 之间的所有字符都由单个字节表示,如下所示:
1162
11630xxxxxxx
1164字节中的七位数据给出了所表示字符的值。
1165
1166空字符 ('\u0000') 和范围在 '\u0080' 到 '\u07FF' 之间的字符由一对字节 x 和 y 表示:
1167
1168x: 110xxxxx
1169y: 10yyyyyy
1170这些字节表示值为 ((x & 0x1f) << 6) + (y & 0x3f) 的字符。
1171
1172范围在 '\u0800' 到 '\uFFFF' 之间的字符由三个字节 x、y 和 z 表示:
1173
1174x: 1110xxxx
1175y: 10yyyyyy
1176z: 10zzzzzz
1177值为 ((x & 0xf) << 12) + ((y & 0x3f) << 6) + (z & 0x3f) 的字符由这些字节表示。
1178
1179码点高于 U+FFFF 的字符(即所谓的补充字符)通过分别编码其 UTF-16 表示的二个代理码元来表示。每个代理码元由三个字节表示。这意味着,补充字符由六个字节 u、v、w、x、y 和 z 表示:
1180
1181u: 11101101
1182v: 1010vvvv
1183w: 10wwwwww
1184x: 11101101
1185y: 1011yyyy
1186z: 10zzzzzz
1187值为 0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) 的字符由这六个字节表示。
1188
1189多字节字符的字节在类文件中以大端序(高位字节在前)存储。
1190
1191此格式与标准 UTF-8 格式有二个区别。首先,空字符 (char)0 使用双字节格式而非单字节格式进行编码。这意味着修改后的 UTF-8 字符串永远不会包含嵌入的空字符。其次,仅使用标准 UTF-8 的单字节、双字节和三字节格式。Java VM 不识别标准 UTF-8 的四字节格式;它使用自己的二次三字节格式来代替。
1192
1193有关标准 UTF-8 格式的更多信息,请参阅 Unicode 标准 4.0 版的第 3.9 节 Unicode 编码形式。
1194*/
@ String
对应NBT_Type::String
定义 NBT_TAG.hpp:26
std::array< T, N > Super
父类类型定义
定义 MUTF8_Tool.hpp:31
constexpr StringLiteral(const T(&_tStr)[N]) noexcept
从字符串数组的引用拷贝构造
定义 MUTF8_Tool.hpp:38
constexpr ~StringLiteral(void)=default
默认析构
static std::basic_string< MU8T > U8ToMU8(const std::basic_string_view< U8T > &u8String, size_t szReserve=0)
获取UTF-8转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:952
static std::basic_string< U8T > MU8ToU8(const MU8T *mu8String, size_t szStringLength, size_t szReserve=0)
获取M-UTF-8转换到UTF-8的字符串
定义 MUTF8_Tool.hpp:1067
static std::basic_string< U8T > MU8ToU8(const std::basic_string_view< MU8T > &mu8String, size_t szReserve=0)
获取M-UTF-8转换到UTF-8的字符串
定义 MUTF8_Tool.hpp:1056
U8T U8_T
模板UTF-8字符类型的代理
定义 MUTF8_Tool.hpp:83
static std::basic_string< U16T > MU8ToU16(const std::basic_string_view< MU8T > &mu8String, size_t szReserve=0)
获取M-UTF-8转换到UTF-16的字符串
定义 MUTF8_Tool.hpp:1014
static constexpr size_t U16ToMU8Length(const std::basic_string_view< U16T > &u16String)
精确计算UTF-16转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:871
static constexpr size_t MU8ToU8Length(const MU8T *mu8String, size_t szStringLength)
精确计算M-UTF-8转换到UTF-8所需的UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1046
static constexpr size_t MU8ToU8Length(const std::basic_string_view< MU8T > &mu8String)
精确计算M-UTF-8转换到UTF-8所需的UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1036
static consteval std::basic_string_view< MU8T > U8ToMU8(void)
通过UTF-8字符串字面量,直接获得编译期的M-UTF-8静态字符串
定义 MUTF8_Tool.hpp:975
static std::basic_string< MU8T > U16ToMU8(const std::basic_string_view< U16T > &u16String, size_t szReserve=0)
获取UTF-16转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:891
static constexpr size_t U8ToMU8Length(const std::basic_string_view< U8T > &u8String)
精确计算UTF-8转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:932
MU8T MU8_T
模板M-UTF-8字符类型的代理
定义 MUTF8_Tool.hpp:79
static constexpr size_t U16ToMU8Length(const U16T *u16String, size_t szStringLength)
精确计算UTF-16转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:881
U16T U16_T
模板UTF-16字符类型的代理
定义 MUTF8_Tool.hpp:81
static constexpr size_t MU8ToU16Length(const MU8T *mu8String, size_t szStringLength)
精确计算M-UTF-8转换到UTF-16所需的UTF-16字符串的长度
定义 MUTF8_Tool.hpp:1004
static consteval std::basic_string_view< MU8T > U16ToMU8(void)
通过UTF-16字符串字面量,直接获得编译期的M-UTF-8静态字符串
定义 MUTF8_Tool.hpp:914
static constexpr size_t U8ToMU8Length(const U8T *u8String, size_t szStringLength)
精确计算UTF-8转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:942
static std::basic_string< U16T > MU8ToU16(const MU8T *mu8String, size_t szStringLength, size_t szReserve=0)
获取M-UTF-8转换到UTF-16的字符串
定义 MUTF8_Tool.hpp:1025
static constexpr size_t MU8ToU16Length(const std::basic_string_view< MU8T > &mu8String)
精确计算M-UTF-8转换到UTF-16所需的UTF-16字符串的长度
定义 MUTF8_Tool.hpp:994
static std::basic_string< MU8T > U8ToMU8(const U8T *u8String, size_t szStringLength, size_t szReserve=0)
获取UTF-8转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:963
static std::basic_string< MU8T > U16ToMU8(const U16T *u16String, size_t szStringLength, size_t szReserve=0)
获取UTF-16转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:902
用于存放MUTF8_Tool使用的,无法存在于类内的辅助类
定义 MUTF8_Tool.hpp:21
consteval View_Type ToStringView(void) noexcept
利用模板固化编译期求值函数返回的数组临时量
定义 MUTF8_Tool.hpp:51