chenjunfu2-nbt-cpp v2.1.3
一个基于CPP20的NBT(Named Binary Tag)库
载入中...
搜索中...
未找到
MUTF8_Tool.hpp
浏览该文件的文档.
1#pragma once
2
3#include <string>
4#include <cstring>
5#include <type_traits>
6#include <assert.h>
7#include <stdint.h>
8#include <stddef.h>//size_t
9#include <array>
10#include <algorithm>
11
14
15//来个static string包装类,使得模板能接受字符串字面量
16//必须放在外面,否则NTTP推导主类模板会失败,
17//导致此并不依赖主类模板的模板也推导失败
18
22{
27 template<typename T, size_t N>
28 class StringLiteral : public std::array<T, N>
29 {
30 public:
32 using Super = std::array<T, N>;
33
34 public:
35
39 constexpr StringLiteral(const T(&_tStr)[N]) noexcept : Super(std::to_array(_tStr))
40 {}
41
44 constexpr ~StringLiteral(void) = default;
45 };
46
51 template<auto ArrayData, typename View_Type>
52 consteval View_Type ToStringView(void) noexcept
53 {
54 return { ArrayData.data(), ArrayData.size() };
55 }
56}
57
59using MUTF8_Char_Type = uint8_t;
60
64template<typename MU8T = MUTF8_Char_Type>
66{
68public:
69 using char_type = MU8T;
70 using int_type = int;
71 using off_type = std::streamoff;
72 using pos_type = std::streampos;
73 using state_type = std::mbstate_t;
74 using comparison_category = std::strong_ordering;
75
76 static_assert(sizeof(int_type) > sizeof(char_type));
77
78public:
79 static constexpr char_type *copy(char_type *pDst, const char_type *pSrc, std::size_t szCount) noexcept
80 {
81 if (std::is_constant_evaluated())
82 {
83 for (std::size_t i = 0; i < szCount; ++i)
84 {
85 pDst[i] = pSrc[i];
86 }
87 return pDst;
88 }
89 else
90 {
91 std::memcpy(pDst, pSrc, szCount * sizeof(char_type));
92 return pDst;
93 }
94 }
95
96 static constexpr char_type *move(char_type *pDst, const char_type *pSrc, std::size_t szCount) noexcept
97 {
98 if (std::is_constant_evaluated())
99 {
100 if (pDst == pSrc)
101 {
102 return pDst;
103 }
104
105 //判断是正向复制还是反向复制,有重叠部分就使用反向
106 bool bLoopForward = true;
107 for (const char_type *p = pSrc; p != pSrc + szCount; ++p)
108 {
109 if (pDst == p)
110 {
111 bLoopForward = false;
112 break;
113 }
114 }
115
116 if (bLoopForward)
117 {
118 for (std::size_t i = 0; i < szCount; ++i)
119 {
120 pDst[i] = pSrc[i];
121 }
122 }
123 else
124 {
125 for (std::size_t i = szCount; i > 0; --i)
126 {
127 pDst[i - 1] = pSrc[i - 1];
128 }
129 }
130
131 return pDst;
132 }
133 else
134 {
135 std::memmove(pDst, pSrc, szCount * sizeof(char_type));
136 return pDst;
137 }
138 }
139
140 static constexpr int compare(const char_type *pLeft, const char_type *pRight, std::size_t szCount) noexcept
141 {
142 for (; szCount > 0; --szCount, ++pLeft, ++pRight)
143 {
144 if (*pLeft != *pRight)//第一个不匹配的,返回比较值
145 {
146 return *pLeft < *pRight ? -1 : 1;
147 }
148 }
149
150 //全匹配,返回0
151 return 0;
152 }
153
154 static constexpr std::size_t length(const char_type *pStr) noexcept
155 {
156 std::size_t szlength = 0;
157 while (*pStr != char_type{})
158 {
159 ++szlength;
160 ++pStr;
161 }
162
163 return szlength;
164 }
165
166 static constexpr const char_type *find(const char_type *pStr, std::size_t szCount, const char_type &ch) noexcept
167 {
168 for (; szCount > 0; --szCount, ++pStr)
169 {
170 if (*pStr == ch)
171 {
172 return pStr;
173 }
174 }
175
176 return nullptr;
177 }
178
179 static constexpr char_type *assign(char_type *pDst, std::size_t szCount, char_type ch) noexcept
180 {
181 if (std::is_constant_evaluated())
182 {
183 for (char_type *p = pDst; szCount > 0; --szCount, ++p)
184 {
185 std::construct_at(p, ch);
186 }
187 }
188 else
189 {
190 for (char_type *p = pDst; szCount > 0; --szCount, ++p)
191 {
192 *p = ch;
193 }
194 }
195
196 return pDst;
197 }
198
199 static constexpr void assign(char_type &dst, const char_type &src) noexcept
200 {
201 if (std::is_constant_evaluated())
202 {
203 std::construct_at(std::addressof(dst), src);
204 }
205 else
206 {
207 dst = src;
208 }
209 }
210
211 static constexpr bool eq(const char_type &l, const char_type &r) noexcept
212 {
213 return l == r;
214 }
215
216 static constexpr bool lt(const char_type &l, const char_type &r) noexcept
217 {
218 return l < r;
219 }
220
221 static constexpr char_type to_char_type(const int_type &meta) noexcept
222 {
223 return static_cast<char_type>(meta);
224 }
225
226 static constexpr int_type to_int_type(const char_type &ch) noexcept
227 {
228 return static_cast<int_type>(ch);
229 }
230
231 static constexpr bool eq_int_type(const int_type &l, const int_type &r) noexcept
232 {
233 return l == r;
234 }
235
236 static constexpr int_type eof(void) noexcept
237 {
238 return static_cast<int_type>(EOF);
239 }
240
241 static constexpr int_type not_eof(const int_type &meta) noexcept
242 {
243 return meta != eof() ? meta : static_cast<int_type>(!eof());
244 }
245
247};
248
250using MUTF8_String = std::basic_string<MUTF8_Char_Type, MUTF8_Char_Traits<MUTF8_Char_Type>>;
252using MUTF8_String_View = std::basic_string_view<MUTF8_Char_Type, MUTF8_Char_Traits<MUTF8_Char_Type>>;
253
259template<typename MU8T = MUTF8_Char_Type, typename U16T = char16_t, typename U8T = char8_t>
260class MUTF8_Tool
261{
262 static_assert(sizeof(MU8T) == 1, "MU8T size must be at 1 byte");
263 static_assert(sizeof(U16T) == 2, "U16T size must be at 2 bytes");
264 static_assert(sizeof(U8T) == 1, "U8T size must be at 1 bytes");
265
266 MUTF8_Tool(void) = delete;
267 ~MUTF8_Tool(void) = delete;
268
269 //用于在错误情况输出utf16错误字节0xFFFD和mu8、u8形式的0xEF 0xBF 0xBD
270 static inline constexpr MU8T mu8FailChar[3]{ (MU8T)0xEF, (MU8T)0xBF, (MU8T)0xBD };
271 static inline constexpr U16T u16FailChar = (U16T)0xFFFD;
272 static inline constexpr U8T u8FailChar[3]{ (U8T)0xEF, (U8T)0xBF, (U8T)0xBD };
273
274public:
276 using MU8_T = MU8T;
278 using U16_T = U16T;
280 using U8_T = U8T;
281
283 using MU8_String = std::conditional_t<std::is_same_v<MU8T, MUTF8_Char_Type>, MUTF8_String, std::basic_string<MU8T>>;
285 using MU8_String_View = std::conditional_t<std::is_same_v<MU8T, MUTF8_Char_Type>, MUTF8_String_View, std::basic_string_view<MU8T>>;
286
287private:
288 //来点魔法类,伪装成basic string,在插入的时候进行数据长度计数,忽略插入的数据,最后转换为size_t长度
289 //这样就能在最小修改的情况下用同一个函数套模板来获取转换后的长度(且100%准确),而不是重写一个例程
290 template<typename T>
291 class FakeStringCounter
292 {
293 private:
294 size_t szCounter = 0;
295 public:
296 constexpr FakeStringCounter(void) = default;
297 constexpr ~FakeStringCounter(void) = default;
298
299 constexpr void clear(void) noexcept
300 {
301 szCounter = 0;
302 }
303
304 constexpr FakeStringCounter &append(const T *const, size_t szSize) noexcept
305 {
306 szCounter += szSize;
307 return *this;
308 }
309
310 template<typename U>
311 constexpr FakeStringCounter &append_cvt(const U *const, size_t szSize) noexcept
312 {
313 szCounter += szSize;//静态求值与动态求值并无区别,不做特例
314 return *this;
315 }
316
317 constexpr void push_back(const T &) noexcept
318 {
319 szCounter += 1;
320 }
321
322 constexpr const size_t &GetData(void) const noexcept
323 {
324 return szCounter;
325 }
326 };
327
328 //魔法类2,伪装成string,转换到静态字符串作为std::array返回
329 template<typename T, size_t N>
330 class StaticString
331 {
332 public:
333 using ARRAY_TYPE = std::array<T, N>;
334 private:
335 ARRAY_TYPE arrData{};
336 size_t szIndex = 0;
337 public:
338 constexpr StaticString(void) = default;
339 constexpr ~StaticString(void) = default;
340
341 constexpr void clear(void) noexcept
342 {
343 szIndex = 0;
344 }
345
346 constexpr StaticString &append(const T *const pData, size_t szLength) noexcept
347 {
348 if (szLength == 0 || szLength > arrData.size() - szIndex)//减法避免溢出,注意这里不是大等于而是大于,否则会导致拷贝差1错误
349 {
350 return *this;
351 }
352
353 //从pData的 0 ~ szLength 拷贝到arrData的 szIndex ~ szIndex + szLength
354 std::ranges::copy(&pData[0], &pData[szLength], &arrData[szIndex]);
355 szIndex += szLength;
356
357 return *this;
358 }
359
360 template<typename U>
361 constexpr StaticString &append_cvt(const U *const pData, size_t szLength) noexcept
362 {
363 if (std::is_constant_evaluated())
364 {
365 if (szLength == 0 || szLength > arrData.size() - szIndex)
366 {
367 return *this;
368 }
369
370 //静态求值按顺序转换
371 std::ranges::transform(&pData[0], &pData[szLength], &arrData[szIndex],
372 [](const U &u) -> T
373 {
374 return (T)u;
375 });
376 szIndex += szLength;
377
378 return *this;
379 }
380 else//注意虽然理论上这个函数不会在任何情况被用于动态,但是为了明确操作,仍然这么写
381 {
382 return append((const T *const)pData, szLength);//非静态直接暴力转换指针
383 }
384 }
385
386 constexpr void push_back(const T &tData) noexcept
387 {
388 if (1 > arrData.size() - szIndex)
389 {
390 return;
391 }
392
393 arrData[szIndex] = tData;
394 szIndex += 1;
395 }
396
397 constexpr const ARRAY_TYPE &GetData(void) const noexcept
398 {
399 return arrData;
400 }
401 };
402
403 //魔法类3,给String添加静态转换接口,与原先一致,防止报错
404 template<typename String>
405 class DynamicString : public String//注意使用继承,这样可以直接隐式转换到基类
406 {
407 public:
408 DynamicString(size_t szReserve = 0) :String({})
409 {
410 String::reserve(szReserve);
411 }
412 ~DynamicString(void) = default;
413
414 template<typename U>
415 DynamicString &append_cvt(const U *const pData, size_t szLength)//理论上此函数永远动态调用
416 {
417 String::append((const typename String::value_type *const)pData, szLength);
418 return *this;
419 }
420
421 constexpr const String &GetData(void) const noexcept
422 {
423 return *this;
424 }
425 };
426
427private:
428 template<size_t szBytes>
429 static constexpr void EncodeMUTF8Bmp(const U16T u16Char, MU8T(&mu8CharArr)[szBytes])
430 {
431 if constexpr (szBytes == 1)
432 {
433 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0111'1111) >> 0) | (uint16_t)0b0000'0000);//0 + 6-0 7bit
434 }
435 else if constexpr (szBytes == 2)
436 {
437 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0111'1100'0000) >> 6) | (uint16_t)0b1100'0000);//110 + 10-6 5bit
438 mu8CharArr[1] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0011'1111) >> 0) | (uint16_t)0b1000'0000);//10 + 5-0 6bit
439 }
440 else if constexpr (szBytes == 3)
441 {
442 mu8CharArr[0] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b1111'0000'0000'0000) >> 12) | (uint16_t)0b1110'0000);//1110 + 15-12 4bit
443 mu8CharArr[1] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'1111'1100'0000) >> 6) | (uint16_t)0b1000'0000);//10 + 11-6 6bit
444 mu8CharArr[2] = (uint8_t)((((uint16_t)u16Char & (uint16_t)0b0000'0000'0011'1111) >> 0) | (uint16_t)0b1000'0000);//10 + 5-0 6bit
445 }
446 else
447 {
448 static_assert(false, "Error szBytes Size");//大小错误
449 }
450 }
451
452 static constexpr void EncodeMUTF8Supplementary(const U16T u16HighSurrogate, const U16T u16LowSurrogate, MU8T(&mu8CharArr)[6])
453 {
454 //取出代理对数据并组合 范围:1'0000 ~ 10'FFFF 通过u16代理对组成:高代理10位低代理10位,
455 //最后加上0x1'0000得到此范围,也就是从utf16组成的0x0'0000 ~ 0xF'FFFF + 0x1'0000得到
456 uint32_t u32RawChar = ((uint32_t)((uint16_t)u16HighSurrogate & (uint16_t)0b0000'0011'1111'1111)) << 10 |//10bit
457 ((uint32_t)((uint16_t)u16LowSurrogate & (uint16_t)0b0000'0011'1111'1111)) << 0; //10bit
458
459 //因为mutf8直接存储utf16的代理对位,不进行+0x1'0000运算
460 //u32RawChar += (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为加法而非位运算
461
462 //高代理
463 mu8CharArr[0] = (uint8_t)0b1110'1101;//固定字节
464 mu8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'1111'0000'0000'0000'0000) >> 16) | (uint32_t)0b1010'0000);//1010 + 19-16 4bit
465 mu8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'1111'1100'0000'0000) >> 10) | (uint32_t)0b1000'0000);//10 + 15-10 6bit
466 //低代理
467 mu8CharArr[3] = (uint8_t)0b1110'1101;//固定字节
468 mu8CharArr[4] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1100'0000) >> 6) | (uint32_t)0b1011'0000);//1011 + 9-6 4bit
469 mu8CharArr[5] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
470 }
471
472 template<size_t szBytes>
473 static constexpr void DecodeMUTF8Bmp(const MU8T(&mu8CharArr)[szBytes], U16T &u16Char)
474 {
475 if constexpr (szBytes == 1)
476 {
477 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0111'1111)) << 0;//7bit
478 }
479 else if constexpr (szBytes == 2)
480 {
481 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0001'1111)) << 6 |//5bit
482 ((uint16_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0011'1111)) << 0; //6bit
483 }
484 else if constexpr (szBytes == 3)
485 {
486 u16Char = ((uint16_t)((uint8_t)mu8CharArr[0] & (uint8_t)0b0000'1111)) << 12 |//4bit
487 ((uint16_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0011'1111)) << 6 |//6bit
488 ((uint16_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 0; //6bit
489 }
490 else
491 {
492 static_assert(false, "Error szBytes Size");//大小错误
493 }
494 }
495
496 static constexpr void DecodeMUTF8Supplementary(const MU8T(&mu8CharArr)[6], U16T &u16HighSurrogate, U16T &u16LowSurrogate)
497 {
498 uint32_t u32RawChar = //mu8CharArr[0] ignore 固定字节忽略
499 ((uint32_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0000'1111)) << 16 |//4bit
500 ((uint32_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 10 |//6bit
501 //mu8CharArr[3] ignore 固定字节忽略
502 ((uint32_t)((uint8_t)mu8CharArr[4] & (uint8_t)0b0000'1111)) << 6 |//4bit
503 ((uint32_t)((uint8_t)mu8CharArr[5] & (uint8_t)0b0011'1111)) << 0 ;//6bit
504
505 //因为mutf8直接存储utf16的代理对位,不进行-0x1'0000运算
506 //u32RawChar -= (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为减法而非位运算
507
508 //解析到高低代理
509 //范围1'0000-10'FFFF
510 u16HighSurrogate = (uint32_t)((u32RawChar & (uint32_t)0b0000'0000'0000'1111'1111'1100'0000'0000) >> 10 | (uint32_t)0b1101'1000'0000'0000);//1101'10 + 19-10 10bit
511 u16LowSurrogate = (uint32_t)((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1111'1111) >> 0 | (uint32_t)0b1101'1100'0000'0000);//1101'11 + 9-0 10bit
512 }
513
514 /*
515 4bytes utf-8 bit distribution:(utf8直接存储utf32的映射,需要两次转换到mutf8)
516 000u'uuuu zzzz'yyyy yyxx'xxxx - 1111'0uuu 10uu'zzzz 10yy'yyyy 10xx'xxxx
517
518 6bytes modified utf-8 bit distribution:(mutf8直接存储了utf16的映射,不需要增减0x1'0000,所以要比utf8小一位)
519 0000'uuuu zzzz'yyyy yyxx'xxxx - 1110'1101 1010'uuuu 10zz'zzyy 1110'1101 1011'yyyy 10xx'xxxx
520 */
521
522 static constexpr void UTF8SupplementaryToMUTF8(const U8T(&u8CharArr)[4], MU8T(&mu8CharArr)[6])
523 {
524 //先把utf8映射到utf32,减去0x1'0000,得到utf16的中间形式,再进行下一步转换
525 uint32_t u32RawChar = ((uint32_t)((uint8_t)u8CharArr[0] & (uint8_t)0b0000'0111)) << 18 |//3bit
526 ((uint32_t)((uint8_t)u8CharArr[1] & (uint8_t)0b0011'1111)) << 12 |//6bit
527 ((uint32_t)((uint8_t)u8CharArr[2] & (uint8_t)0b0011'1111)) << 6 |//6bit
528 ((uint32_t)((uint8_t)u8CharArr[3] & (uint8_t)0b0011'1111)) << 0; //6bit
529
530 //删除位
531 u32RawChar -= (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为减法而非位运算
532
533 //高代理
534 mu8CharArr[0] = (uint8_t)0b1110'1101;//固定字节
535 mu8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'1111'0000'0000'0000'0000) >> 16) | (uint32_t)0b1010'0000);//1010 + 19-16 4bit
536 mu8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'1111'1100'0000'0000) >> 10) | (uint32_t)0b1000'0000);//10 + 15-10 6bit
537 //低代理
538 mu8CharArr[3] = (uint8_t)0b1110'1101;//固定字节
539 mu8CharArr[4] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0011'1100'0000) >> 6) | (uint32_t)0b1011'0000);//1011 + 9-6 4bit
540 mu8CharArr[5] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
541 }
542
543 static constexpr void MUTF8SupplementaryToUTF8(const MU8T(&mu8CharArr)[6], U8T(&u8CharArr)[4])
544 {
545 //先把mutf8映射到utf16组成的中间形式的utf32,再加上0x1'0000得到utf8的utf32表示形式
546 uint32_t u32RawChar = //mu8CharArr[0] ignore 固定字节忽略
547 ((uint32_t)((uint8_t)mu8CharArr[1] & (uint8_t)0b0000'1111)) << 16 |//4bit
548 ((uint32_t)((uint8_t)mu8CharArr[2] & (uint8_t)0b0011'1111)) << 10 |//6bit
549 //mu8CharArr[3] ignore 固定字节忽略
550 ((uint32_t)((uint8_t)mu8CharArr[4] & (uint8_t)0b0000'1111)) << 6 |//4bit
551 ((uint32_t)((uint8_t)mu8CharArr[5] & (uint8_t)0b0011'1111)) << 0 ;//6bit
552
553 //恢复位
554 u32RawChar += (uint32_t)0b0000'0000'0000'0001'0000'0000'0000'0000;//bit16->1 = 0x1'0000 注意此处为加法而非位运算
555
556 //转换到utf8
557 u8CharArr[0] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0001'1100'0000'0000'0000'0000) >> 18) | (uint32_t)0b1111'0000);//1111'0 + 20-18 3bit
558 u8CharArr[1] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0011'1111'0000'0000'0000) >> 12) | (uint32_t)0b1000'0000);//10 + 17-12 6bit
559 u8CharArr[2] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'1111'1100'0000) >> 6) | (uint32_t)0b1000'0000);//10 + 11-6 6bit
560 u8CharArr[3] = (uint8_t)(((u32RawChar & (uint32_t)0b0000'0000'0000'0000'0000'0000'0011'1111) >> 0) | (uint32_t)0b1000'0000);//10 + 5-0 6bit
561 }
562
563private:
565
566//c=char d=do检查迭代器并获取下一个字节(如果可以,否则执行指定语句后跳出)
567#define GET_NEXTCHAR(c,d) if (++it == end) { (d);break; } else { (c) = *it; }
568//v=value m=mask p=pattern t=test 测试遮罩位之后的结果是否是指定值或值是否是由指定bits组成
569#define HAS_BITMASK(v,m,p) (((uint8_t)(v) & (uint8_t)(m)) == (uint8_t)(p))
570#define IS_BITS(v,t) ((uint8_t)(v) == (uint8_t)(t))
571//v=value b=begin e=end 注意范围是左右边界包含关系,而不是普通的左边界包含
572#define IN_RANGE(v,b,e) (((uint16_t)(v) >= (uint16_t)(b)) && ((uint16_t)(v) <= (uint16_t)(e)))
573
575
 //Converts szStringLength UTF-16 code units starting at u16String to MUTF-8,
 //appending the bytes to the sink mu8String (any type with a compatible append()) and returning it.
 //Malformed surrogate usage is replaced by mu8FailChar (the 3-byte encoding of U+FFFD).
 //Relies on the GET_NEXTCHAR / IN_RANGE helper macros defined just above this function.
576 template<typename T>
577 static constexpr T U16ToMU8Impl(const U16T *u16String, size_t szStringLength, T mu8String = {})
578 {
 //Appends one replacement sequence to the sink
580#define PUSH_FAIL_MU8CHAR mu8String.append(mu8FailChar, sizeof(mu8FailChar) / sizeof(MU8T))
582
583 //The length is passed explicitly, so an empty input needs no special case: the loop body never runs and the sink is returned as-is
584 //In MUTF-8 the null character is written as 0xC0 0x80 rather than 0x00
585
586 for (auto it = u16String, end = u16String + szStringLength; it != end; ++it)
587 {
588 U16T u16Char = *it;//1st read
589 if (IN_RANGE(u16Char, 0x0001, 0x007F))//single-byte code point
590 {
591 MU8T mu8Char[1]{};
592 EncodeMUTF8Bmp(u16Char, mu8Char);
593 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
594 }
595 else if (IN_RANGE(u16Char, 0x0080, 0x07FF) || u16Char == 0x0000)//two-byte code point; U+0000 is special-cased into this branch
596 {
597 MU8T mu8Char[2]{};
598 EncodeMUTF8Bmp(u16Char, mu8Char);
599 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
600 }
601 else if (IN_RANGE(u16Char, 0x0800, 0xFFFF))//three-byte code point or part of a surrogate pair
602 {
603 if (IN_RANGE(u16Char, 0xD800, 0xDBFF))//high surrogate
604 {
605 U16T u16HighSurrogate = u16Char;//keep the high surrogate
606 U16T u16LowSurrogate{};//read the low surrogate
607 GET_NEXTCHAR(u16LowSurrogate, (PUSH_FAIL_MU8CHAR));//2nd read
608 //If the read above hit the end of input, the macro already pushed one replacement sequence and left the loop
609
610 //Check the low surrogate range
611 if (!IN_RANGE(u16LowSurrogate, 0xDC00, 0xDFFF))//error: high surrogate not followed by a low surrogate
612 {
613 --it;//undo the read so the non-low-surrogate unit is examined again
614 PUSH_FAIL_MU8CHAR;//emit the replacement sequence for the lone high surrogate
615 continue;//the for-increment moves back onto the unit that was just un-read
616 }
617
618 //Surrogate pairs are special: 6 bytes represent one supplementary code point
619 MU8T mu8Char[6]{};
620 EncodeMUTF8Supplementary(u16HighSurrogate, u16LowSurrogate, mu8Char);
621 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
622 }
623 else//a low surrogate without a preceding high surrogate, or an ordinary 3-byte character
624 {
625 if (IN_RANGE(u16Char, 0xDC00, 0xDFFF))//low surrogate seen before any high surrogate
626 {
627 //No rewind: the stray low surrogate is dropped
628 PUSH_FAIL_MU8CHAR;//error, emit the replacement sequence
629 continue;//move on to the next unit
630 }
631
632 //Encode the 3-byte character
633 MU8T mu8Char[3]{};
634 EncodeMUTF8Bmp(u16Char, mu8Char);
635 mu8String.append(mu8Char, sizeof(mu8Char) / sizeof(MU8T));
636 }
637 }
638 else
639 {
640 assert(false);//not expected to be reachable: the branches above cover 0x0000-0xFFFF
641 }
642 }
643
644 return mu8String;
645
646#undef PUSH_FAIL_MU8CHAR
647 }
648
 //Converts szStringLength MUTF-8 bytes starting at mu8String to UTF-16,
 //pushing code units into the sink u16String (any type with a compatible push_back()) and returning it.
 //Malformed input is replaced by u16FailChar (U+FFFD); the number of replacement units pushed
 //and the amount of rewinding differ per failure point, as noted at each check.
 //Relies on the GET_NEXTCHAR / HAS_BITMASK / IS_BITS helper macros defined earlier in this class.
649 template<typename T = std::basic_string<U16T>>
650 static constexpr T MU8ToU16Impl(const MU8T *mu8String, size_t szStringLength, T u16String = {})
651 {
 //Pushes one replacement code unit to the sink
653#define PUSH_FAIL_U16CHAR u16String.push_back(u16FailChar)
655
656 //The length is passed explicitly, so an empty input needs no special case: the loop body never runs and the sink is returned as-is
657 //A UTF-16 string is terminated by 0x0000
658
659 for (auto it = mu8String, end = mu8String + szStringLength; it != end; ++it)
660 {
661 MU8T mu8Char = *it;//1st read
662 //Determine the sequence length from the lead byte
663 if (HAS_BITMASK(mu8Char, 0b1000'0000, 0b0000'0000))//top bit 0: single-byte code point
664 {
665 //Place into an array
666 MU8T mu8CharArr[1] = { mu8Char };
667
668 //Decode
669 U16T u16Char{};
670 DecodeMUTF8Bmp(mu8CharArr, u16Char);
671 u16String.push_back(u16Char);
672 }
673 else if (HAS_BITMASK(mu8Char, 0b1110'0000, 0b1100'0000))//top 3 bits 110: two-byte code point
674 {
675 //Store the first byte
676 MU8T mu8CharArr[2] = { mu8Char };//[0]=mu8Char
677 //Try to read the next byte
678 GET_NEXTCHAR(mu8CharArr[1], (PUSH_FAIL_U16CHAR));//2nd read
679 //Validate it
680 if (!HAS_BITMASK(mu8CharArr[1], 0b1100'0000, 0b1000'0000))//top 2 bits are not 10: error
681 {
682 --it;//undo the read (so the for-increment does not skip the byte just read)
683 PUSH_FAIL_U16CHAR;//replace with the UTF-16 replacement character
684 continue;//retry: the lead byte may be the bad one and the byte just read may start a valid sequence
685 }
686
687 //Decode
688 U16T u16Char{};
689 DecodeMUTF8Bmp(mu8CharArr, u16Char);
690 u16String.push_back(u16Char);
691 }
692 else if (HAS_BITMASK(mu8Char, 0b1111'0000, 0b1110'0000))//top 4 bits 1110: three-byte code point or surrogate form
693 {
694 //Read the next byte up front; it decides whether this is a surrogate
695 MU8T mu8Next{};
696 GET_NEXTCHAR(mu8Next, (PUSH_FAIL_U16CHAR));//2nd read
697
698 //Validity check (distinguishing surrogates)
699 //A lead byte of 1110'1101 followed by a byte whose top 4 bits are 1010 is treated as the start of a surrogate pair:
700 //the low 4 bits of the lead byte (1101) combined with the top two payload bits of the next byte (10)
701 //give 1101'10xx = 0xD8, the start of the UTF-16 high-surrogate range; the encoder sends surrogates down a separate path
702 if (IS_BITS(mu8Char, 0b1110'1101) && HAS_BITMASK(mu8Next, 0b1111'0000, 0b1010'0000))//surrogate pair; this test must come first
703 {
704 //Store into the array
705 MU8T mu8CharArr[6] = { mu8Char,mu8Next };//[0] = mu8Char, [1] = mu8Next
706
707 //Read and validate the remaining 4 bytes
708
709 //Next: low 6 bits of the high surrogate
710 GET_NEXTCHAR(mu8CharArr[2],
711 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//3rd read
712 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))
713 {
714 //Rewind one read (not two: the previous byte is already known to be a 10-prefixed trailing byte, so it is skipped)
715 --it;
716 //Replace with two replacement characters
717 PUSH_FAIL_U16CHAR;
718 PUSH_FAIL_U16CHAR;
719 continue;
720 }
721
722 //Next: the fixed lead byte of the low surrogate
723 GET_NEXTCHAR(mu8CharArr[3],
724 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//4th read
725 if (!IS_BITS(mu8CharArr[3], 0b1110'1101))
726 {
727 //Rewind one read (not two: the previous byte is already known to be a 10-prefixed trailing byte, so it is skipped)
728 --it;
729 //Replace with three replacement characters
730 PUSH_FAIL_U16CHAR;
731 PUSH_FAIL_U16CHAR;
732 PUSH_FAIL_U16CHAR;
733 continue;
734 }
735
736 //Next: high 4 bits of the low surrogate
737 GET_NEXTCHAR(mu8CharArr[4],
738 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//5th read
739 if (!HAS_BITMASK(mu8CharArr[4], 0b1111'0000, 0b1011'0000))
740 {
741 //Rewind two reads: the previous byte was confirmed as 0b1110'1101, but it may start a valid 3-byte code point
742 --it;
743 --it;
744 //Replace with three replacement characters: because of the double rewind only 3 bytes are written off instead of 4
745 PUSH_FAIL_U16CHAR;
746 PUSH_FAIL_U16CHAR;
747 PUSH_FAIL_U16CHAR;
748 continue;
749 }
750
751 //Read the last byte: low 6 bits of the low surrogate
752 GET_NEXTCHAR(mu8CharArr[5],
753 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//6th read
754 if (!HAS_BITMASK(mu8CharArr[5], 0b1100'0000, 0b1000'0000))
755 {
756 //Rewind one read: the previous byte (101x prefix) cannot start a sequence, and the 111-prefixed byte before it followed by 101 is not a 3-byte code point either, so both are skipped
757 --it;
758 //Replace with five replacement characters
759 PUSH_FAIL_U16CHAR;
760 PUSH_FAIL_U16CHAR;
761 PUSH_FAIL_U16CHAR;
762 PUSH_FAIL_U16CHAR;
763 PUSH_FAIL_U16CHAR;
764 continue;
765 }
766
767 //All checks passed: decode the surrogate pair
768 U16T u16HighSurrogate{}, u16LowSurrogate{};
769 DecodeMUTF8Supplementary(mu8CharArr, u16HighSurrogate, u16LowSurrogate);
770 u16String.push_back(u16HighSurrogate);
771 u16String.push_back(u16LowSurrogate);
772 }
773 else if(HAS_BITMASK(mu8Next, 0b1100'0000, 0b1000'0000))//three-byte code point: with surrogates ruled out, check that the next byte is a 10-prefixed trailing byte
774 {
775 //Store
776 MU8T mu8CharArr[3] = { mu8Char,mu8Next };//[0] = mu8Char, [1] = mu8Next
777
778 //Try to read the next byte
779 GET_NEXTCHAR(mu8CharArr[2],
780 (PUSH_FAIL_U16CHAR, PUSH_FAIL_U16CHAR));//3rd read
781 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))//error: last byte of the 3-byte code point is not a trailing byte
782 {
783 //Rewind one read (not two: the previous byte is already known to be a 10-prefixed trailing byte, so it is skipped)
784 --it;
785 //Replace with two replacement characters
786 PUSH_FAIL_U16CHAR;
787 PUSH_FAIL_U16CHAR;
788 continue;
789 }
790
791 //All 3 bytes are ready: decode
792 U16T u16Char{};
793 DecodeMUTF8Bmp(mu8CharArr, u16Char);
794 u16String.push_back(u16Char);
795 }
796 else
797 {
798 //Undo only the read of mu8Next: mu8Char has already been examined and is the bad byte.
799 //Rewinding all the way to mu8Char would loop forever on the same error,
800 //so rewind to mu8Next only; the for-increment then lands on it again
801 --it;
802 //Replace with one replacement character
803 PUSH_FAIL_U16CHAR;
804 continue;
805 }
806 }
807 else
808 {
809 //Unknown lead byte: skip it and keep going until a valid start byte is found
810 //Replace with one replacement character
811 PUSH_FAIL_U16CHAR;
812 continue;
813 }
814 }
815
816 return u16String;
817
818#undef PUSH_FAIL_U16CHAR
819 }
820
821 /*
822 Modified UTF-8 与 "标准"UTF-8 格式有二点区别:
823 第一,空字符(char)0使用双字节格式0xC0 0x80而非单字节格式0x00,
824 因此 Modified UTF-8字符串不会包含嵌入式空值;
825
826 第二,仅使用标准UTF-8的单字节、双字节和三字节格式。
827 Java虚拟机不识别标准UTF-8的四字节格式,
828 而是使用自定义的双三字节(6字节代理对)格式。
829 */
830
 //Converts szStringLength UTF-8 bytes starting at u8String to MUTF-8,
 //appending to the sink mu8String (needs append() and append_cvt()) and returning it.
 //Only two things are rewritten: 4-byte sequences become the 6-byte surrogate form,
 //and 0x00 becomes 0xC0 0x80. All other bytes are counted and copied through in batches.
 //Relies on the GET_NEXTCHAR / HAS_BITMASK / IS_BITS helper macros defined earlier in this class.
831 template<typename T>
832 static constexpr T U8ToMU8Impl(const U8T *u8String, size_t szStringLength, T mu8String = {})
833 {
 //PUSH_FAIL_MU8CHAR appends one replacement sequence.
 //INSERT_NORMAL(p) appends the szNormalLength pass-through bytes that end just before p, then resets the count
835#define PUSH_FAIL_MU8CHAR mu8String.append(mu8FailChar, sizeof(mu8FailChar) / sizeof(MU8T))
836#define INSERT_NORMAL(p) (mu8String.append_cvt((p) - szNormalLength, szNormalLength), szNormalLength = 0)
838
839 size_t szNormalLength = 0;//number of pending pass-through bytes, so they can be inserted in one batch
840 for (auto it = u8String, end = u8String + szStringLength; it != end; ++it)
841 {
842 //UTF-8 to MUTF-8: handle the null character and convert 4-byte UTF-8 to 6-byte MUTF-8
843 U8T u8Char = *it;//1st read
844 if (HAS_BITMASK(u8Char, 0b1111'1000, 0b1111'0000))//top 5 bits 11110: 4-byte UTF-8 sequence
845 {
846 INSERT_NORMAL(it);//first flush the pass-through bytes that were skipped so far
847 //Convert the 4 UTF-8 bytes to 6 MUTF-8 bytes, handling errors
848
849 U8T u8CharArr[4]{ u8Char };//[0] = u8Char
850
851 GET_NEXTCHAR(u8CharArr[1], (PUSH_FAIL_MU8CHAR));//2nd read
852 if (!HAS_BITMASK(u8CharArr[1], 0b1100'0000, 0b1000'0000))//top 2 bits must be 10
853 {
854 //Emit one replacement sequence
855 PUSH_FAIL_MU8CHAR;
856 --it;//undo one read so the byte that does not start with 10 is processed again
857 continue;
858 }
859
860 GET_NEXTCHAR(u8CharArr[2],
861 (PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR));//3rd read
862 if (!HAS_BITMASK(u8CharArr[2], 0b1100'0000, 0b1000'0000))//top 2 bits must be 10
863 {
864 //Emit two replacement sequences
865 PUSH_FAIL_MU8CHAR;
866 PUSH_FAIL_MU8CHAR;
867 --it;//undo one read so the byte that does not start with 10 is processed again
868 continue;
869 }
870
871 GET_NEXTCHAR(u8CharArr[3],
872 (PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR, PUSH_FAIL_MU8CHAR));//4th read
873 if (!HAS_BITMASK(u8CharArr[3], 0b1100'0000, 0b1000'0000))//top 2 bits must be 10
874 {
875 //Emit three replacement sequences
876 PUSH_FAIL_MU8CHAR;
877 PUSH_FAIL_MU8CHAR;
878 PUSH_FAIL_MU8CHAR;
879 --it;//undo one read so the byte that does not start with 10 is processed again
880 continue;
881 }
882
883 //All four bytes were read successfully
884 MU8T mu8CharArr[6]{};
885 UTF8SupplementaryToMUTF8(u8CharArr, mu8CharArr);
886 mu8String.append(mu8CharArr, sizeof(mu8CharArr) / sizeof(MU8T));
887 }
888 else if (IS_BITS(u8Char, 0b0000'0000))//the \0 character
889 {
890 INSERT_NORMAL(it);//first flush the pass-through bytes that were skipped so far
891
892 MU8T mu8EmptyCharArr[2] = { (MU8T)0xC0,(MU8T)0x80 };//fixed MUTF-8 encoding of the null character
893 mu8String.append(mu8EmptyCharArr, sizeof(mu8EmptyCharArr) / sizeof(MU8T));
894 }
895 else//neither: count it as a pass-through byte, inserted when the next special byte is met
896 {
897 ++szNormalLength;
898 }
899 }
900 //Flush once more after the loop, since the loop may never have entered a special branch.
901 //This flush uses the end of the input as the position the pending bytes are counted back from
902 INSERT_NORMAL(u8String + szStringLength);
903
904
905 return mu8String;
906
907#undef INSERT_NORMAL
908#undef PUSH_FAIL_MU8CHAR
909 }
910
 //Converts szStringLength MUTF-8 bytes starting at mu8String to UTF-8,
 //appending to the sink u8String (needs append(), append_cvt() and push_back()) and returning it.
 //Only two things are rewritten: the 6-byte surrogate form becomes a 4-byte UTF-8 sequence,
 //and 0xC0 0x80 becomes 0x00. All other bytes are counted and copied through in batches.
 //Relies on the GET_NEXTCHAR / HAS_BITMASK / IS_BITS helper macros defined earlier in this class.
911 template<typename T = DynamicString<std::basic_string<U8T>>>
912 static constexpr T MU8ToU8Impl(const MU8T *mu8String, size_t szStringLength, T u8String = {})
913 {
 //PUSH_FAIL_U8CHAR appends one replacement sequence.
 //INSERT_NORMAL(p) appends the szNormalLength pass-through bytes that end just before p, then resets the count
915#define PUSH_FAIL_U8CHAR u8String.append(u8FailChar, sizeof(u8FailChar) / sizeof(U8T))
916#define INSERT_NORMAL(p) (u8String.append_cvt((p) - szNormalLength, szNormalLength), szNormalLength = 0)
918
919 size_t szNormalLength = 0;//number of pending pass-through bytes, so they can be inserted in one batch
920 for (auto it = mu8String, end = mu8String + szStringLength; it != end; ++it)
921 {
922 MU8T mu8Char = *it;//1st read
923
924 if (HAS_BITMASK(mu8Char, 0b1111'0000, 0b1110'0000))//top 4 bits 1110: MUTF-8 three-byte code point or surrogate form
925 {
926 //Read the next byte up front
927 MU8T mu8Next{};
928 if (++it == end)
929 {
930 //Flush everything pending
931 INSERT_NORMAL(it - 1);//note the -1: the flush normally happens at the start of the branch, but the iterator has already been advanced, so step back to the current byte
932 PUSH_FAIL_U8CHAR;//emit the replacement sequence
933 break;
934 }
935 mu8Next = *it;//2nd read
936
937
938 //A lead byte of 1110'1101 followed by a byte whose top 4 bits are 1010 is treated as a surrogate pair
939 if (!IS_BITS(mu8Char, 0b1110'1101) || !HAS_BITMASK(mu8Next, 0b1111'0000, 0b1010'0000))
940 {
941 szNormalLength += 2;//two bytes were consumed, so count both as pass-through
942 continue;//then keep looping
943 }
944
945 //Confirmed surrogate pair: flush everything pending
946 INSERT_NORMAL(it - 1);//note the -1: the flush normally happens at the start of the branch, but mu8Next has already been read, so step back to the current byte
947
948 //Read and validate the remaining 4 bytes
949 MU8T mu8CharArr[6] = { mu8Char, mu8Next };//[0] = mu8Char, [1] = mu8Next
950
951 //Read the next byte
952 GET_NEXTCHAR(mu8CharArr[2],
953 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//3rd read
954 if (!HAS_BITMASK(mu8CharArr[2], 0b1100'0000, 0b1000'0000))
955 {
956 //Rewind one read (not two: the previous byte is already known to be a 10-prefixed trailing byte, so it is skipped)
957 --it;
958 //Replace with two replacement sequences
959 PUSH_FAIL_U8CHAR;
960 PUSH_FAIL_U8CHAR;
961 continue;
962 }
963
964 //Read the next byte
965 GET_NEXTCHAR(mu8CharArr[3],
966 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//4th read
967 if (!IS_BITS(mu8CharArr[3], 0b1110'1101))
968 {
969 //Rewind one read (not two: the previous byte is already known to be a 10-prefixed trailing byte, so it is skipped)
970 --it;
971 //Replace with three replacement sequences
972 PUSH_FAIL_U8CHAR;
973 PUSH_FAIL_U8CHAR;
974 PUSH_FAIL_U8CHAR;
975 continue;
976 }
977
978 //Read the next byte
979 GET_NEXTCHAR(mu8CharArr[4],
980 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//5th read
981 if (!HAS_BITMASK(mu8CharArr[4], 0b1111'0000, 0b1011'0000))
982 {
983 //Rewind two reads: the previous byte was confirmed as 0b1110'1101, but it may start a valid 3-byte code point
984 --it;
985 --it;
986 //Replace with three replacement sequences: because of the double rewind only 3 bytes are written off instead of 4
987 PUSH_FAIL_U8CHAR;
988 PUSH_FAIL_U8CHAR;
989 PUSH_FAIL_U8CHAR;
990 continue;
991 }
992
993 //Read the next byte
994 GET_NEXTCHAR(mu8CharArr[5],
995 (PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR, PUSH_FAIL_U8CHAR));//6th read
996 if (!HAS_BITMASK(mu8CharArr[5], 0b1100'0000, 0b1000'0000))
997 {
998 //Rewind one read so the byte that failed the check is examined again
999 --it;
1000 //Replace with five replacement sequences
1001 PUSH_FAIL_U8CHAR;
1002 PUSH_FAIL_U8CHAR;
1003 PUSH_FAIL_U8CHAR;
1004 PUSH_FAIL_U8CHAR;
1005 PUSH_FAIL_U8CHAR;
1006 continue;
1007 }
1008
1009 //All checks passed: convert
1010 U8T u8CharArr[4]{};
1011 MUTF8SupplementaryToUTF8(mu8CharArr, u8CharArr);
1012 u8String.append(u8CharArr, sizeof(u8CharArr) / sizeof(U8T));
1013 }
1014 else if (IS_BITS(mu8Char, 0xC0))//a byte of 0xC0 always starts a 2-byte sequence, so a missing second byte is an error
1015 {
1016 //Read the next byte up front
1017 MU8T mu8Next{};
1018 if (++it == end)
1019 {
1020 //Flush everything pending
1021 INSERT_NORMAL(it - 1);
1022 PUSH_FAIL_U8CHAR;//emit the replacement sequence
1023 break;
1024 }
1025 mu8Next = *it;//2nd read
1026
1027 if (!IS_BITS(mu8Next, 0x80))//not 0x80: some other byte pattern
1028 {
1029 szNormalLength += 2;//count both bytes as pass-through and continue
1030 continue;
1031 }
1032
1033 //Confirmed null character: flush everything pending
1034 INSERT_NORMAL(it - 1);//note the -1: the flush normally happens at the start of the branch, but mu8Next has already been read, so step back to the current byte
1035 u8String.push_back((U8T)0x00);//insert the null character
1036 }
1037 else
1038 {
1039 ++szNormalLength;//pass-through byte: count it
1040 continue;//keep going
1041 }
1042 }
1043 //Flush whatever is still pending after the loop; the position used here is the loop's end position
1044 INSERT_NORMAL(mu8String + szStringLength);
1045
1046 return u8String;
1047
1048#undef INSERT_NORMAL
1049#undef PUSH_FAIL_U8CHAR
1050 }
1051
1052#undef IN_RANGE
1053#undef IS_BITS
1054#undef HAS_BITMASK
1055#undef GET_NEXTCHAR
1056
1057private:
1058 template<typename T>
1059 static consteval size_t ContentLength(const T &tStr)//获取不包含末尾0的长度(如果有)
1060 {
1061 return tStr.size() > 0 && tStr[tStr.size() - 1] == (typename T::value_type)0x00
1062 ? tStr.size() - 1
1063 : tStr.size();
1064 }
1065
1066public:
1067 //---------------------------------------------------------------------------------------------//
1068
1073 static constexpr size_t U16ToMU8Length(const std::basic_string_view<U16T> &u16String)
1074 {
1075 return U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String.data(), u16String.size()).GetData();
1076 }
1077
1083 static constexpr size_t U16ToMU8Length(const U16T *u16String, size_t szStringLength)
1084 {
1085 return U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String, szStringLength).GetData();
1086 }
1087
1093 static MU8_String U16ToMU8(const std::basic_string_view<U16T> &u16String, size_t szReserve = 0)
1094 {
1095 return U16ToMU8Impl<DynamicString<MU8_String>>(u16String.data(), u16String.size(), { szReserve }).GetData();
1096 }
1097
1104 static MU8_String U16ToMU8(const U16T *u16String, size_t szStringLength, size_t szReserve = 0)
1105 {
1106 return U16ToMU8Impl<DynamicString<MU8_String>>(u16String, szStringLength, { szReserve }).GetData();
1107 }
1108
1114 template<MUTF8_Tool_Internal::StringLiteral u16String>
1115 requires std::is_same_v<typename decltype(u16String)::value_type, U16T>//限定类型
1116 static consteval MU8_String_View U16ToMU8(void)
1117 {
1118 constexpr size_t szStringLength = ContentLength(u16String);
1119 constexpr size_t szNewLength = U16ToMU8Impl<FakeStringCounter<MU8T>>(u16String.data(), szStringLength).GetData();
1120
1122 <
1123 U16ToMU8Impl<StaticString<MU8T, szNewLength>>(u16String.data(), szStringLength).GetData(),
1125 >();
1126 }
1127
1128 //---------------------------------------------------------------------------------------------//
1129
1134 static constexpr size_t U8ToMU8Length(const std::basic_string_view<U8T> &u8String)
1135 {
1136 return U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String.data(), u8String.size()).GetData();
1137 }
1138
1144 static constexpr size_t U8ToMU8Length(const U8T *u8String, size_t szStringLength)
1145 {
1146 return U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String, szStringLength).GetData();
1147 }
1148
1154 static MU8_String U8ToMU8(const std::basic_string_view<U8T> &u8String, size_t szReserve = 0)
1155 {
1156 return U8ToMU8Impl<DynamicString<MU8_String>>(u8String.data(), u8String.size(), { szReserve }).GetData();
1157 }
1158
1165 static MU8_String U8ToMU8(const U8T *u8String, size_t szStringLength, size_t szReserve = 0)
1166 {
1167 return U8ToMU8Impl<DynamicString<MU8_String>>(u8String, szStringLength, { szReserve }).GetData();
1168 }
1169
1175 template<MUTF8_Tool_Internal::StringLiteral u8String>
1176 requires std::is_same_v<typename decltype(u8String)::value_type, U8T>//限定类型
1177 static consteval MU8_String_View U8ToMU8(void)
1178 {
1179 constexpr size_t szStringLength = ContentLength(u8String);
1180 constexpr size_t szNewLength = U8ToMU8Impl<FakeStringCounter<MU8T>>(u8String.data(), szStringLength).GetData();
1181
1182 return MUTF8_Tool_Internal::ToStringView<U8ToMU8Impl
1183 <
1184 StaticString<MU8T, szNewLength>>(u8String.data(), szStringLength).GetData(),
1186 >();
1187 }
1188
1189 //---------------------------------------------------------------------------------------------//
1190 //---------------------------------------------------------------------------------------------//
1191
1196 static constexpr size_t MU8ToU16Length(const MU8_String_View &mu8String)
1197 {
1198 return MU8ToU16Impl<FakeStringCounter<U16T>>(mu8String.data(), mu8String.size()).GetData();
1199 }
1200
1206 static constexpr size_t MU8ToU16Length(const MU8T *mu8String, size_t szStringLength)
1207 {
1208 return MU8ToU16Impl<FakeStringCounter<U16T>>(mu8String, szStringLength).GetData();
1209 }
1210
1216 static std::basic_string<U16T> MU8ToU16(const MU8_String_View &mu8String, size_t szReserve = 0)
1217 {
1218 return MU8ToU16Impl<DynamicString<std::basic_string<U16T>>>(mu8String.data(), mu8String.size(), { szReserve }).GetData();
1219 }
1220
1227 static std::basic_string<U16T> MU8ToU16(const MU8T *mu8String, size_t szStringLength, size_t szReserve = 0)
1228 {
1229 return MU8ToU16Impl<DynamicString<std::basic_string<U16T>>>(mu8String, szStringLength, { szReserve }).GetData();
1230 }
1231
1232 //---------------------------------------------------------------------------------------------//
1233
1238 static constexpr size_t MU8ToU8Length(const MU8_String_View &mu8String)
1239 {
1240 return MU8ToU8Impl<FakeStringCounter<U8T>>(mu8String.data(), mu8String.size()).GetData();
1241 }
1242
1248 static constexpr size_t MU8ToU8Length(const MU8T *mu8String, size_t szStringLength)
1249 {
1250 return MU8ToU8Impl<FakeStringCounter<U8T>>(mu8String, szStringLength).GetData();
1251 }
1252
1258 static std::basic_string<U8T> MU8ToU8(const MU8_String_View &mu8String, size_t szReserve = 0)
1259 {
1260 return MU8ToU8Impl<DynamicString<std::basic_string<U8T>>>(mu8String.data(), mu8String.size(), { szReserve }).GetData();
1261 }
1262
1269 static std::basic_string<U8T> MU8ToU8(const MU8T *mu8String, size_t szStringLength, size_t szReserve = 0)
1270 {
1271 return MU8ToU8Impl<DynamicString<std::basic_string<U8T>>>(mu8String, szStringLength, { szReserve }).GetData();
1272 }
1273
1274 //---------------------------------------------------------------------------------------------//
1275};
1276
1277//--------------------------------------------辅助调用宏--------------------------------------------//
1278
// Dynamic (run-time) conversion helpers.
// Each macro forwards its single argument to the matching static member of the
// default MUTF8_Tool<> instantiation.

/// @brief UTF-16 -> M-UTF-8: expands to MUTF8_Tool<>::U16ToMU8(src).
#define U16CV2MU8(src) MUTF8_Tool<>::U16ToMU8(src)

/// @brief M-UTF-8 -> UTF-16: expands to MUTF8_Tool<>::MU8ToU16(src).
#define MU8CV2U16(src) MUTF8_Tool<>::MU8ToU16(src)

/// @brief UTF-8 -> M-UTF-8: expands to MUTF8_Tool<>::U8ToMU8(src).
#define U8CV2MU8(src) MUTF8_Tool<>::U8ToMU8(src)

/// @brief M-UTF-8 -> UTF-8: expands to MUTF8_Tool<>::MU8ToU8(src).
#define MU8CV2U8(src) MUTF8_Tool<>::MU8ToU8(src)
1300
// Static (compile-time) conversion helpers.
// In M-UTF-8, any terminating NUL of a string is mapped to 0xC0 0x80 and the string
// is guaranteed to contain no NUL byte, so the result is, to some extent, compatible
// with C strings (which are NUL-terminated).

/// @brief UTF-16 literal -> compile-time M-UTF-8: the literal is passed as the template
///        argument of the consteval MUTF8_Tool<>::U16ToMU8<>() overload.
#define U16TOMU8STR(literal) (MUTF8_Tool<>::U16ToMU8<literal>())

/// @brief UTF-8 literal -> compile-time M-UTF-8: the literal is passed as the template
///        argument of the consteval MUTF8_Tool<>::U8ToMU8<>() overload.
#define U8TOMU8STR(literal) (MUTF8_Tool<>::U8ToMU8<literal>())
1315
1316//---------------------------------------------------------------------------------------------//
1317
1318//英文原文
1319/*
1320Modified UTF-8 Strings
1321The JNI uses modified UTF-8 strings to represent various string types. Modified UTF-8 strings are the same as those used by the Java VM. Modified UTF-8 strings are encoded so that character sequences that contain only non-null ASCII characters can be represented using only one byte per character, but all Unicode characters can be represented.
1322
1323All characters in the range \u0001 to \u007F are represented by a single byte, as follows:
1324
13250xxxxxxx
1326The seven bits of data in the byte give the value of the character represented.
1327
1328The null character ('\u0000') and characters in the range '\u0080' to '\u07FF' are represented by a pair of bytes x and y:
1329
1330x: 110xxxxx
1331y: 10yyyyyy
1332The bytes represent the character with the value ((x & 0x1f) << 6) + (y & 0x3f).
1333
1334Characters in the range '\u0800' to '\uFFFF' are represented by 3 bytes x, y, and z:
1335
1336x: 1110xxxx
1337y: 10yyyyyy
1338z: 10zzzzzz
1339The character with the value ((x & 0xf) << 12) + ((y & 0x3f) << 6) + (z & 0x3f) is represented by the bytes.
1340
1341Characters with code points above U+FFFF (so-called supplementary characters) are represented by separately encoding the two surrogate code units of their UTF-16 representation. Each of the surrogate code units is represented by three bytes. This means, supplementary characters are represented by six bytes, u, v, w, x, y, and z:
1342
1343u: 11101101
1344v: 1010vvvv
1345w: 10wwwwww
1346x: 11101101
1347y: 1011yyyy
1348z: 10zzzzzz
1349The character with the value 0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) is represented by the six bytes.
1350
1351The bytes of multibyte characters are stored in the class file in big-endian (high byte first) order.
1352
1353There are two differences between this format and the standard UTF-8 format. First, the null character (char)0 is encoded using the two-byte format rather than the one-byte format. This means that modified UTF-8 strings never have embedded nulls. Second, only the one-byte, two-byte, and three-byte formats of standard UTF-8 are used. The Java VM does not recognize the four-byte format of standard UTF-8; it uses its own two-times-three-byte format instead.
1354
1355For more information regarding the standard UTF-8 format, see section 3.9 Unicode Encoding Forms of The Unicode Standard, Version 4.0.
1356*/
1357
1358//中文翻译
1359/*
1360修改后的 UTF-8 字符串
1361JNI 使用修改后的 UTF-8 字符串来表示各种字符串类型。修改后的 UTF-8 字符串与 Java VM 所使用的字符串相同。修改后的 UTF-8 字符串经过编码,使得仅包含非空 ASCII 字符的字符序列可以每个字符仅使用一个字节来表示,但所有 Unicode 字符都可以被表示。
1362
1363范围在 \u0001 到 \u007F 之间的所有字符都由单个字节表示,如下所示:
1364
13650xxxxxxx
1366字节中的七位数据给出了所表示字符的值。
1367
1368空字符 ('\u0000') 和范围在 '\u0080' 到 '\u07FF' 之间的字符由一对字节 x 和 y 表示:
1369
1370x: 110xxxxx
1371y: 10yyyyyy
1372这些字节表示值为 ((x & 0x1f) << 6) + (y & 0x3f) 的字符。
1373
1374范围在 '\u0800' 到 '\uFFFF' 之间的字符由三个字节 x、y 和 z 表示:
1375
1376x: 1110xxxx
1377y: 10yyyyyy
1378z: 10zzzzzz
1379值为 ((x & 0xf) << 12) + ((y & 0x3f) << 6) + (z & 0x3f) 的字符由这些字节表示。
1380
1381码点高于 U+FFFF 的字符(即所谓的补充字符)通过分别编码其 UTF-16 表示的二个代理码元来表示。每个代理码元由三个字节表示。这意味着,补充字符由六个字节 u、v、w、x、y 和 z 表示:
1382
1383u: 11101101
1384v: 1010vvvv
1385w: 10wwwwww
1386x: 11101101
1387y: 1011yyyy
1388z: 10zzzzzz
1389值为 0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) 的字符由这六个字节表示。
1390
1391多字节字符的字节在类文件中以大端序(高位字节在前)存储。
1392
1393此格式与标准 UTF-8 格式有二个区别。首先,空字符 (char)0 使用双字节格式而非单字节格式进行编码。这意味着修改后的 UTF-8 字符串永远不会包含嵌入的空字符。其次,仅使用标准 UTF-8 的单字节、双字节和三字节格式。Java VM 不识别标准 UTF-8 的四字节格式;它使用自己的二次三字节格式来代替。
1394
1395有关标准 UTF-8 格式的更多信息,请参阅 Unicode 标准 4.0 版的第 3.9 节 Unicode 编码形式。
1396*/
uint8_t MUTF8_Char_Type
默认的M-UTF-8字符类型
定义 MUTF8_Tool.hpp:59
std::basic_string< MUTF8_Char_Type, MUTF8_Char_Traits< MUTF8_Char_Type > > MUTF8_String
默认的M-UTF-8字符串对象类型
定义 MUTF8_Tool.hpp:250
std::basic_string_view< MUTF8_Char_Type, MUTF8_Char_Traits< MUTF8_Char_Type > > MUTF8_String_View
默认的M-UTF-8字符串对象视图类型
定义 MUTF8_Tool.hpp:252
@ String
对应NBT_Type::String
定义 NBT_TAG.hpp:26
std::array< T, N > Super
父类类型定义
定义 MUTF8_Tool.hpp:32
constexpr StringLiteral(const T(&_tStr)[N]) noexcept
从字符串数组的引用拷贝构造
定义 MUTF8_Tool.hpp:39
constexpr ~StringLiteral(void)=default
默认析构
static MU8_String U16ToMU8(const U16T *u16String, size_t szStringLength, size_t szReserve=0)
获取UTF-16转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:1104
static std::basic_string< U8T > MU8ToU8(const MU8T *mu8String, size_t szStringLength, size_t szReserve=0)
获取M-UTF-8转换到UTF-8的字符串
定义 MUTF8_Tool.hpp:1269
static std::basic_string< U16T > MU8ToU16(const MU8_String_View &mu8String, size_t szReserve=0)
获取M-UTF-8转换到UTF-16的字符串
定义 MUTF8_Tool.hpp:1216
static MU8_String U8ToMU8(const U8T *u8String, size_t szStringLength, size_t szReserve=0)
获取UTF-8转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:1165
static constexpr size_t MU8ToU8Length(const MU8_String_View &mu8String)
精确计算M-UTF-8转换到UTF-8所需的UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1238
U8T U8_T
模板UTF-8字符类型的代理
定义 MUTF8_Tool.hpp:280
static constexpr size_t U16ToMU8Length(const std::basic_string_view< U16T > &u16String)
精确计算UTF-16转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1073
static constexpr size_t MU8ToU8Length(const MU8T *mu8String, size_t szStringLength)
精确计算M-UTF-8转换到UTF-8所需的UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1248
std::conditional_t< std::is_same_v< MU8T, MUTF8_Char_Type >, MUTF8_String_View, std::basic_string_view< MU8T > > MU8_String_View
根据实际MU8T类型决定使用特化版的MUTF8字符串视图类型还是标准库字符串视图类型
定义 MUTF8_Tool.hpp:285
static constexpr size_t U8ToMU8Length(const std::basic_string_view< U8T > &u8String)
精确计算UTF-8转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1134
static consteval MU8_String_View U8ToMU8(void)
通过UTF-8字符串字面量,直接获得编译期的M-UTF-8静态字符串
定义 MUTF8_Tool.hpp:1177
static constexpr size_t MU8ToU16Length(const MU8_String_View &mu8String)
精确计算M-UTF-8转换到UTF-16所需的UTF-16字符串的长度
定义 MUTF8_Tool.hpp:1196
MU8T MU8_T
模板M-UTF-8字符类型的代理
定义 MUTF8_Tool.hpp:276
static constexpr size_t U16ToMU8Length(const U16T *u16String, size_t szStringLength)
精确计算UTF-16转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1083
U16T U16_T
模板UTF-16字符类型的代理
定义 MUTF8_Tool.hpp:278
static MU8_String U8ToMU8(const std::basic_string_view< U8T > &u8String, size_t szReserve=0)
获取UTF-8转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:1154
static constexpr size_t MU8ToU16Length(const MU8T *mu8String, size_t szStringLength)
精确计算M-UTF-8转换到UTF-16所需的UTF-16字符串的长度
定义 MUTF8_Tool.hpp:1206
std::conditional_t< std::is_same_v< MU8T, MUTF8_Char_Type >, MUTF8_String, std::basic_string< MU8T > > MU8_String
根据实际MU8T类型决定使用特化版的MUTF8字符串类型还是标准库字符串类型
定义 MUTF8_Tool.hpp:283
static constexpr size_t U8ToMU8Length(const U8T *u8String, size_t szStringLength)
精确计算UTF-8转换到M-UTF-8所需的M-UTF-8字符串的长度
定义 MUTF8_Tool.hpp:1144
static std::basic_string< U16T > MU8ToU16(const MU8T *mu8String, size_t szStringLength, size_t szReserve=0)
获取M-UTF-8转换到UTF-16的字符串
定义 MUTF8_Tool.hpp:1227
static MU8_String U16ToMU8(const std::basic_string_view< U16T > &u16String, size_t szReserve=0)
获取UTF-16转换到M-UTF-8的字符串
定义 MUTF8_Tool.hpp:1093
static consteval MU8_String_View U16ToMU8(void)
通过UTF-16字符串字面量,直接获得编译期的M-UTF-8静态字符串
定义 MUTF8_Tool.hpp:1116
static std::basic_string< U8T > MU8ToU8(const MU8_String_View &mu8String, size_t szReserve=0)
获取M-UTF-8转换到UTF-8的字符串
定义 MUTF8_Tool.hpp:1258
用于存放MUTF8_Tool使用的,无法存在于类内的辅助类
定义 MUTF8_Tool.hpp:22
consteval View_Type ToStringView(void) noexcept
利用模板固化编译期求值函数返回的数组临时量
定义 MUTF8_Tool.hpp:52
一个用于符合标准库使用的,自定义的Char_Traits
定义 MUTF8_Tool.hpp:66