diff --git a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp index 53a45ed008a491..39bdbb1a11d0f0 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp @@ -224,6 +224,23 @@ PALTEST(locale_info_MultiByteToWideChar_test4_paltest_multibytetowidechar_test4, free(wideBuffer); } + { + // U+6F22 — code unit > U+00FF catches long-code-path bugs that move + // only the low byte instead of performing a full 16-bit byte swap. + const char utf8[] = "\xE6\xBC\xA2"; + WCHAR wide[1] = { 0 }; + size_t n = minipal_convert_utf8_to_utf16(utf8, sizeof(utf8) - 1, (CHAR16_T*)wide, 1, 0); + if (n != 1 || wide[0] != 0x6F22) + Fail("utf8->utf16 produced 0x%04x (n=%zu)\n", wide[0], n); + +#if BIGENDIAN + wide[0] = 0; + n = minipal_convert_utf8_to_utf16(utf8, sizeof(utf8) - 1, (CHAR16_T*)wide, 1, MINIPAL_TREAT_AS_LITTLE_ENDIAN); + if (n != 1 || wide[0] != 0x226F) + Fail("treat-as-LE utf8->utf16 produced 0x%04x (n=%zu)\n", wide[0], n); +#endif + } + #if BIGENDIAN { const char* ascii = "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"; diff --git a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp index 697730b9dbd026..115faf95bfd2d3 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp @@ -148,6 +148,27 @@ PALTEST(locale_info_WideCharToMultiByte_test5_paltest_widechartomultibyte_test5, free(utf8Buffer); } + { + // U+6F22 — code unit > U+00FF catches long-code-path bugs that move + // only the low byte instead of performing a full 16-bit byte swap. + const WCHAR srcNative[1] = { 0x6F22 }; + CHAR utf8[4] = { 0 }; + size_t n = minipal_convert_utf16_to_utf8((const CHAR16_T*)srcNative, 1, utf8, sizeof(utf8), 0); + if (n != 3 || memcmp(utf8, "\xE6\xBC\xA2", 3) != 0) + Fail("utf16->utf8 produced %02x %02x %02x (n=%zu)\n", + (unsigned char)utf8[0], (unsigned char)utf8[1], (unsigned char)utf8[2], n); + +#if BIGENDIAN + // Stored little-endian (22 6F) reads as BE word 0x226F. + const WCHAR srcLE[1] = { 0x226F }; + memset(utf8, 0, sizeof(utf8)); + n = minipal_convert_utf16_to_utf8((const CHAR16_T*)srcLE, 1, utf8, sizeof(utf8), MINIPAL_TREAT_AS_LITTLE_ENDIAN); + if (n != 3 || memcmp(utf8, "\xE6\xBC\xA2", 3) != 0) + Fail("treat-as-LE utf16->utf8 produced %02x %02x %02x (n=%zu)\n", + (unsigned char)utf8[0], (unsigned char)utf8[1], (unsigned char)utf8[2], n); +#endif + } + #if BIGENDIAN { const char* expected = "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"; diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c index e0b25fce415473..c1a3d3b9d37a31 100644 --- a/src/native/minipal/utf8.c +++ b/src/native/minipal/utf8.c @@ -983,7 +983,7 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun } #if BIGENDIAN if (self->treatAsLE) - *pTarget = ((CHAR16_T)ch)<<8; + *pTarget = (((CHAR16_T)ch) << 8 | ((CHAR16_T)ch) >> 8); else #endif *pTarget = (CHAR16_T)ch; @@ -1438,7 +1438,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead #if BIGENDIAN if (self->treatAsLE) - ch = (*pSrc) >> 8; + ch = (CHAR16_T)(((*pSrc) >> 8) | ((*pSrc) << 8)); else #endif ch = *pSrc; @@ -1581,7 +1581,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un { #if BIGENDIAN if (self->treatAsLE) - ch = (*pSrc) >> 8; + ch = (CHAR16_T)(((*pSrc) >> 8) | ((*pSrc) << 8)); else #endif ch = *pSrc; @@ -1619,7 +1619,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un { #if BIGENDIAN if (self->treatAsLE) - ch = (*pSrc) >> 8; + ch = (CHAR16_T)(((*pSrc) >> 8) | ((*pSrc) << 8)); else #endif ch = *pSrc; @@ -1635,7 +1635,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un { #if BIGENDIAN if (self->treatAsLE) - ch = (*pSrc) >> 8; + ch = (CHAR16_T)(((*pSrc) >> 8) | ((*pSrc) << 8)); else #endif ch = *pSrc; @@ -1652,10 +1652,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un ch = *(int*)pSrc; int chc = *(int*)(pSrc + 2); #if BIGENDIAN - if (self->treatAsLE){ - if (((ch | chc) & (int)0x80FF80FF) != 0) goto LongCodeWithMask; - } - else + if (((ch | chc) & (int)0x80FF80FF) != 0) goto LongCodeWithMask; #else if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask; #endif