Replace assertions on malformed Unicode input with replacement characters.

plString::IConvertFromUtf16 no longer calls hsAssert for an incomplete or
invalid surrogate pair; it assigns BADCHAR_REPLACEMENT instead.  The
out-of-range branch of plString::IConvertFromUtf32 drops its hsAssert and
still adds 3 to convlen for U+FFFD.  PlStringTest.ConvertInvalid gains
cases for an incomplete pair, two high surrogates in a row, and a low
surrogate followed by a non-surrogate.

Review correction: the bad_combo case now converts bad_combo_c.  It
previously converted double_low_c again, which left bad_combo_c unused and
the low-surrogate-first path untested.  The blob hash on the test file's
index line was not recomputed for this correction.

diff --git a/Sources/Plasma/CoreLib/plString.cpp b/Sources/Plasma/CoreLib/plString.cpp
index 754b7882..2b706612 100644
--- a/Sources/Plasma/CoreLib/plString.cpp
+++ b/Sources/Plasma/CoreLib/plString.cpp
@@ -171,18 +171,24 @@ void plString::IConvertFromUtf16(const uint16_t *utf16, size_t size)
             plUniChar unichar = 0x10000;
 
             if (sp + 1 >= utf16 + size) {
-                hsAssert(0, "Incomplete surrogate pair in UTF-16 data");
+                // Incomplete surrogate pair
                 unichar = BADCHAR_REPLACEMENT;
             } else if (*sp < 0xDC00) {
                 unichar += (*sp++ & 0x3FF) << 10;
-                hsAssert(*sp >= 0xDC00 && *sp <= 0xDFFF,
-                         "Invalid surrogate pair in UTF-16 data");
-                unichar += (*sp & 0x3FF);
+                if (*sp < 0xDC00 || *sp > 0xDFFF) {
+                    // Invalid surrogate pair
+                    unichar = BADCHAR_REPLACEMENT;
+                } else {
+                    unichar += (*sp & 0x3FF);
+                }
             } else {
                 unichar += (*sp++ & 0x3FF);
-                hsAssert(*sp >= 0xD800 && *sp < 0xDC00,
-                         "Invalid surrogate pair in UTF-16 data");
-                unichar += (*sp & 0x3FF) << 10;
+                if (*sp < 0xD800 || *sp >= 0xDC00) {
+                    // Invalid surrogate pair
+                    unichar = BADCHAR_REPLACEMENT;
+                } else {
+                    unichar += (*sp & 0x3FF) << 10;
+                }
             }
             *dp++ = 0xF0 | ((unichar >> 18) & 0x07);
             *dp++ = 0x80 | ((unichar >> 12) & 0x3F);
@@ -229,8 +235,8 @@ void plString::IConvertFromUtf32(const plUniChar *ustr, size_t size)
     const plUniChar *sp = ustr;
     while (sp < ustr + size) {
         if (*sp > 0x10FFFF) {
-            hsAssert(0, "UTF-32 character out of range");
-            convlen += 3; // Use U+FFFD for release builds
+            // Invalid character gets replaced with U+FFFD
+            convlen += 3;
         }
         else if (*sp > 0xFFFF)
             convlen += 4;
diff --git a/Sources/Tests/CoreTests/test_plString.cpp b/Sources/Tests/CoreTests/test_plString.cpp
index efc1103d..9e3988b1 100644
--- a/Sources/Tests/CoreTests/test_plString.cpp
+++ b/Sources/Tests/CoreTests/test_plString.cpp
@@ -123,18 +123,30 @@ TEST(PlStringTest, ConvertInvalid)
 {
     // The following should encode replacement characters for invalid chars
     const plUniChar unicode_replacement[] = { 0xfffd, 0 };
+    const char latin1_replacement[] = "?";
 
-    const plUniChar char_too_big[] = { 0xffffff, 0 };
-    plUnicodeBuffer too_big = plString::FromUtf32(char_too_big).GetUnicodeArray();
+    // Character outside of Unicode specification range
+    const plUniChar too_big_c[] = { 0xffffff, 0 };
+    plUnicodeBuffer too_big = plString::FromUtf32(too_big_c).GetUnicodeArray();
     EXPECT_EQ(0, T_strcmp(unicode_replacement, too_big.GetData()));
 
-    // TODO: Invalid surrogate pairs can encode to 0xfffd, but it's handled
-    // by an assert right now.
+    // Invalid surrogate pairs
+    const uint16_t incomplete_surr_c[] = { 0xd800, 0 };
+    plString incomplete_surr = plString::FromUtf16(incomplete_surr_c);
+    EXPECT_EQ(0, T_strcmp(unicode_replacement,
+                          incomplete_surr.GetUnicodeArray().GetData()));
+
+    const uint16_t double_low_c[] = { 0xd800, 0xd801, 0 };
+    plString double_low = plString::FromUtf16(double_low_c);
+    EXPECT_EQ(0, T_strcmp(unicode_replacement, double_low.GetUnicodeArray().GetData()));
+
+    const uint16_t bad_combo_c[] = { 0xdc00, 0x20, 0 };
+    plString bad_combo = plString::FromUtf16(bad_combo_c);
+    EXPECT_EQ(0, T_strcmp(unicode_replacement, bad_combo.GetUnicodeArray().GetData()));
 
     // ISO-8859-1 doesn't have \ufffd, so it uses '?' instead
-    const plUniChar high_char[] = { 0x1ff, 0 };
-    const char latin1_replacement[] = "?";
-    plStringBuffer non_latin1 = plString::FromUtf32(high_char).ToIso8859_1();
+    const plUniChar non_latin1_c[] = { 0x1ff, 0 };
+    plStringBuffer non_latin1 = plString::FromUtf32(non_latin1_c).ToIso8859_1();
     EXPECT_STREQ(latin1_replacement, non_latin1.GetData());
 }
 