Browse Source

Use U+FFFD on Debug code too, and finish test cases for replacement

Michael Hansen 10 years ago
parent
commit
56553c8271
  1. 24
      Sources/Plasma/CoreLib/plString.cpp
  2. 26
      Sources/Tests/CoreTests/test_plString.cpp

24
Sources/Plasma/CoreLib/plString.cpp

@ -171,18 +171,24 @@ void plString::IConvertFromUtf16(const uint16_t *utf16, size_t size)
plUniChar unichar = 0x10000; plUniChar unichar = 0x10000;
if (sp + 1 >= utf16 + size) { if (sp + 1 >= utf16 + size) {
hsAssert(0, "Incomplete surrogate pair in UTF-16 data"); // Incomplete surrogate pair
unichar = BADCHAR_REPLACEMENT; unichar = BADCHAR_REPLACEMENT;
} else if (*sp < 0xDC00) { } else if (*sp < 0xDC00) {
unichar += (*sp++ & 0x3FF) << 10; unichar += (*sp++ & 0x3FF) << 10;
hsAssert(*sp >= 0xDC00 && *sp <= 0xDFFF, if (*sp < 0xDC00 || *sp > 0xDFFF) {
"Invalid surrogate pair in UTF-16 data"); // Invalid surrogate pair
unichar += (*sp & 0x3FF); unichar = BADCHAR_REPLACEMENT;
} else {
unichar += (*sp & 0x3FF);
}
} else { } else {
unichar += (*sp++ & 0x3FF); unichar += (*sp++ & 0x3FF);
hsAssert(*sp >= 0xD800 && *sp < 0xDC00, if (*sp < 0xD800 || *sp >= 0xDC00) {
"Invalid surrogate pair in UTF-16 data"); // Invalid surrogate pair
unichar += (*sp & 0x3FF) << 10; unichar = BADCHAR_REPLACEMENT;
} else {
unichar += (*sp & 0x3FF) << 10;
}
} }
*dp++ = 0xF0 | ((unichar >> 18) & 0x07); *dp++ = 0xF0 | ((unichar >> 18) & 0x07);
*dp++ = 0x80 | ((unichar >> 12) & 0x3F); *dp++ = 0x80 | ((unichar >> 12) & 0x3F);
@ -229,8 +235,8 @@ void plString::IConvertFromUtf32(const plUniChar *ustr, size_t size)
const plUniChar *sp = ustr; const plUniChar *sp = ustr;
while (sp < ustr + size) { while (sp < ustr + size) {
if (*sp > 0x10FFFF) { if (*sp > 0x10FFFF) {
hsAssert(0, "UTF-32 character out of range"); // Invalid character gets replaced with U+FFFD
convlen += 3; // Use U+FFFD for release builds convlen += 3;
} }
else if (*sp > 0xFFFF) else if (*sp > 0xFFFF)
convlen += 4; convlen += 4;

26
Sources/Tests/CoreTests/test_plString.cpp

@ -123,18 +123,30 @@ TEST(PlStringTest, ConvertInvalid)
{ {
// The following should encode replacement characters for invalid chars // The following should encode replacement characters for invalid chars
const plUniChar unicode_replacement[] = { 0xfffd, 0 }; const plUniChar unicode_replacement[] = { 0xfffd, 0 };
const char latin1_replacement[] = "?";
const plUniChar char_too_big[] = { 0xffffff, 0 }; // Character outside of Unicode specification range
plUnicodeBuffer too_big = plString::FromUtf32(char_too_big).GetUnicodeArray(); const plUniChar too_big_c[] = { 0xffffff, 0 };
plUnicodeBuffer too_big = plString::FromUtf32(too_big_c).GetUnicodeArray();
EXPECT_EQ(0, T_strcmp(unicode_replacement, too_big.GetData())); EXPECT_EQ(0, T_strcmp(unicode_replacement, too_big.GetData()));
// TODO: Invalid surrogate pairs can encode to 0xfffd, but it's handled // Invalid surrogate pairs
// by an assert right now. const uint16_t incomplete_surr_c[] = { 0xd800, 0 };
plString incomplete_surr = plString::FromUtf16(incomplete_surr_c);
EXPECT_EQ(0, T_strcmp(unicode_replacement,
incomplete_surr.GetUnicodeArray().GetData()));
const uint16_t double_low_c[] = { 0xd800, 0xd801, 0 };
plString double_low = plString::FromUtf16(double_low_c);
EXPECT_EQ(0, T_strcmp(unicode_replacement, double_low.GetUnicodeArray().GetData()));
const uint16_t bad_combo_c[] = { 0xdc00, 0x20, 0 };
plString bad_combo = plString::FromUtf16(bad_combo_c);
EXPECT_EQ(0, T_strcmp(unicode_replacement, bad_combo.GetUnicodeArray().GetData()));
// ISO-8859-1 doesn't have \ufffd, so it uses '?' instead // ISO-8859-1 doesn't have \ufffd, so it uses '?' instead
const plUniChar high_char[] = { 0x1ff, 0 }; const plUniChar non_latin1_c[] = { 0x1ff, 0 };
const char latin1_replacement[] = "?"; plStringBuffer<char> non_latin1 = plString::FromUtf32(non_latin1_c).ToIso8859_1();
plStringBuffer<char> non_latin1 = plString::FromUtf32(high_char).ToIso8859_1();
EXPECT_STREQ(latin1_replacement, non_latin1.GetData()); EXPECT_STREQ(latin1_replacement, non_latin1.GetData());
} }

Loading…
Cancel
Save