From d8e6d79fbd78d79dda3fcbd6fd39f5eab7e43df9 Mon Sep 17 00:00:00 2001
From: Michael Hansen
Date: Fri, 13 Feb 2015 00:07:45 -0800
Subject: [PATCH] More fully test encoding conversions.

Also fixes some existing bugs:
- UTF-16 surrogate pairs were getting encoded incorrectly
- Signed shift converting from ISO-8859-1 got too many bits
- Edge case incorrect for converting to ISO-8859-1
---
 Sources/Plasma/CoreLib/plString.cpp       |   9 +-
 Sources/Tests/CoreTests/test_plString.cpp | 121 +++++++++++++++++++---
 2 files changed, 110 insertions(+), 20 deletions(-)

diff --git a/Sources/Plasma/CoreLib/plString.cpp b/Sources/Plasma/CoreLib/plString.cpp
index 08bab171..754b7882 100644
--- a/Sources/Plasma/CoreLib/plString.cpp
+++ b/Sources/Plasma/CoreLib/plString.cpp
@@ -299,9 +299,9 @@ void plString::IConvertFromIso8859_1(const char *astr, size_t size)
     char *dp = utf8;
     sp = astr;
     while (sp < astr + size) {
-        if (*astr & 0x80) {
-            *dp++ = 0xC0 | ((*sp >> 6) & 0x1F);
-            *dp++ = 0x80 | ((*sp     ) & 0x3F);
+        if (*sp & 0x80) {
+            *dp++ = 0xC0 | ((uint8_t(*sp) >> 6) & 0x1F);
+            *dp++ = 0x80 | ((uint8_t(*sp)     ) & 0x3F);
         } else {
             *dp++ = *sp;
         }
@@ -347,6 +347,7 @@ plStringBuffer<uint16_t> plString::ToUtf16() const
             unichar |= (*sp++ & 0x3F) << 12;
             unichar |= (*sp++ & 0x3F) << 6;
             unichar |= (*sp++ & 0x3F);
+            unichar -= 0x10000;
 
             *dp++ = 0xD800 | ((unichar >> 10) & 0x3FF);
             *dp++ = 0xDC00 | ((unichar      ) & 0x3FF);
@@ -424,7 +425,7 @@ plStringBuffer<char> plString::ToIso8859_1() const
         } else {
             unichar = *sp++;
         }
-        *dp++ = (unichar < 0xFF) ? unichar : '?';
+        *dp++ = (unichar < 0x100) ? unichar : '?';
     }
     astr[convlen] = 0;
 
diff --git a/Sources/Tests/CoreTests/test_plString.cpp b/Sources/Tests/CoreTests/test_plString.cpp
index f9cf0566..ff1fa312 100644
--- a/Sources/Tests/CoreTests/test_plString.cpp
+++ b/Sources/Tests/CoreTests/test_plString.cpp
@@ -4,30 +4,119 @@
 #include
 #include
 
+static const plUniChar test_data[] = {
+    0x20,       0x7f,       /* Normal ASCII chars */
+    0xff,       0x100,      /* 8-bit boundary chars */
+    0x7fff,                 /* Largest 15-bit value (3-byte UTF-8) */
+    0xffff,     0x10000,    /* 16-bit boundary chars */
+    0x10020,    0x40000,    /* Non-edge UTF-16 surrogate pairs */
+    0x10ffff,               /* Highest Unicode character */
+    0                       /* Null terminator */
+};
+
+/* UTF-8 version of above test data */
+static const char utf8_test_data[] =
+    "\x20"              "\x7f"
+    "\xc3\xbf"          "\xc4\x80"
+    "\xe7\xbf\xbf"
+    "\xef\xbf\xbf"      "\xf0\x90\x80\x80"
+    "\xf0\x90\x80\xa0"  "\xf1\x80\x80\x80"
+    "\xf4\x8f\xbf\xbf";
+
+/* UTF-16 version of test data */
+static const uint16_t utf16_test_data[] = {
+    0x20, 0x7f,
+    0xff, 0x100,
+    0x7fff,
+    0xffff,
+    /* surrogate pairs for chars >0xffff */
+    0xd800, 0xdc00,
+    0xd800, 0xdc20,
+    0xd8c0, 0xdc00,
+    0xdbff, 0xdfff,
+    0
+};
+
+/* Utility for comparing plUniChar buffers */
+template <typename _Ch>
+static int T_strcmp(const _Ch *left, const _Ch *right)
+{
+    for ( ;; ) {
+        if (*left != *right)
+            return *left - *right;
+        if (*left == 0)
+            return (*right == 0) ? 0 : -1;
+        if (*right == 0)
+            return 1;
+
+        ++left;
+        ++right;
+    }
+}
+
+TEST(PlStringTest, TestHelpers)
+{
+    /* Ensure the utilities for testing the module function properly */
+    EXPECT_EQ(0, T_strcmp("abc", "abc"));
+    EXPECT_LT(0, T_strcmp("abc", "aba"));
+    EXPECT_GT(0, T_strcmp("abc", "abe"));
+    EXPECT_LT(0, T_strcmp("abc", "ab"));
+    EXPECT_GT(0, T_strcmp("abc", "abcd"));
+}
+
+TEST(PlStringTest, ConvertUtf8)
+{
+    // From UTF-8 to plString
+    plString from_utf8 = plString::FromUtf8(utf8_test_data);
+    EXPECT_STREQ(utf8_test_data, from_utf8.c_str());
+    plUnicodeBuffer unicode = from_utf8.GetUnicodeArray();
+    EXPECT_EQ(0, T_strcmp(test_data, unicode.GetData()));
+
+    // From plString to UTF-8
+    plString to_utf8 = plString::FromUtf32(test_data);
+    EXPECT_STREQ(utf8_test_data, to_utf8.c_str());
+}
 
-TEST(PlStringTest,ToUtf16)
+TEST(PlStringTest, ConvertUtf16)
 {
-    uint16_t text[] = {0x0061,0x0062,0x0063,0x0064}; //abcd as in utf16
-    plStringBuffer<uint16_t> expected = plStringBuffer<uint16_t>(text,arrsize(text));
-    plStringBuffer<uint16_t> output = plString("abcd").ToUtf16();
-
-    EXPECT_EQ(expected.GetSize(), output.GetSize()); //not really a good test
+    // From UTF-16 to plString
+    plString from_utf16 = plString::FromUtf16(utf16_test_data);
+    plUnicodeBuffer unicode = from_utf16.GetUnicodeArray();
+    EXPECT_EQ(0, T_strcmp(test_data, unicode.GetData()));
+
+    // From plString to UTF-16
+    plStringBuffer<uint16_t> to_utf16 = plString::FromUtf32(test_data).ToUtf16();
+    EXPECT_EQ(0, T_strcmp(utf16_test_data, to_utf16.GetData()));
 }
 
-TEST(PlStringTest,ToWchar)
+TEST(PlStringTest, ConvertIso8859_1)
 {
-    wchar_t text[] =L"abcd\u00E9";
-    plStringBuffer<wchar_t> expected = plStringBuffer<wchar_t>(text,arrsize(text));
-    plStringBuffer<wchar_t> output = plString("abcd\xC3\xA9").ToWchar();
-    EXPECT_STREQ(expected.GetData(),output.GetData());
+    // From ISO-8859-1 to plString
+    const char latin1[] = "\x20\x7e\xa0\xff";
+    const plUniChar unicode_cp0[] = { 0x20, 0x7e, 0xa0, 0xff, 0 };
+    plString from_latin1 = plString::FromIso8859_1(latin1);
+    plUnicodeBuffer unicode = from_latin1.GetUnicodeArray();
+    EXPECT_EQ(0, T_strcmp(unicode_cp0, unicode.GetData()));
+
+    // From plString to ISO-8859-1
+    plStringBuffer<char> to_latin1 = plString::FromUtf32(unicode_cp0).ToIso8859_1();
+    EXPECT_STREQ(latin1, to_latin1.GetData());
 }
 
-TEST(PlStringTest,ToIso8859_1)
+TEST(PlStringTest, ConvertWchar)
 {
-    char text[] ="abcde";
-    plStringBuffer<char> expected = plStringBuffer<char>(text,arrsize(text));
-    plStringBuffer<char> output = plString("abcde").ToIso8859_1();
-    EXPECT_STREQ(expected.GetData(),output.GetData());
+    // UTF-8 and UTF-16 are already tested, so just make sure we test
+    // wchar_t and L"" conversions
+
+    const wchar_t wtext[] = L"\x20\x7f\xff\u0100\uffff";
+    const plUniChar unicode_text[] = { 0x20, 0x7f, 0xff, 0x100, 0xffff, 0 };
+    plString from_wchar = plString::FromWchar(wtext);
+    plUnicodeBuffer unicode = from_wchar.GetUnicodeArray();
+    EXPECT_EQ(0, T_strcmp(unicode_text, unicode.GetData()));
+
+    // From plString to wchar_t
+    plStringBuffer<wchar_t> to_wchar = plString::FromUtf32(unicode_text).ToWchar();
+    EXPECT_STREQ(wtext, to_wchar.GetData());
 }
 
 TEST(PlStringTest,FindChar)