Browse Source

More fully test encoding conversions. Also fixes some existing bugs:

- UTF-16 surrogate pairs were getting encoded incorrectly
- Signed shift converting from ISO-8859-1 got too many bits
- Edge case incorrect for converting to ISO-8859-1
Michael Hansen 10 years ago
parent
commit
d8e6d79fbd
  1. 9
      Sources/Plasma/CoreLib/plString.cpp
  2. 119
      Sources/Tests/CoreTests/test_plString.cpp

9
Sources/Plasma/CoreLib/plString.cpp

@ -299,9 +299,9 @@ void plString::IConvertFromIso8859_1(const char *astr, size_t size)
char *dp = utf8; char *dp = utf8;
sp = astr; sp = astr;
while (sp < astr + size) { while (sp < astr + size) {
if (*astr & 0x80) { if (*sp & 0x80) {
*dp++ = 0xC0 | ((*sp >> 6) & 0x1F); *dp++ = 0xC0 | ((uint8_t(*sp) >> 6) & 0x1F);
*dp++ = 0x80 | ((*sp ) & 0x3F); *dp++ = 0x80 | ((uint8_t(*sp) ) & 0x3F);
} else { } else {
*dp++ = *sp; *dp++ = *sp;
} }
@ -347,6 +347,7 @@ plStringBuffer<uint16_t> plString::ToUtf16() const
unichar |= (*sp++ & 0x3F) << 12; unichar |= (*sp++ & 0x3F) << 12;
unichar |= (*sp++ & 0x3F) << 6; unichar |= (*sp++ & 0x3F) << 6;
unichar |= (*sp++ & 0x3F); unichar |= (*sp++ & 0x3F);
unichar -= 0x10000;
*dp++ = 0xD800 | ((unichar >> 10) & 0x3FF); *dp++ = 0xD800 | ((unichar >> 10) & 0x3FF);
*dp++ = 0xDC00 | ((unichar ) & 0x3FF); *dp++ = 0xDC00 | ((unichar ) & 0x3FF);
@ -424,7 +425,7 @@ plStringBuffer<char> plString::ToIso8859_1() const
} else { } else {
unichar = *sp++; unichar = *sp++;
} }
*dp++ = (unichar < 0xFF) ? unichar : '?'; *dp++ = (unichar < 0x100) ? unichar : '?';
} }
astr[convlen] = 0; astr[convlen] = 0;

119
Sources/Tests/CoreTests/test_plString.cpp

@ -4,30 +4,119 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <wchar.h> #include <wchar.h>
static const plUniChar test_data[] = {
0x20, 0x7f, /* Normal ASCII chars */
0xff, 0x100, /* 8-bit boundary chars */
0x7fff, /* UTF-8 2-byte boundary */
0xffff, 0x10000, /* 16-bit boundary chars */
0x10020, 0x40000, /* Non-edge UTF-16 surrogate pairs */
0x10ffff, /* Highest Unicode character */
0 /* Null terminator */
};
/* UTF-8 version of above test data */
static const char utf8_test_data[] =
"\x20" "\x7f"
"\xc3\xbf" "\xc4\x80"
"\xe7\xbf\xbf"
"\xef\xbf\xbf" "\xf0\x90\x80\x80"
"\xf0\x90\x80\xa0" "\xf1\x80\x80\x80"
"\xf4\x8f\xbf\xbf";
/* UTF-16 version of test data */
static const uint16_t utf16_test_data[] = {
0x20, 0x7f,
0xff, 0x100,
0x7fff,
0xffff,
/* surrogate pairs for chars >0xffff */
0xd800, 0xdc00,
0xd800, 0xdc20,
0xd8c0, 0xdc00,
0xdbff, 0xdfff,
0
};
/* Utility for comparing plUniChar buffers */
template <typename _Ch>
static int T_strcmp(const _Ch *left, const _Ch *right)
{
for ( ;; ) {
if (*left != *right)
return *left - *right;
if (*left == 0)
return (*right == 0) ? 0 : -1;
if (*right == 0)
return 1;
++left;
++right;
}
}
TEST(PlStringTest,ToUtf16) TEST(PlStringTest, TestHelpers)
{ {
uint16_t text[] = {0x0061,0x0062,0x0063,0x0064}; //abcd as in utf16 /* Ensure the utilities for testing the module function properly */
plStringBuffer<uint16_t> expected = plStringBuffer<uint16_t>(text,arrsize(text)); EXPECT_EQ(0, T_strcmp("abc", "abc"));
plStringBuffer<uint16_t> output = plString("abcd").ToUtf16(); EXPECT_LT(0, T_strcmp("abc", "aba"));
EXPECT_GT(0, T_strcmp("abc", "abe"));
EXPECT_LT(0, T_strcmp("abc", "ab"));
EXPECT_GT(0, T_strcmp("abc", "abcd"));
}
EXPECT_EQ(expected.GetSize(), output.GetSize()); //not really a good test TEST(PlStringTest, ConvertUtf8)
{
// From UTF-8 to plString
plString from_utf8 = plString::FromUtf8(utf8_test_data);
EXPECT_STREQ(utf8_test_data, from_utf8.c_str());
plUnicodeBuffer unicode = from_utf8.GetUnicodeArray();
EXPECT_EQ(0, T_strcmp(test_data, unicode.GetData()));
// From plString to UTF-8
plString to_utf8 = plString::FromUtf32(test_data);
EXPECT_STREQ(utf8_test_data, to_utf8.c_str());
}
TEST(PlStringTest, ConvertUtf16)
{
// From UTF-16 to plString
plString from_utf16 = plString::FromUtf16(utf16_test_data);
plUnicodeBuffer unicode = from_utf16.GetUnicodeArray();
EXPECT_EQ(0, T_strcmp(test_data, unicode.GetData()));
// From plString to UTF-16
plStringBuffer<uint16_t> to_utf16 = plString::FromUtf32(test_data).ToUtf16();
EXPECT_EQ(0, T_strcmp(utf16_test_data, to_utf16.GetData()));
} }
TEST(PlStringTest,ToWchar) TEST(PlStringTest, ConvertIso8859_1)
{ {
wchar_t text[] =L"abcd\u00E9"; // From ISO-8859-1 to plString
plStringBuffer<wchar_t> expected = plStringBuffer<wchar_t>(text,arrsize(text)); const char latin1[] = "\x20\x7e\xa0\xff";
plStringBuffer<wchar_t> output = plString("abcd\xC3\xA9").ToWchar(); const plUniChar unicode_cp0[] = { 0x20, 0x7e, 0xa0, 0xff, 0 };
EXPECT_STREQ(expected.GetData(),output.GetData()); plString from_latin1 = plString::FromIso8859_1(latin1);
plUnicodeBuffer unicode = from_latin1.GetUnicodeArray();
EXPECT_EQ(0, T_strcmp(unicode_cp0, unicode.GetData()));
// From plString to ISO-8859-1
plStringBuffer<char> to_latin1 = plString::FromUtf32(unicode_cp0).ToIso8859_1();
EXPECT_STREQ(latin1, to_latin1.GetData());
} }
TEST(PlStringTest,ToIso8859_1) TEST(PlStringTest, ConvertWchar)
{ {
char text[] ="abcde"; // UTF-8 and UTF-16 are already tested, so just make sure we test
plStringBuffer<char> expected = plStringBuffer<char>(text,arrsize(text)); // wchar_t and L"" conversions
plStringBuffer<char> output = plString("abcde").ToIso8859_1();
EXPECT_STREQ(expected.GetData(),output.GetData()); const wchar_t wtext[] = L"\x20\x7f\xff\u0100\uffff";
const plUniChar unicode_text[] = { 0x20, 0x7f, 0xff, 0x100, 0xffff, 0 };
plString from_wchar = plString::FromWchar(wtext);
plUnicodeBuffer unicode = from_wchar.GetUnicodeArray();
EXPECT_EQ(0, T_strcmp(unicode_text, unicode.GetData()));
// From plString to wchar_t
plStringBuffer<wchar_t> to_wchar = plString::FromUtf32(unicode_text).ToWchar();
EXPECT_STREQ(wtext, to_wchar.GetData());
} }
TEST(PlStringTest,FindChar) TEST(PlStringTest,FindChar)

Loading…
Cancel
Save