CWE-ou-minkata/Sources/Plasma/CoreLib/plString.h

/*==LICENSE==*

CyanWorlds.com Engine - MMOG client, server and tools
Copyright (C) 2011  Cyan Worlds, Inc.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Additional permissions under GNU GPL version 3 section 7

If you modify this Program, or any covered work, by linking or
combining it with any of RAD Game Tools Bink SDK, Autodesk 3ds Max SDK,
NVIDIA PhysX SDK, Microsoft DirectX SDK, OpenSSL library, Independent
JPEG Group JPEG library, Microsoft Windows Media SDK, or Apple QuickTime SDK
(or a modified version of those libraries),
containing parts covered by the terms of the Bink SDK EULA, 3ds Max EULA,
PhysX SDK EULA, DirectX SDK EULA, OpenSSL and SSLeay licenses, IJG
JPEG Library README, Windows Media SDK EULA, or QuickTime SDK EULA, the
licensors of this Program grant you additional
permission to convey the resulting work. Corresponding Source for a
non-source form of such a combination shall include the source code for
the parts of OpenSSL and IJG JPEG Library used as well as that of the covered
work.

You can contact Cyan Worlds, Inc. by email legal@cyan.com
 or by snail mail at:
      Cyan Worlds, Inc.
      14617 N Newport Hwy
      Mead, WA   99021

*==LICENSE==*/

#ifndef plString_Defined
#define plString_Defined

#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <cstddef>
#include <vector>

/** Integer type holding one code unit of expanded (UTF-32) Unicode text. */
using UniChar = unsigned int;

/* Buffers holding fewer than SSO_CHARS characters are stored inline inside
 * plStringBuffer; anything longer lives in a shared, ref-counted heap block. */
#define SSO_CHARS (16)
/* Size in bytes of the inline storage embedded in a plStringStream. */
#define STRING_STACK_SIZE (256)
/* Default character set used by the plString trim/tokenize helpers. */
#define WHITESPACE_CHARS " \t\n\r"

/** Ref-counted string data buffer.
 *  This is used to store actual string data in any (unchecked) encoding format,
 *  including both the internal UTF-8 data of plString itself as well as the
 *  temporaries returned in the conversion operators.
 *
 *  Storage is chosen purely from the character count: buffers with fewer
 *  than SSO_CHARS characters live in the inline \c fShort array, all others
 *  are held through a shared StringRef whose reference count is adjusted by
 *  the copy constructor, assignment operator and destructor.
 *  \sa plString
 */
template <typename _Ch>
class plStringBuffer
{
private:
    /** Shared heap block: owns \c fStringData and frees both the data and
     *  itself once the last reference is dropped.
     */
    struct StringRef
    {
        unsigned int fRefs;
        const _Ch *fStringData;

        StringRef(const _Ch *data)
            : fRefs(1), fStringData(data) { }

        inline void AddRef() { ++fRefs; }
        inline void DecRef()
        {
            if (--fRefs == 0) {
                delete [] fStringData;
                delete this;
            }
        }
    };

    // Only one member is meaningful at a time; IHaveACow() tells which.
    union {
        StringRef *fData;
        _Ch fShort[SSO_CHARS];
    };
    size_t fSize;

    /** True when the data lives in the shared heap block (\c fData) rather
     *  than in the inline \c fShort array.
     */
    bool IHaveACow() const { return fSize >= SSO_CHARS; }

public:
    /** Construct an empty string buffer. */
    plStringBuffer() : fSize(0) { memset(fShort, 0, sizeof(fShort)); }

    /** Copy constructor - adds a reference to the copied buffer */
    plStringBuffer(const plStringBuffer<_Ch> &copy) : fSize(copy.fSize)
    {
        // Copies either the inline characters or the StringRef pointer,
        // whichever the union currently holds.
        memcpy(fShort, copy.fShort, sizeof(fShort));
        if (IHaveACow())
            fData->AddRef();
    }

    /** Construct a string buffer which holds a COPY of the \a data, up to
     *  \a size characters.  The terminating '\0' is added automatically,
     *  meaning this constructor is safe to use on buffers which are not
     *  already null-terminated.
     *  \param data  Source characters; may be null only when \a size is 0.
     *  \param size  Number of characters (NOT bytes) to copy from \a data.
     */
    plStringBuffer(const _Ch *data, size_t size) : fSize(size)
    {
        memset(fShort, 0, sizeof(fShort));
        _Ch *copyData = IHaveACow() ? new _Ch[size + 1] : fShort;

        // memcpy counts bytes, so scale by the character width; otherwise
        // only part of a UTF-16/UTF-32/wchar_t buffer would be copied.
        if (size != 0)
            memcpy(copyData, data, size * sizeof(_Ch));
        copyData[size] = 0;

        if (IHaveACow())
            fData = new StringRef(copyData);
    }

    /** Destructor.  The ref-counted data will only be freed if no other
     *  string buffers still reference it.
     */
    ~plStringBuffer()
    {
        if (IHaveACow())
            fData->DecRef();
    }

    /** Assignment operator.  Changes the reference to point to the
     *  copied buffer in \a copy.
     */
    plStringBuffer<_Ch> &operator=(const plStringBuffer<_Ch> &copy)
    {
        // Take the new reference before dropping the old one so that
        // self-assignment cannot free the block that is about to be kept.
        if (copy.IHaveACow())
            copy.fData->AddRef();
        if (IHaveACow())
            fData->DecRef();

        memcpy(fShort, copy.fShort, sizeof(fShort));
        fSize = copy.fSize;
        return *this;
    }

    /** Returns a pointer to the referenced string buffer. */
    const _Ch *GetData() const { return IHaveACow() ? fData->fStringData : fShort; }

    /** Returns the number of characters (not including the '\0') in the
     *  referenced string buffer.
     */
    size_t GetSize() const { return fSize; }

    /** Cast operator.  This is a shortcut for not needing to call GetData on
     *  buffer objects passed to methods or objects expecting a C-style string.
     */
    operator const _Ch *() const { return GetData(); }

    /** Create a writable buffer for \a size characters.
     *  From Haxxia with love!  This will release the current string buffer
     *  reference and then create a new buffer with space for \a size
     *  characters, plus one extra for the terminating '\0'.  The newly
     *  allocated buffer is returned as a non-const pointer, so it can be
     *  written to without having to use a \c const_cast.
     *  \warning The caller is expected to null-terminate the returned buffer.
     *           Not doing so may cause problems for functions and objects
     *           expecting a null-terminated C-style string.
     */
    _Ch *CreateWritableBuffer(size_t size)
    {
        if (IHaveACow())
            fData->DecRef();

        // IHaveACow() is re-evaluated against the NEW size from here on.
        fSize = size;
        if (IHaveACow()) {
            _Ch *writable = new _Ch[fSize + 1];
            fData = new StringRef(writable);
            return writable;
        } else {
            return fShort;
        }
    }
};

/** String buffer specialization whose elements are UniChar values, i.e.
 *  fully-expanded Unicode data.
 */
using plUnicodeBuffer = plStringBuffer<UniChar>;

/** Unicode-capable and (mostly) binary safe string class.
 *  plString stores SSO-optimized or reference counted strings (automatically
 *  determined based on string length) for easy and performant string data
 *  storage and manipulation.  The internal format of plString is UTF-8,
 *  meaning it keeps all Unicode information from conversions.  plStrings
 *  are safe to share without making explicit copies, since plStrings
 *  follow the strings-are-immutable philosophy.  Anything which mutates
 *  a plString object will do so in a new string buffer, allowing other
 *  string objects to retain the old data without getting unexpected changes.
 */
class plString
{
public:
    enum {
        /** Automatically determine the size of input (Requires input to be
         *  correctly null-terminated).
         */
        kSizeAuto = (size_t)(0x80000000)
    };

    /** Represents a "null" plString object. */
    static const plString Null;

private:
    plStringBuffer<char> fUtf8Buffer;

    void IConvertFromUtf8(const char *utf8, size_t size);
    void IConvertFromUtf16(const uint16_t *utf16, size_t size);
    void IConvertFromWchar(const wchar_t *wstr, size_t size);
    void IConvertFromUtf32(const UniChar *ustr, size_t size);
    void IConvertFromIso8859_1(const char *astr, size_t size);

    // Constructing and comparing with nil or nullptr won't break plString,
    // but it's preferred not to do so with the constants.  That is to say,
    // you can construct with a const char * which points to null, but
    // don't actually write `plString foo = nil;`
    // These overloads are private so that such uses fail to compile.
    plString(std::nullptr_t) { }
    void operator=(std::nullptr_t) { }
    void operator==(std::nullptr_t) const { }
    void operator!=(std::nullptr_t) const { }

public:
    /** Construct a valid, empty string. */
    plString() { }

    /** Construct a string from a C-style string.
     *  \note This constructor expects the input to be UTF-8 encoded.  For
     *        conversion from ISO-8859-1 8-bit data, use FromIso8859_1().
     */
    plString(const char *cstr, size_t size = kSizeAuto) { IConvertFromUtf8(cstr, size); }

    /** Copy constructor. */
    plString(const plString &copy) : fUtf8Buffer(copy.fUtf8Buffer) { }

    /** Copy constructor from plStringBuffer<char>.
     *  \note This constructor expects the input to be UTF-8 encoded.  For
     *        conversion from ISO-8859-1 8-bit data, use FromIso8859_1().
     */
    plString(const plStringBuffer<char> &init) { operator=(init); }

    /** Construct a string from expanded Unicode data. */
    plString(const plUnicodeBuffer &init) { IConvertFromUtf32(init.GetData(), init.GetSize()); }

    /** Assignment operator.  Same as plString(const char *). */
    plString &operator=(const char *cstr) { IConvertFromUtf8(cstr, kSizeAuto); return *this; }

    /** Assignment operator.  Same as plString(const plString &). */
    plString &operator=(const plString &copy) { fUtf8Buffer = copy.fUtf8Buffer; return *this; }

    /** Assignment operator.  Same as plString(const plStringBuffer<char> &). */
    plString &operator=(const plStringBuffer<char> &init);

    /** Assignment operator.  Same as plString(const plUnicodeBuffer &). */
    plString &operator=(const plUnicodeBuffer &init) { IConvertFromUtf32(init.GetData(), init.GetSize()); return *this; }

    /** Append UTF-8 data from a C-style string pointer to the end of this
     *  string object.
     *  \sa plStringStream
     */
    plString &operator+=(const char *cstr) { return operator=(*this + cstr); }

    /** Append the string \a str to the end of this string object.
     *  \sa plStringStream
     */
    plString &operator+=(const plString &str) { return operator=(*this + str); }

    /** Create a new plString object from the UTF-8 formatted data in \a utf8. */
    static inline plString FromUtf8(const char *utf8, size_t size = kSizeAuto)
    {
        plString str;
        str.IConvertFromUtf8(utf8, size);
        return str;
    }

    /** Create a new plString object from the UTF-16 formatted data in \a utf16. */
    static inline plString FromUtf16(const uint16_t *utf16, size_t size = kSizeAuto)
    {
        plString str;
        str.IConvertFromUtf16(utf16, size);
        return str;
    }

    /** Create a new plString object from the \p wchar_t data in \a wstr. */
    static inline plString FromWchar(const wchar_t *wstr, size_t size = kSizeAuto)
    {
        plString str;
        str.IConvertFromWchar(wstr, size);
        return str;
    }

    /** Create a new plString object from the UTF-32 formatted data in \a utf32. */
    static inline plString FromUtf32(const UniChar *utf32, size_t size = kSizeAuto)
    {
        plString str;
        str.IConvertFromUtf32(utf32, size);
        return str;
    }

    /** Create a new plString object from the ISO-8859-1 formatted data in \a astr. */
    static inline plString FromIso8859_1(const char *astr, size_t size = kSizeAuto)
    {
        plString str;
        str.IConvertFromIso8859_1(astr, size);
        return str;
    }

    /** Return the internal UTF-8 data pointer for use in functions and objects
     *  expecting C-style string pointers.  If this string is empty, returns
     *  \a substitute instead.
     */
    const char *c_str(const char *substitute = "") const
    { return IsEmpty() ? substitute : fUtf8Buffer.GetData(); }

    /** Return the byte at position \a position.  Note that this may be in
     *  the middle of a UTF-8 sequence -- if you want an actual Unicode
     *  character, use the buffer returned from GetUnicodeArray() instead.
     *  \note No bounds checking is performed on \a position.
     */
    char CharAt(size_t position) const { return c_str()[position]; }

    /** Returns the internal UTF-8 data buffer object. */
    plStringBuffer<char> ToUtf8() const { return fUtf8Buffer; }

    /** Convert this string's data to a UTF-16 string buffer. */
    plStringBuffer<uint16_t> ToUtf16() const;

    /** Convert this string's data to a wchar_t string buffer.
     *  \note Depending on your platform and compiler configuration, this
     *        will either return UTF-16 or UTF-32 data -- it will never
     *        return a non-unicode data buffer.
     */
    plStringBuffer<wchar_t> ToWchar() const;

    /** Convert this string's data as closely as possible to ISO-8859-1.
     *  Unicode characters outside of the ISO-8859-1 range will be stored
     *  in the buffer as a question mark ('?').
     */
    plStringBuffer<char> ToIso8859_1() const;

    /** Convert the string's data to a fully expanded UTF-32 buffer.  This
     *  makes it easy to operate on actual Unicode characters instead of
     *  UTF-8 bytes (e.g. for use in rendering characters to a display).
     */
    plUnicodeBuffer GetUnicodeArray() const;

    /** Returns the size in number of bytes (excluding the null-terminator) of
     *  this string.
     */
    size_t GetSize() const { return fUtf8Buffer.GetSize(); }

    /** Returns \c true if this string is empty (""). */
    bool IsEmpty() const { return fUtf8Buffer.GetSize() == 0; }

    /** Returns \c true if this string is "null".  Currently, this is just
     *  a synonym for IsEmpty(), as plString makes no distinction between
     *  null and empty strings.
     *  \todo Evaluate whether Plasma actually needs to distinguish between
     *        empty and NULL strings.  Ideally, only IsEmpty should be required.
     */
    bool IsNull() const { return IsEmpty(); }

    /** Convert the string data to an integer in base \a base.
     *  If base is set to 0, this function behaves like strtol, which checks
     *  for hex or octal prefixes (e.g. 0777 or 0x1234), and assumes base 10
     *  if none are found.
     */
    int ToInt(int base = 0) const;

    /** Convert the string to an unsigned integer in base \a base.
     *  If base is set to 0, this function behaves like strtoul, which checks
     *  for hex or octal prefixes (e.g. 0777 or 0x1234), and assumes base 10
     *  if none are found.
     */
    unsigned int ToUInt(int base = 0) const;

    /** Convert the string to a floating point value. */
    float ToFloat() const;

    /** Convert the string to a double precision floating point value. */
    double ToDouble() const;

    /** Construct a plString using a printf-like format string. */
    static plString Format(const char *fmt, ...);

    /** Construct a plString using a printf-like format string.
     *  This function should be called inside of vararg functions, such as
     *  plString::Format().
     */
    static plString IFormat(const char *fmt, va_list vptr);

    enum CaseSensitivity {
        kCaseSensitive, kCaseInsensitive
    };

    /** Compare this string with \a str.
     *  \return an integer which indicates:
     *    \li \p =0 - the strings are equal
     *    \li \p \<0 - this string is lexicographically less than \a str
     *    \li \p \>0 - this string is lexicographically greater than \a str
     */
    int Compare(const plString &str, CaseSensitivity sense = kCaseSensitive) const
    {
        return (sense == kCaseSensitive) ? strcmp(c_str(), str.c_str())
                                         : stricmp(c_str(), str.c_str());
    }

    /** Compare this string with \a str.  A null \a str is treated as "".
     *  \return an integer which indicates:
     *    \li \p =0 - the strings are equal
     *    \li \p \<0 - this string is lexicographically less than \a str
     *    \li \p \>0 - this string is lexicographically greater than \a str
     */
    int Compare(const char *str, CaseSensitivity sense = kCaseSensitive) const
    {
        return (sense == kCaseSensitive) ? strcmp(c_str(), str ? str : "")
                                         : stricmp(c_str(), str ? str : "");
    }

    /** Compare up to but never exceeding the first \a count bytes of this
     *  string with \a str.
     *  \sa Compare(const plString &, CaseSensitivity) const
     */
    int CompareN(const plString &str, size_t count, CaseSensitivity sense = kCaseSensitive) const
    {
        return (sense == kCaseSensitive) ? strncmp(c_str(), str.c_str(), count)
                                         : strnicmp(c_str(), str.c_str(), count);
    }

    /** Compare up to but never exceeding the first \a count bytes of this
     *  string with \a str.  A null \a str is treated as "".
     *  \sa Compare(const char *, CaseSensitivity) const
     */
    int CompareN(const char *str, size_t count, CaseSensitivity sense = kCaseSensitive) const
    {
        return (sense == kCaseSensitive) ? strncmp(c_str(), str ? str : "", count)
                                         : strnicmp(c_str(), str ? str : "", count);
    }

    /** Shortcut for Compare(str, kCaseInsensitive). */
    int CompareI(const plString &str) const { return Compare(str, kCaseInsensitive); }

    /** Shortcut for Compare(str, kCaseInsensitive). */
    int CompareI(const char *str) const { return Compare(str, kCaseInsensitive); }

    /** Shortcut for CompareN(str, count, kCaseInsensitive). */
    int CompareNI(const plString &str, size_t count) const { return CompareN(str, count, kCaseInsensitive); }

    /** Shortcut for CompareN(str, count, kCaseInsensitive). */
    int CompareNI(const char *str, size_t count) const { return CompareN(str, count, kCaseInsensitive); }

    /** Operator overload for use in containers which depend on \c std::less. */
    bool operator<(const plString &other) const { return Compare(other) < 0; }

    /** Test if this string contains the same string data as \a other. */
    bool operator==(const char *other) const { return Compare(other) == 0; }

    /** Test if this string contains the same string data as \a other. */
    bool operator==(const plString &other) const { return Compare(other) == 0; }

    /** Inverse of operator==(const char *) const. */
    bool operator!=(const char *other) const { return Compare(other) != 0; }

    /** Inverse of operator==(const plString &) const. */
    bool operator!=(const plString &other) const { return Compare(other) != 0; }

    /** Find the index of the first instance of \a ch in this string.
     *  \return -1 if the character was not found.
     */
    int Find(char ch, CaseSensitivity sense = kCaseSensitive) const;

    /** Find the index of the last instance of \a ch in this string.
     *  \return -1 if the character was not found.
     */
    int FindLast(char ch, CaseSensitivity sense = kCaseSensitive) const;

    /** Find the index of the first instance of \a str in this string.
     *  \return -1 if the substring was not found.
     */
    int Find(const char *str, CaseSensitivity sense = kCaseSensitive) const;

    /** Find the index of the first instance of \a str in this string.
     *  \return -1 if the substring was not found.
     */
    int Find(const plString &str, CaseSensitivity sense = kCaseSensitive) const
    { return Find(str.c_str(), sense); }

    /** Check that this string matches the specified regular expression.
     *  This will only return true if the whole string can be matched
     *  by \a pattern.
     */
    bool REMatch(const char *pattern, CaseSensitivity sense = kCaseSensitive) const;

    /** Search for substrings which match the specified regular expression.
     *  If capture groups are specified in the pattern, they will be
     *  returned as additional strings in the returned vector, starting at
     *  index 1 (index 0 contains the whole match).  If the pattern was not
     *  found, this returns an empty vector.
     */
    std::vector<plString> RESearch(const char *pattern, CaseSensitivity sense = kCaseSensitive) const;

    /** Trim any characters in the supplied \a charset from the left of
     *  this string.
     */
    plString TrimLeft(const char *charset = WHITESPACE_CHARS) const;

    /** Trim any characters in the supplied \a charset from the right of
     *  this string.
     */
    plString TrimRight(const char *charset = WHITESPACE_CHARS) const;

    /** Trim any characters in the supplied \a charset from both ends of
     *  this string.  Logically equivalent to (but more efficient than)
     *  str.TrimLeft(charset).TrimRight(charset)
     */
    plString Trim(const char *charset = WHITESPACE_CHARS) const;

    /** Return a substring starting at index \a start, with up to \a size
     *  characters from the start position.  If \a size is greater than the
     *  number of characters left in the string after \a start, Substr will
     *  return the remainder of the string.
     */
    plString Substr(int start, size_t size = kSizeAuto) const;

    /** Return a substring containing at most \a size characters from the left
     *  of the string.  Equivalent to Substr(0, size).
     */
    plString Left(size_t size) const { return Substr(0, size); }

    /** Return a substring containing at most \a size characters from the right
     *  of the string.  Equivalent to Substr(GetSize() - size, size) when
     *  \a size is smaller than GetSize(); otherwise the whole string is
     *  returned.
     */
    plString Right(size_t size) const
    {
        // GetSize() - size is unsigned arithmetic: guard against wrap-around
        // (and the subsequent narrowing to int) when asking for more
        // characters than the string holds.
        const size_t total = GetSize();
        if (size >= total)
            return *this;
        return Substr(static_cast<int>(total - size), size);
    }

    /** Return a copy of this string with all occurrences of \a from replaced
     *  with \a to. */
    plString Replace(const char *from, const char *to) const;

    /** Return a copy of this string with all Latin-1 alphabetic characters
     *  converted to upper case.
     *  \sa CompareI()
     */
    plString ToUpper() const;

    /** Return a copy of this string with all Latin-1 alphabetic characters
     *  converted to lower case.
     *  \sa CompareI()
     */
    plString ToLower() const;

    /** Split this string into pieces separated by the substring \a split.
     *  This will return the complete contents of everything between split
     *  markers, meaning that two subsequent markers will produce an empty
     *  string in the returned vector.
     *  \sa Tokenize()
     */
    std::vector<plString> Split(const char *split, size_t maxSplits = kSizeAuto) const;

    /** Split this string into tokens, delimited by \a delims.
     *  Note that, unlike Split(), Tokenize will return only non-blank strings
     *  after stripping out all delimiters between tokens.
     *  \sa Split()
     */
    std::vector<plString> Tokenize(const char *delims = WHITESPACE_CHARS) const;

    /** Create a string initialized with \a count copies of the character \a c. */
    static plString Fill(size_t count, char c);

public:
    /** Functor which compares two strings case-insensitively for sorting. */
    struct less_i
    {
        bool operator()(const plString &_L, const plString &_R) const
        { return _L.Compare(_R, kCaseInsensitive) < 0; }
    };

    /** Functor which compares two strings case-insensitively for equality. */
    struct equal_i
    {
        bool operator()(const plString &_L, const plString &_R) const
        { return _L.Compare(_R, kCaseInsensitive) == 0; }
    };

private:
    friend plString operator+(const plString &left, const plString &right);
    friend plString operator+(const plString &left, const char *right);
    friend plString operator+(const char *left, const plString &right);
};

/** Concatenation operator for plStrings.
 *  Both operands are taken by const reference and the result is returned
 *  by value as a new plString.
 */
plString operator+(const plString &left, const plString &right);

/** Concatenation operator for plStrings and UTF-8 C-style string data.
 *  \a right is appended after \a left in the returned string.
 */
plString operator+(const plString &left, const char *right);

/** Concatenation operator for plStrings and UTF-8 C-style string data.
 *  \a left is placed before \a right in the returned string.
 */
plString operator+(const char *left, const plString &right);


/** Helper class for writing frequent data to a text buffer efficiently.
 *  This should be used instead of plString::operator+=() for constructing
 *  string data in pieces, as it keeps a running buffer instead of allocating
 *  new storage for each append result.
 *
 *  The stream starts out using STRING_STACK_SIZE bytes of storage embedded
 *  in the object itself; a buffer size larger than that marks the data as
 *  living in a heap array owned (and eventually freed) by the stream.
 */
class plStringStream
{
public:
    /** Construct a new empty string stream.  The first STRING_STACK_SIZE
     *  bytes are stored inside the stream object itself for further
     *  efficiency.
     */
    plStringStream() : fBufSize(STRING_STACK_SIZE), fLength(0) { }

    /** Copy constructor.  Makes a deep copy of \a copy's data so that the
     *  two streams never share (and later double-free) a heap buffer.
     */
    plStringStream(const plStringStream &copy)
        : fBufSize(copy.fBufSize), fLength(copy.fLength)
    {
        char *dest = fShort;
        if (ICanHasHeap()) {
            fBuffer = new char[fBufSize];
            dest = fBuffer;
        }
        memcpy(dest, copy.GetRawBuffer(), fLength);
    }

    /** Assignment operator.  Replaces this stream's contents with a deep
     *  copy of \a copy's data, releasing any heap buffer previously owned.
     */
    plStringStream &operator=(const plStringStream &copy)
    {
        if (this != &copy) {
            // Allocate before releasing so a failed allocation leaves this
            // stream untouched.
            char *heap = copy.ICanHasHeap() ? new char[copy.fBufSize] : nullptr;
            if (ICanHasHeap())
                delete [] fBuffer;

            fBufSize = copy.fBufSize;
            fLength = copy.fLength;
            if (heap)
                fBuffer = heap;
            memcpy(heap ? heap : fShort, copy.GetRawBuffer(), fLength);
        }
        return *this;
    }

    /** Destructor, frees any allocated heap memory owned by the stream. */
    ~plStringStream() { if (ICanHasHeap()) delete [] fBuffer; }

    /** Append string data to the end of the stream. */
    plStringStream &append(const char *data, size_t length);

    /** Append UTF-8 C-style string data to the stream. */
    plStringStream &operator<<(const char *text);

    /** Append a base-10 formatted signed integer to the stream. */
    plStringStream &operator<<(int num);

    /** Append a base-10 formatted unsigned integer to the stream. */
    plStringStream &operator<<(unsigned int num);

    /** Append a base-10 formatted float to the stream. */
    plStringStream &operator<<(float num) { return operator<<(static_cast<double>(num)); }

    /** Append a base-10 formatted double to the stream. */
    plStringStream &operator<<(double num);

    /** Append a single Latin-1 character to the stream. */
    plStringStream &operator<<(char ch) { return append(&ch, 1); }

    /** Append the contents of \a text to the stream. */
    plStringStream &operator<<(const plString &text)
    {
        return append(text.c_str(), text.GetSize());
    }

    /** Returns a pointer to the beginning of the stream buffer.
     *  \warning This pointer is not null-terminated.
     */
    const char *GetRawBuffer() const
    {
        return ICanHasHeap() ? fBuffer : fShort;
    }

    /** Return the size (in bytes) of the stream's data. */
    size_t GetLength() const { return fLength; }

    /** Convert the stream's data to a UTF-8 string. */
    plString GetString() const { return plString::FromUtf8(GetRawBuffer(), fLength); }

    /** Reset the stream's append pointer back to the beginning.
     *  This does not incur a reallocation of the buffer -- it is left
     *  with as much space as it had before, making this method more
     *  useful for re-using string streams in loops.
     */
    void Truncate() { fLength = 0; }

private:
    // Only one member is meaningful at a time; ICanHasHeap() tells which.
    union {
        char *fBuffer;
        char fShort[STRING_STACK_SIZE];
    };
    size_t fBufSize, fLength;

    /** True when the data lives in the heap array \c fBuffer rather than in
     *  the embedded \c fShort storage.
     */
    bool ICanHasHeap() const { return fBufSize > STRING_STACK_SIZE; }
};

/** \p strlen implementation for UniChar based C-style string buffers.
 *  \param ustr  Buffer of UniChar values to measure.
 *  \param max   Defaults to plString::kSizeAuto.  NOTE(review): presumably an
 *               upper bound on the number of characters examined -- confirm
 *               against the definition.
 */
size_t ustrlen(const UniChar *ustr, size_t max = plString::kSizeAuto);

#endif //plString_Defined