hi, here is my code:
// Reproduction: request three consecutive two-character slices of an ASCII string.
std::string test = "abcdefghij";
CCLOG("test1 = %s", Helper::getSubStringOfUTF8String(test, 0, 2).c_str());
CCLOG("test2 = %s", Helper::getSubStringOfUTF8String(test, 2, 2).c_str());
CCLOG("test3 = %s", Helper::getSubStringOfUTF8String(test, 4, 2).c_str());
and in the output window I get the following:
test1 = ab
test2 = cdef
test3 = efghij
but I want:
test1 = ab
test2 = cd
test3 = ef
I can't understand what I am doing wrong. Any ideas, please?
thanks
Hello.
I have reproduced it on my local machine. It seems to be a bug that should be fixed.
The following code works just fine instead:
/**
 * Returns a substring of a UTF-8 string, with the position and length
 * expressed in characters instead of bytes.
 *
 * The input is converted to UTF-32 (one element per code point), sliced with
 * std::u32string::substr, and converted back to UTF-8.
 *
 * @param str    Source text, handed to StringUtils::UTF8ToUTF32.
 * @param start  Index of the first element to copy from the UTF-32 form.
 * @param length Maximum number of UTF-32 elements to copy; substr() clamps it
 *               to the end of the string.
 * @return The selected characters as UTF-8, or an empty string when either
 *         conversion fails or 'start' is greater than the character count.
 */
std::string Helper::getSubStringOfUTF8String(const std::string& str, std::string::size_type start, std::string::size_type length)
{
    std::u32string utf32;
    if (!StringUtils::UTF8ToUTF32(str, utf32)) {
        CCLOGERROR("Can't convert string to UTF-32: %s", str.c_str());
        return "";
    }

    // start == size() is accepted (empty result); anything larger would make substr() throw.
    if (utf32.size() < start) {
        // size_type is not 'unsigned long' on every platform, so cast to match the %lu specifier.
        CCLOGERROR("'start' is out of range: %lu, %s", static_cast<unsigned long>(start), str.c_str());
        return "";
    }

    std::string result;
    if (!StringUtils::UTF32ToUTF8(utf32.substr(start, length), result)) {
        CCLOGERROR("Can't convert internal UTF-32 string to UTF-8: %s", str.c_str());
        return "";
    }
    return result;
}
An alternative way of solving the problem, using LLVM's ConvertUTF.h without converting to UTF-32:
#include "external/ConvertUTF/ConvertUTF.h"
/**
 * Returns a substring of a UTF-8 string, with the position and length
 * expressed in characters instead of bytes, without building a UTF-32 copy.
 *
 * The string is scanned one encoded character at a time to find the byte
 * offsets of character 'start' and character 'start + length'; the bytes
 * between those offsets are returned.
 *
 * @param str    Source text; each character is checked with isLegalUTF8Sequence.
 * @param start  Index of the first character to copy.
 * @param length Maximum number of characters to copy. Values that would run
 *               past the end of the string (including std::string::npos) are
 *               clamped to the end.
 * @return The selected bytes of 'str', or an empty string when an invalid
 *         sequence is found or 'start' is greater than the character count.
 */
std::string Helper::getSubStringOfUTF8String(const std::string& str, std::string::size_type start, std::string::size_type length)
{
    // A string cannot contain more characters than bytes, so this rejects hopeless requests early.
    if (start > str.size()) {
        // size_type is not 'unsigned long' on every platform, so cast to match the %lu specifier.
        CCLOGERROR("'start' is out of range: %lu, %s", static_cast<unsigned long>(start), str.c_str());
        return "";
    }

    // Saturate instead of letting 'start + length' wrap around: a wrapped value would
    // stop the scan before 'start' is reached and report a bogus out-of-range error.
    const std::string::size_type end =
        (length > std::string::npos - start) ? std::string::npos : start + length;

    std::string::size_type min = std::string::npos; // byte offset of character 'start'
    std::string::size_type max = std::string::npos; // byte offset of character 'end'
    std::string::size_type utf8CharacterCount = 0;
    std::string::size_type index = 0;
    const auto sourceEnd = reinterpret_cast<const UTF8*>(str.c_str() + str.size());

    while (index < str.size()) {
        const auto source = reinterpret_cast<const UTF8*>(str.c_str() + index);
        if (!isLegalUTF8Sequence(source, sourceEnd)) {
            CCLOGERROR("Invalid UTF-8 byte sequence: %s", str.c_str());
            return "";
        }
        if (utf8CharacterCount == start) {
            min = index;
        }
        if (utf8CharacterCount == end) {
            max = index;
            break;
        }
        // Advance by the encoded width of the current character.
        const auto bytes = getNumBytesForUTF8(static_cast<UTF8>(str[index]));
        CC_ASSERT((index + bytes) <= str.size());
        index += bytes;
        utf8CharacterCount++;
    }

    if (min == std::string::npos) {
        // The scan ended before reaching 'start'.
        if (utf8CharacterCount < start) {
            CCLOGERROR("'start' is out of range: %lu, %s", static_cast<unsigned long>(start), str.c_str());
            return "";
        }
        // 'start' equals the character count: the result is the empty tail.
        CC_ASSERT(index == str.size());
        min = index;
    }
    if (max == std::string::npos) {
        // The requested range runs past the last character; clamp to the end of the string.
        CC_ASSERT(index == str.size());
        max = index;
    }

    CC_ASSERT(min <= max);
    CC_ASSERT(min <= str.size());
    CC_ASSERT(max <= str.size());
    return str.substr(min, max - min);
}
I will try to fix it and submit it upstream when I have time. Thanks.
I’ve just submitted a new PR against this issue:
great news, thank you