Cleanup and optimize StringUtils::ConvertUTF82UTF16

This commit is contained in:
Wojtek Figat
2021-03-07 18:09:26 +01:00
parent 862c02da6a
commit 2b42e9256c
2 changed files with 63 additions and 27 deletions

View File

@@ -64,16 +64,25 @@ const char* StringUtils::FindIgnoreCase(const char* str, const char* toFind)
return nullptr; return nullptr;
} }
void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLength, uint32* toLength) void PrintUTF8Error(const char* from, uint32 fromLength)
{ {
Array<unsigned long> unicode; LOG(Error, "Not a UTF-8 string. Length: {0}", fromLength);
uint32 i = 0; for (uint32 i = 0; i < fromLength; i++)
*toLength = 0; {
LOG(Error, "str[{0}] = {0}", i, from[i]);
}
}
void ConvertUTF82UTF16Helper(Array<uint32>& unicode, const char* from, int32 fromLength, int32& toLength)
{
// Reference: https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
unicode.EnsureCapacity(fromLength);
int32 i = 0, todo;
uint32 uni;
toLength = 0;
while (i < fromLength) while (i < fromLength)
{ {
unsigned long uni; byte ch = from[i++];
uint32 todo;
unsigned char ch = from[i++];
if (ch <= 0x7F) if (ch <= 0x7F)
{ {
@@ -82,7 +91,7 @@ void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLengt
} }
else if (ch <= 0xBF) else if (ch <= 0xBF)
{ {
LOG(Error, "Not a UTF-8 string."); PrintUTF8Error(from, fromLength);
return; return;
} }
else if (ch <= 0xDF) else if (ch <= 0xDF)
@@ -102,21 +111,21 @@ void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLengt
} }
else else
{ {
LOG(Error, "Not a UTF-8 string."); PrintUTF8Error(from, fromLength);
return; return;
} }
for (uint32 j = 0; j < todo; j++) for (int32 j = 0; j < todo; j++)
{ {
if (i == fromLength) if (i == fromLength)
{ {
LOG(Error, "Not a UTF-8 string."); PrintUTF8Error(from, fromLength);
return; return;
} }
ch = from[i++]; ch = from[i++];
if (ch < 0x80 || ch > 0xBF) if (ch < 0x80 || ch > 0xBF)
{ {
LOG(Error, "Not a UTF-8 string."); PrintUTF8Error(from, fromLength);
return; return;
} }
@@ -126,28 +135,27 @@ void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLengt
if ((uni >= 0xD800 && uni <= 0xDFFF) || uni > 0x10FFFF) if ((uni >= 0xD800 && uni <= 0xDFFF) || uni > 0x10FFFF)
{ {
LOG(Error, "Not a UTF-8 string."); PrintUTF8Error(from, fromLength);
return; return;
} }
unicode.Add(uni); unicode.Add(uni);
}
// Count chars toLength++;
uint32 length = (uint32)unicode.Count(); if (uni > 0xFFFF)
for (i = 0; i < length; i++)
{
if (unicode[i] > 0xFFFF)
{ {
length++; toLength++;
} }
} }
}
// Copy chars void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, int32 fromLength, int32& toLength)
*toLength = length; {
for (i = 0; i < length; i++) Array<uint32> unicode;
ConvertUTF82UTF16Helper(unicode, from, fromLength, toLength);
for (int32 i = 0; i < toLength; i++)
{ {
unsigned long uni = unicode[i]; uint32 uni = unicode[i];
if (uni <= 0xFFFF) if (uni <= 0xFFFF)
{ {
to[i] = (Char)uni; to[i] = (Char)uni;
@@ -161,6 +169,31 @@ void StringUtils::ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLengt
} }
} }
// Converts a UTF-8 string into a newly allocated, null-terminated UTF-16 string.
// @param from The source UTF-8 characters.
// @param fromLength The amount of bytes to read from the source.
// @param toLength The output length in UTF-16 code units (excluding the null terminator).
// @returns The buffer allocated with Allocator::Allocate holding toLength + 1 characters, or null if nothing was decoded.
// NOTE(review): the helper appears to return early on malformed input without resetting toLength,
// so an already-decoded prefix may still be returned in that case - confirm against the helper.
Char* StringUtils::ConvertUTF82UTF16(const char* from, int32 fromLength, int32& toLength)
{
    // Decode into code points first; the helper also computes the required amount of UTF-16 code units
    Array<uint32> unicode;
    ConvertUTF82UTF16Helper(unicode, from, fromLength, toLength);
    if (toLength == 0)
        return nullptr;

    Char* to = (Char*)Allocator::Allocate((toLength + 1) * sizeof(Char));

    // Use separate read/write cursors: the code points array has one entry per character, while the output
    // gets two code units for every character above the Basic Multilingual Plane. Sharing a single index
    // would skip code points and read past the end of the array after the first surrogate pair.
    const int32 codePointsCount = unicode.Count();
    int32 dst = 0;
    for (int32 src = 0; src < codePointsCount; src++)
    {
        uint32 uni = unicode[src];
        if (uni <= 0xFFFF)
        {
            to[dst++] = (Char)uni;
        }
        else
        {
            // Encode as a surrogate pair; assign (not accumulate) because the buffer is uninitialized
            uni -= 0x10000;
            to[dst++] = (Char)((uni >> 10) + 0xD800);
            to[dst++] = (Char)((uni & 0x3FF) + 0xDC00);
        }
    }
    to[toLength] = 0;
    return to;
}
void RemoveLongPathPrefix(const String& path, String& result) void RemoveLongPathPrefix(const String& path, String& result)
{ {
if (!path.StartsWith(TEXT("\\\\?\\"), StringSearchCase::CaseSensitive)) if (!path.StartsWith(TEXT("\\\\?\\"), StringSearchCase::CaseSensitive))

View File

@@ -184,14 +184,17 @@ public:
public: public:
// Convert characters from ANSI to UTF-16 // Converts characters from ANSI to UTF-16
static void ConvertANSI2UTF16(const char* from, Char* to, int32 len); static void ConvertANSI2UTF16(const char* from, Char* to, int32 len);
// Convert characters from UTF-16 to ANSI // Converts characters from UTF-16 to ANSI
static void ConvertUTF162ANSI(const Char* from, char* to, int32 len); static void ConvertUTF162ANSI(const Char* from, char* to, int32 len);
// Convert characters from UTF-8 to UTF-16 // Convert characters from UTF-8 to UTF-16
static void ConvertUTF82UTF16(const char* from, Char* to, uint32 fromLength, uint32* toLength); static void ConvertUTF82UTF16(const char* from, Char* to, int32 fromLength, int32& toLength);
// Convert characters from UTF-8 to UTF-16 (allocates the output buffer with Allocator::Allocate of size toLength + 1, call Allocator::Free after usage). Returns null on empty or invalid string.
static Char* ConvertUTF82UTF16(const char* from, int32 fromLength, int32& toLength);
public: public: