From 64685705d02dda246dcfc969b9f2ff7d07873256 Mon Sep 17 00:00:00 2001 From: Christoph Oelckers Date: Sat, 16 Feb 2019 13:05:19 +0100 Subject: [PATCH] - made the console Unicode-capable. This also necessitated some character remapping in the console font to move the Windows-1252 extra characters to their proper Unicode code points. --- src/CMakeLists.txt | 1 + src/c_console.cpp | 293 ++++++++++++++++-------------- src/c_consolebuffer.cpp | 3 +- src/posix/sdl/i_input.cpp | 4 +- src/scripting/backend/codegen.cpp | 20 +- src/serializer.cpp | 113 ++---------- src/utility/utf8.cpp | 249 +++++++++++++++++++++++++ src/utility/utf8.h | 8 + src/v_font.cpp | 18 +- src/v_text.cpp | 113 ------------ 10 files changed, 454 insertions(+), 368 deletions(-) create mode 100644 src/utility/utf8.cpp create mode 100644 src/utility/utf8.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1184dd150..b0789d91e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1271,6 +1271,7 @@ set (PCH_SOURCES utility/name.cpp utility/s_playlist.cpp utility/v_collection.cpp + utility/utf8.cpp utility/zstrformat.cpp ) diff --git a/src/c_console.cpp b/src/c_console.cpp index 03687cf83..3185267f2 100644 --- a/src/c_console.cpp +++ b/src/c_console.cpp @@ -61,6 +61,7 @@ #include "c_consolebuffer.h" #include "g_levellocals.h" #include "vm.h" +#include "utf8.h" #include "gi.h" @@ -167,17 +168,19 @@ struct History struct FCommandBuffer { +private: FString Text; // The actual command line text - unsigned CursorPos; - unsigned StartPos; // First character to display + unsigned CursorPos = 0; + unsigned StartPos = 0; // First character to display + unsigned CursorPosChars = 0; + unsigned StartPosChars = 0; FString YankBuffer; // Deleted text buffer - bool AppendToYankBuffer; // Append consecutive deletes to buffer - FCommandBuffer() - { - CursorPos = StartPos = 0; - } +public: + bool AppendToYankBuffer = false; // Append consecutive deletes to buffer + + FCommandBuffer() = default; 
FCommandBuffer(const FCommandBuffer &o) { @@ -186,6 +189,16 @@ struct FCommandBuffer StartPos = o.StartPos; } + FString GetText() const + { + return Text; + } + + size_t TextLength() const + { + return Text.Len(); + } + void Draw(int x, int y, int scale, bool cursor) { if (scale == 1) @@ -197,7 +210,7 @@ struct FCommandBuffer if (cursor) { screen->DrawChar(ConFont, CR_YELLOW, - x + ConFont->GetCharWidth(0x1c) + (CursorPos - StartPos) * ConFont->GetCharWidth(0xb), + x + ConFont->GetCharWidth(0x1c) + (CursorPosChars - StartPosChars) * ConFont->GetCharWidth(0xb), y, '\xb', TAG_DONE); } } @@ -217,7 +230,7 @@ struct FCommandBuffer if (cursor) { screen->DrawChar(ConFont, CR_YELLOW, - x + ConFont->GetCharWidth(0x1c) + (CursorPos - StartPos) * ConFont->GetCharWidth(0xb), + x + ConFont->GetCharWidth(0x1c) + (CursorPosChars - StartPosChars) * ConFont->GetCharWidth(0xb), y, '\xb', DTA_VirtualWidth, screen->GetWidth() / scale, DTA_VirtualHeight, screen->GetHeight() / scale, @@ -226,108 +239,127 @@ struct FCommandBuffer } } + unsigned BytesForChars(unsigned chars) + { + unsigned bytes = 0; + while (chars > 0) + { + if ((Text[bytes++] & 0xc0) != 0x80) chars--; + } + return bytes; + } + void MakeStartPosGood() { - int n = StartPos; + int n = StartPosChars; unsigned cols = ConCols / active_con_scale(); - if (StartPos >= Text.Len()) + if (StartPosChars >= Text.CharacterCount()) { // Start of visible line is beyond end of line - n = CursorPos - cols + 2; + n = CursorPosChars - cols + 2; } - if ((CursorPos - StartPos) >= cols - 2) + if ((CursorPosChars - StartPosChars) >= cols - 2) { // The cursor is beyond the visible part of the line - n = CursorPos - cols + 2; + n = CursorPosChars - cols + 2; } - if (StartPos > CursorPos) + if (StartPosChars > CursorPosChars) { // The cursor is in front of the visible part of the line - n = CursorPos; + n = CursorPosChars; } - StartPos = MAX(0, n); - } - - unsigned WordBoundaryRight() - { - unsigned index = CursorPos; - while (index < Text.Len() 
&& Text[index] == ' ') { - index++; - } - while (index < Text.Len() && Text[index] != ' ') { - index++; - } - return index; - } - - unsigned WordBoundaryLeft() - { - int index = CursorPos - 1; - while (index > -1 && Text[index] == ' ') { - index--; - } - while (index > -1 && Text[index] != ' ') { - index--; - } - return (unsigned)index + 1; + StartPosChars = MAX(0, n); + StartPos = BytesForChars(StartPosChars); } void CursorStart() { CursorPos = 0; StartPos = 0; + CursorPosChars = 0; + StartPosChars = 0; } void CursorEnd() { CursorPos = (unsigned)Text.Len(); - StartPos = 0; + CursorPosChars = (unsigned)Text.CharacterCount(); + StartPosChars = 0; MakeStartPosGood(); } +private: + void MoveCursorLeft() + { + CursorPosChars--; + do CursorPos--; + while ((Text[CursorPos] & 0xc0) == 0x80); // Step back to the last non-continuation byte. + } + + void MoveCursorRight() + { + CursorPosChars++; + do CursorPos++; + while ((Text[CursorPos] & 0xc0) == 0x80); // Step forward past any continuation bytes to the start of the next character. 
+ } + +public: void CursorLeft() { - if (CursorPos > 0) + if (CursorPosChars > 0) { - CursorPos--; + MoveCursorLeft(); MakeStartPosGood(); } } void CursorRight() { - if (CursorPos < Text.Len()) + if (CursorPosChars < Text.CharacterCount()) { - CursorPos++; + MoveCursorRight(); MakeStartPosGood(); } } void CursorWordLeft() { - CursorPos = WordBoundaryLeft(); - MakeStartPosGood(); + if (CursorPosChars > 0) + { + do MoveCursorLeft(); + while (CursorPosChars > 0 && Text[CursorPos - 1] != ' '); + MakeStartPosGood(); + } } void CursorWordRight() { - CursorPos = WordBoundaryRight(); - MakeStartPosGood(); + if (CursorPosChars < Text.CharacterCount()) + { + do MoveCursorRight(); + while (CursorPosChars < Text.CharacterCount() && Text[CursorPos] != ' '); + MakeStartPosGood(); + } } void DeleteLeft() { if (CursorPos > 0) { - Text.Remove(CursorPos - 1, 1); - CursorPos--; + auto now = CursorPos; + MoveCursorLeft(); + Text.Remove(CursorPos, now - CursorPos); MakeStartPosGood(); } } void DeleteRight() { - if (CursorPos < Text.Len()) + if (CursorPosChars < Text.CharacterCount()) { - Text.Remove(CursorPos, 1); + auto now = CursorPos; + MoveCursorRight(); + Text.Remove(now, CursorPos - now); + CursorPos = now; + CursorPosChars--; MakeStartPosGood(); } } @@ -336,14 +368,16 @@ struct FCommandBuffer { if (CursorPos > 0) { - unsigned index = WordBoundaryLeft(); + auto now = CursorPos; + + CursorWordLeft(); + if (AppendToYankBuffer) { - YankBuffer = FString(&Text[index], CursorPos - index) + YankBuffer; + YankBuffer = FString(&Text[CursorPos], now - CursorPos) + YankBuffer; } else { - YankBuffer = FString(&Text[index], CursorPos - index); + YankBuffer = FString(&Text[CursorPos], now - CursorPos); } - Text.Remove(index, CursorPos - index); - CursorPos = index; + Text.Remove(CursorPos, now - CursorPos); MakeStartPosGood(); } } @@ -378,18 +412,23 @@ struct FCommandBuffer void AddChar(int character) { - ///FIXME: Not Unicode-aware - if (CursorPos == Text.Len()) + uint8_t encoded[5]; + int 
size; + if (utf8_encode(character, encoded, &size) == 0) { - Text += char(character); + encoded[size] = 0; + if (Text.IsEmpty()) + { + Text = (char*)encoded; + } + else + { + Text.Insert(CursorPos, (char*)encoded); + } + CursorPos += size; + CursorPosChars++; + MakeStartPosGood(); } - else - { - char foo = char(character); - Text.Insert(CursorPos, &foo, 1); - } - CursorPos++; - MakeStartPosGood(); } void AddString(FString clip) @@ -401,6 +440,7 @@ struct FCommandBuffer if (brk >= 0) { clip.Truncate(brk); + clip = MakeUTF8(clip.GetChars()); // Make sure that we actually have UTF-8 text. } if (Text.IsEmpty()) { @@ -411,16 +451,22 @@ struct FCommandBuffer Text.Insert(CursorPos, clip); } CursorPos += (unsigned)clip.Len(); + CursorPosChars += (unsigned)clip.CharacterCount(); MakeStartPosGood(); } } - void SetString(FString str) + void SetString(const FString &str) { - Text = str; - CursorPos = (unsigned)Text.Len(); + Text = MakeUTF8(str); + CursorEnd(); MakeStartPosGood(); } + + void AddYankBuffer() + { + AddString(YankBuffer); + } }; static FCommandBuffer CmdLine; @@ -798,33 +844,6 @@ void FNotifyBuffer::AddString(int printlevel, FString source) TopGoal = 0; } -/* Adds a string to the console and also to the notify buffer */ -int utf8_encode(int32_t codepoint, char *buffer, int *size); -static TArray UTF8String; - -const char *MakeUTF8(const char *outline, int *numchars = nullptr) -{ - UTF8String.Clear(); - const uint8_t *in = (const uint8_t*)outline; - - if (numchars) *numchars = 0; - while (int chr = GetCharFromString(in)) - { - int size = 0; - char encode[4]; - if (!utf8_encode(chr, encode, &size)) - { - for (int i = 0; i < size; i++) - { - UTF8String.Push(encode[i]); - } - } - if (numchars) *numchars++; - } - UTF8String.Push(0); - return UTF8String.Data(); -} - void AddToConsole (int printlevel, const char *text) { conbuffer->AddText(printlevel, MakeUTF8(text), Logfile); @@ -844,7 +863,7 @@ int PrintString (int printlevel, const char *outline) if (printlevel != 
PRINT_LOG) { - I_PrintStr(UTF8String.Data()); + I_PrintStr(outline); conbuffer->AddText(printlevel, outline, Logfile); if (vidactive && screen && SmallFont) @@ -1242,10 +1261,7 @@ void C_DrawConsole () { if (gamestate != GS_STARTUP) { - // Make a copy of the command line, in case an input event is handled - // while we draw the console and it changes. - FCommandBuffer command(CmdLine); - command.Draw(left, bottomline, textScale, cursoron); + CmdLine.Draw(left, bottomline, textScale, cursoron); } if (RowAdjust && ConBottom >= ConFont->GetHeight()*7/2) { @@ -1527,7 +1543,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) break; case 'D': - if (ev->data3 & GKM_CTRL && buffer.Text.Len() == 0) + if (ev->data3 & GKM_CTRL && buffer.TextLength() == 0) { // Control-D pressed on an empty line if (strlen(con_ctrl_d) == 0) { @@ -1542,16 +1558,18 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) // Intentional fall-through for command(s) added with Ctrl-D case '\r': + { // Execute command line (ENTER) + FString bufferText = buffer.GetText(); - buffer.Text.StripLeftRight(); - Printf(127, TEXTCOLOR_WHITE "]%s\n", buffer.Text.GetChars()); + bufferText.StripLeftRight(); + Printf(127, TEXTCOLOR_WHITE "]%s\n", bufferText.GetChars()); - if (buffer.Text.Len() == 0) + if (bufferText.Len() == 0) { - // Command line is empty, so do nothing to the history + // Command line is empty, so do nothing to the history } - else if (HistHead && HistHead->String.CompareNoCase(buffer.Text) == 0) + else if (HistHead && HistHead->String.CompareNoCase(bufferText) == 0) { // Command line was the same as the previous one, // so leave the history list alone @@ -1563,7 +1581,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) // so add it to the history list. 
History *temp = new History; - temp->String = buffer.Text; + temp->String = bufferText; temp->Older = HistHead; if (HistHead) { @@ -1589,16 +1607,12 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) } } HistPos = NULL; - { - // Work with a copy of command to avoid side effects caused by - // exception raised during execution, like with 'error' CCMD. - FString copy = buffer.Text; - buffer.SetString(""); - AddCommandString(copy); - } + buffer.SetString(""); + AddCommandString(bufferText); TabbedLast = false; TabbedList = false; break; + } case '`': // Check to see if we have ` bound to the console before accepting @@ -1639,9 +1653,9 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) { if (data1 == 'C') { // copy to clipboard - if (buffer.Text.IsNotEmpty()) + if (buffer.TextLength() > 0) { - I_PutInClipboard(buffer.Text); + I_PutInClipboard(buffer.GetText()); } } else @@ -1696,7 +1710,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer) case 'Y': if (ev->data3 & GKM_CTRL) { - buffer.AddString(buffer.YankBuffer); + buffer.AddYankBuffer(); TabbedLast = false; TabbedList = false; HistPos = NULL; @@ -1939,25 +1953,27 @@ static void C_TabComplete (bool goForward) unsigned i; int diffpoint; + auto CmdLineText = CmdLine.GetText(); if (!TabbedLast) { bool cancomplete; + // Skip any spaces at beginning of command line - for (i = 0; i < CmdLine.Text.Len(); ++i) + for (i = 0; i < CmdLineText.Len(); ++i) { - if (CmdLine.Text[i] != ' ') + if (CmdLineText[i] != ' ') break; } - if (i == CmdLine.Text.Len()) + if (i == CmdLineText.Len()) { // Line was nothing but spaces return; } TabStart = i; - TabSize = (int)CmdLine.Text.Len() - TabStart; + TabSize = (int)CmdLineText.Len() - TabStart; - if (!FindTabCommand(&CmdLine.Text[TabStart], &TabPos, TabSize)) + if (!FindTabCommand(&CmdLineText[TabStart], &TabPos, TabSize)) return; // No initial matches // Show a list of possible completions, if more than one. 
@@ -1980,7 +1996,7 @@ static void C_TabComplete (bool goForward) { // Find the last matching tab, then go one past it. while (++TabPos < (int)TabCommands.Size()) { - if (FindDiffPoint(TabCommands[TabPos].TabName, &CmdLine.Text[TabStart]) < TabSize) + if (FindDiffPoint(TabCommands[TabPos].TabName, &CmdLineText[TabStart]) < TabSize) { break; } @@ -1997,25 +2013,25 @@ static void C_TabComplete (bool goForward) (!goForward && --TabPos < 0)) { TabbedLast = false; - CmdLine.Text.Truncate(TabSize); + CmdLineText.Truncate(TabSize); } else { - diffpoint = FindDiffPoint(TabCommands[TabPos].TabName, &CmdLine.Text[TabStart]); + diffpoint = FindDiffPoint(TabCommands[TabPos].TabName, &CmdLineText[TabStart]); if (diffpoint < TabSize) { // No more matches TabbedLast = false; - CmdLine.Text.Truncate(TabSize - TabStart); + CmdLineText.Truncate(TabSize - TabStart); } else { - CmdLine.Text.Truncate(TabStart); - CmdLine.Text << TabCommands[TabPos].TabName << ' '; + CmdLineText.Truncate(TabStart); + CmdLineText << TabCommands[TabPos].TabName << ' '; } } - CmdLine.CursorPos = (unsigned)CmdLine.Text.Len(); + CmdLine.SetString(CmdLineText); CmdLine.MakeStartPosGood(); } @@ -2028,9 +2044,10 @@ static bool C_TabCompleteList () nummatches = 0; maxwidth = 0; + auto CmdLineText = CmdLine.GetText(); for (i = TabPos; i < (int)TabCommands.Size(); ++i) { - if (FindDiffPoint (TabCommands[i].TabName, &CmdLine.Text[TabStart]) < TabSize) + if (FindDiffPoint (TabCommands[i].TabName, &CmdLineText[TabStart]) < TabSize) { break; } @@ -2055,7 +2072,7 @@ static bool C_TabCompleteList () { size_t x = 0; maxwidth += 3; - Printf (TEXTCOLOR_BLUE "Completions for %s:\n", CmdLine.Text.GetChars()); + Printf (TEXTCOLOR_BLUE "Completions for %s:\n", CmdLineText.GetChars()); for (i = TabPos; nummatches > 0; ++i, --nummatches) { // [Dusk] Print console commands blue, CVars green, aliases red. 
@@ -2087,9 +2104,9 @@ static bool C_TabCompleteList () if (TabSize != commonsize) { TabSize = commonsize; - CmdLine.Text.Truncate(TabStart); - CmdLine.Text.AppendCStrPart(TabCommands[TabPos].TabName.GetChars(), commonsize); - CmdLine.CursorPos = (unsigned)CmdLine.Text.Len(); + CmdLineText.Truncate(TabStart); + CmdLineText.AppendCStrPart(TabCommands[TabPos].TabName.GetChars(), commonsize); + CmdLine.SetString(CmdLineText); } return false; } diff --git a/src/c_consolebuffer.cpp b/src/c_consolebuffer.cpp index 9868bc6c3..0d0d0bc03 100644 --- a/src/c_consolebuffer.cpp +++ b/src/c_consolebuffer.cpp @@ -270,8 +270,7 @@ void FConsoleBuffer::FormatText(FFont *formatfont, int displaywidth) unsigned brokensize = m_BrokenConsoleText.Size(); if (brokensize == mConsoleText.Size()) { - // The last line got text appended. We have to wait until here to format it because - // it is possible that during display new text will be added from the NetUpdate calls in the software version of DrawTextureV. + // The last line got text appended. 
if (mLastLineNeedsUpdate) { brokensize--; diff --git a/src/posix/sdl/i_input.cpp b/src/posix/sdl/i_input.cpp index 6ea1cb70d..10c46a4a4 100644 --- a/src/posix/sdl/i_input.cpp +++ b/src/posix/sdl/i_input.cpp @@ -46,6 +46,7 @@ #include "events.h" #include "g_game.h" #include "g_levellocals.h" +#include "utf8.h" static void I_CheckGUICapture (); @@ -471,10 +472,9 @@ void MessagePump (const SDL_Event &sev) case SDL_TEXTINPUT: if (GUICapture) { - int utf8_decode(const char *src, int *size); int size; - int unichar = utf8_decode(sev.text.text, &size); + int unichar = utf8_decode((const uint8_t*)sev.text.text, &size); if (size != 4) { event.type = EV_GUI_Event; diff --git a/src/scripting/backend/codegen.cpp b/src/scripting/backend/codegen.cpp index 878c7beea..7c8f207d6 100644 --- a/src/scripting/backend/codegen.cpp +++ b/src/scripting/backend/codegen.cpp @@ -49,10 +49,10 @@ #include "doomstat.h" #include "g_levellocals.h" #include "v_video.h" +#include "utf8.h" extern FRandom pr_exrandom; FMemArena FxAlloc(65536); -int utf8_decode(const char *src, int *size); struct FLOP { @@ -318,19 +318,13 @@ static FxExpression *StringConstToChar(FxExpression *basex) // This serves as workaround for not being able to use single quoted literals because those are taken for names. ExpVal constval = static_cast(basex)->GetValue(); FString str = constval.GetString(); - if (str.Len() == 1) + int position = 0; + int chr = str.GetNextCharacter(position); + + // Only succeed if the full string is consumed, i.e. it contains only one code point. + if (position == str.Len()) { - return new FxConstant(str[0], basex->ScriptPosition); - } - else if (str.Len() > 1) - { - // If the string is UTF-8, allow a single character UTF-8 sequence. 
- int size; - int c = utf8_decode(str.GetChars(), &size); - if (c >= 0 && size_t(size) == str.Len()) - { - return new FxConstant(c, basex->ScriptPosition); - } + return new FxConstant(chr, basex->ScriptPosition); } return nullptr; } diff --git a/src/serializer.cpp b/src/serializer.cpp index 7fccd7475..e8a84f353 100644 --- a/src/serializer.cpp +++ b/src/serializer.cpp @@ -59,114 +59,31 @@ #include "v_text.h" #include "cmdlib.h" #include "g_levellocals.h" +#include "utf8.h" char nulspace[1024 * 1024 * 4]; bool save_full = false; // for testing. Should be removed afterward. -int utf8_encode(int32_t codepoint, char *buffer, int *size) -{ - if (codepoint < 0) - return -1; - else if (codepoint < 0x80) - { - buffer[0] = (char)codepoint; - *size = 1; - } - else if (codepoint < 0x800) - { - buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); - buffer[1] = 0x80 + ((codepoint & 0x03F)); - *size = 2; - } - else if (codepoint < 0x10000) - { - buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); - buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); - buffer[2] = 0x80 + ((codepoint & 0x003F)); - *size = 3; - } - else if (codepoint <= 0x10FFFF) - { - buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); - buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); - buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); - buffer[3] = 0x80 + ((codepoint & 0x00003F)); - *size = 4; - } - else - return -1; - - return 0; -} - -int utf8_decode(const char *src, int *size) -{ - int c = src[0] & 255; - int r; - - *size = 1; - if ((c & 0x80) == 0) - { - return c; - } - - int c1 = src[1] & 255; - - if ((c & 0xE0) == 0xC0) - { - r = ((c & 0x1F) << 6) | c1; - if (r >= 128) - { - *size = 2; - return r; - } - return -1; - } - - int c2 = src[2] & 255; - - if ((c & 0xF0) == 0xE0) - { - r = ((c & 0x0F) << 12) | (c1 << 6) | c2; - if (r >= 2048 && (r < 55296 || r > 57343)) - { - *size = 3; - return r; - } - return -1; - } - - int c3 = src[3] & 255; - - if ((c & 0xF8) == 0xF0) - { - r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) 
| c3; - if (r >= 65536 && r <= 1114111) - { - *size = 4; - return r; - } - } - return -1; -} +//========================================================================== +// +// This will double-encode already existing UTF-8 content. +// The reason for this behavior is to preserve any original data coming through here, no matter what it is. +// If these are script-based strings, exact preservation in the serializer is very important. +// +//========================================================================== static TArray out; static const char *StringToUnicode(const char *cc, int size = -1) { int ch; - const char *c = cc; + const uint8_t *c = (const uint8_t*)cc; int count = 0; int count1 = 0; out.Clear(); while ((ch = (*c++) & 255)) { count1++; - if (ch >= 128) - { - if (ch < 0x800) count += 2; - else count += 3; - // The source cannot contain 4-byte chars. - } + if (ch >= 128) count += 2; else count++; if (count1 == size && size > 0) break; } @@ -174,11 +91,11 @@ static const char *StringToUnicode(const char *cc, int size = -1) // we need to convert out.Resize(count + 1); out.Last() = 0; - c = cc; + c = (const uint8_t*)cc; int i = 0; - while ((ch = (*c++) & 255)) + while ((ch = (*c++))) { - utf8_encode(ch, &out[i], &count1); + utf8_encode(ch, (uint8_t*)&out[i], &count1); i += count1; } return &out[0]; @@ -191,8 +108,8 @@ static const char *UnicodeToString(const char *cc) while (*cc != 0) { int size; - int c = utf8_decode(cc, &size); - if (c < 0 || c > 255) c = '?'; + int c = utf8_decode((const uint8_t*)cc, &size); + if (c < 0 || c > 255) c = '?'; // This should never happen because all content was encoded with StringToUnicode which only produces code points 0-255. 
out[ndx++] = c; cc += size; } diff --git a/src/utility/utf8.cpp b/src/utility/utf8.cpp new file mode 100644 index 000000000..d6e3e7edf --- /dev/null +++ b/src/utility/utf8.cpp @@ -0,0 +1,249 @@ +/* +** utf8.cpp +** UTF-8 utilities +** +**--------------------------------------------------------------------------- +** Copyright 2019 Christoph Oelckers +** All rights reserved. +** +** Redistribution and use in source and binary forms, with or without +** modification, are permitted provided that the following conditions +** are met: +** +** 1. Redistributions of source code must retain the above copyright +** notice, this list of conditions and the following disclaimer. +** 2. Redistributions in binary form must reproduce the above copyright +** notice, this list of conditions and the following disclaimer in the +** documentation and/or other materials provided with the distribution. +** 3. The name of the author may not be used to endorse or promote products +** derived from this software without specific prior written permission. +** +** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**--------------------------------------------------------------------------- +** +*/ +#include +#include "tarray.h" + + +//========================================================================== +// +// Encodes one code point as UTF-8 into buffer (1 to 4 bytes, no terminator) and stores the byte count in *size. Returns 0 on success, -1 if the code point is negative or above 0x10FFFF. +// +//========================================================================== + +int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size) +{ + if (codepoint < 0) + return -1; + else if (codepoint < 0x80) + { + buffer[0] = (char)codepoint; + *size = 1; + } + else if (codepoint < 0x800) + { + buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6); + buffer[1] = 0x80 + ((codepoint & 0x03F)); + *size = 2; + } + else if (codepoint < 0x10000) + { + buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12); + buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6); + buffer[2] = 0x80 + ((codepoint & 0x003F)); + *size = 3; + } + else if (codepoint <= 0x10FFFF) + { + buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18); + buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12); + buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6); + buffer[3] = 0x80 + ((codepoint & 0x00003F)); + *size = 4; + } + else + return -1; + + return 0; +} + +//========================================================================== +// Decodes the UTF-8 sequence starting at src. Returns the code point and stores the sequence length in *size. +// Returns -1 with *size == 1 for malformed, overlong, surrogate or out-of-range input. +// Every trailing byte must have the form 10xxxxxx; only its low 6 bits are merged into the result. +//========================================================================== + +int utf8_decode(const uint8_t *src, int *size) +{ + int c = src[0]; + int r; + + *size = 1; + if ((c & 0x80) == 0) + { + return c; + } + + int c1 = src[1]; + if ((c1 & 0xC0) != 0x80) return -1; + if ((c & 0xE0) == 0xC0) + { + r = ((c & 0x1F) << 6) | (c1 & 0x3F); + if (r >= 128) + { + *size = 2; + return r; + } + return -1; + } + + int c2 = src[2]; + if ((c2 & 0xC0) != 0x80) return -1; + if ((c & 0xF0) == 0xE0) + { + r = ((c & 0x0F) << 12) | ((c1 & 0x3F) << 6) | (c2 & 0x3F); + if (r >= 2048 && (r < 55296 || r > 57343)) + { + *size = 3; + return r; + } + return -1; + } + + int c3 = src[3]; + if ((c3 & 0xC0) != 0x80) return -1; + if ((c & 0xF8) == 0xF0) + { + r = ((c & 0x07) << 18) | ((c1 & 0x3F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + if (r >= 65536 && r <= 1114111) + { + *size = 4; + return r; + } + } + return -1; +} + 
+//========================================================================== +// +// Unicode mapping for the 0x80-0x9f range of the Windows 1252 code page +// +//========================================================================== + +uint16_t win1252map[] = { + 0x20AC, + 0x81 , + 0x201A, + 0x0192, + 0x201E, + 0x2026, + 0x2020, + 0x2021, + 0x02C6, + 0x2030, + 0x0160, + 0x2039, + 0x0152, + 0x8d , + 0x017D, + 0x8f , + 0x90 , + 0x2018, + 0x2019, + 0x201C, + 0x201D, + 0x2022, + 0x2013, + 0x2014, + 0x02DC, + 0x2122, + 0x0161, + 0x203A, + 0x0153, + 0x9d , + 0x017E, + 0x0178, +}; + +//========================================================================== +// +// reads one character from the string. +// This can handle both ISO 8859-1/Windows-1252 and UTF-8, as well as mixed strings +// between both encodings, which may happen if inconsistent encoding is +// used between different files in a mod. +// +//========================================================================== + +int GetCharFromString(const uint8_t *&string) +{ + int z; + + z = *string; + + if (z < 192) + { + string++; + + // Handle Windows 1252 characters + if (z >= 128 && z < 160) + { + return win1252map[z - 128]; + } + return z; + } + else + { + int size = 0; + auto chr = utf8_decode(string, &size); + if (chr >= 0) + { + string += size; + return chr; + } + string++; + return z; + } +} + +//========================================================================== +// +// convert a potentially mixed-encoded string to pure UTF-8 +// this returns a pointer to a static buffer, +// assuming that its caller will immediately process the result. 
+// +//========================================================================== + +static TArray UTF8String; + +const char *MakeUTF8(const char *outline, int *numchars = nullptr) +{ + UTF8String.Clear(); + const uint8_t *in = (const uint8_t*)outline; + + if (numchars) *numchars = 0; + while (int chr = GetCharFromString(in)) + { + int size = 0; + uint8_t encode[4]; + if (!utf8_encode(chr, encode, &size)) + { + for (int i = 0; i < size; i++) + { + UTF8String.Push(encode[i]); + } + } + if (numchars) (*numchars)++; + } + UTF8String.Push(0); + return UTF8String.Data(); +} diff --git a/src/utility/utf8.h b/src/utility/utf8.h new file mode 100644 index 000000000..60531b12f --- /dev/null +++ b/src/utility/utf8.h @@ -0,0 +1,8 @@ +#pragma once + +int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size); +int utf8_decode(const uint8_t *src, int *size); +int GetCharFromString(const uint8_t *&string); +const char *MakeUTF8(const char *outline, int *numchars = nullptr); // returns a pointer to a static buffer, assuming that its caller will immediately process the result. + +extern uint16_t win1252map[]; diff --git a/src/v_font.cpp b/src/v_font.cpp index 0faa55a8c..b1ce4812a 100644 --- a/src/v_font.cpp +++ b/src/v_font.cpp @@ -91,6 +91,7 @@ The FON2 header is followed by variable length data: #include "v_text.h" #include "vm.h" #include "image.h" +#include "utf8.h" #include "textures/formats/fontchars.h" // MACROS ------------------------------------------------------------------ @@ -1844,7 +1845,10 @@ void FSingleLumpFont::LoadFON1 (int lump, const uint8_t *data) { int w, h; - Chars.Resize(256); + // The default console font is for Windows-1252 and fills the 0x80-0x9f range with valid glyphs. + // Since now all internal text is processed as Unicode, these have to be remapped to their proper places. + // The highest valid character in this range is 0x2122, so we need 0x2123 entries in our character table. 
+ Chars.Resize(0x2123); w = data[4] + data[5]*256; h = data[6] + data[7]*256; @@ -1853,10 +1857,20 @@ void FSingleLumpFont::LoadFON1 (int lump, const uint8_t *data) FontHeight = h; SpaceWidth = w; FirstChar = 0; - LastChar = 255; + LastChar = 255; // This is to allow LoadTranslations to function. The way this is all set up really needs to be changed. GlobalKerning = 0; translateUntranslated = true; LoadTranslations(); + LastChar = 0x2122; + + // Move the Windows-1252 characters to their proper place. + for (int i = 0x80; i < 0xa0; i++) + { + if (win1252map[i-0x80] != i && Chars[i].TranslatedPic != nullptr && Chars[win1252map[i - 0x80]].TranslatedPic == nullptr) + { + std::swap(Chars[i], Chars[win1252map[i - 0x80]]); + } + } } //========================================================================== diff --git a/src/v_text.cpp b/src/v_text.cpp index 313abb275..e7634fa5e 100644 --- a/src/v_text.cpp +++ b/src/v_text.cpp @@ -49,119 +49,6 @@ int ListGetInt(VMVa_List &tags); -//========================================================================== -// -// reads one character from the string. -// This can handle both ISO 8859-1 and UTF-8, as well as mixed strings -// between both encodings, which may happen if inconsistent encoding is -// used between different files in a mod. -// The long term goal should be to convert all text to UTF-8 on loading and -// make this require pure UTF-8 input. 
-// -//========================================================================== - -int GetCharFromString(const uint8_t *&string) -{ - int z, y, x; - - z = *string++; - - if (z < 192) - { - // Handle Windows 1252 characters - if (z >= 128 && z < 160) - { - static const uint16_t map0x80_0x9f[] = { - 0x20AC, - 0x81 , - 0x201A, - 0x0192, - 0x201E, - 0x2026, - 0x2020, - 0x2021, - 0x02C6, - 0x2030, - 0x0160, - 0x2039, - 0x0152, - 0x8d , - 0x017D, - 0x8f , - 0x90 , - 0x2018, - 0x2019, - 0x201C, - 0x201D, - 0x2022, - 0x2013, - 0x2014, - 0x02DC, - 0x2122, - 0x0161, - 0x203A, - 0x0153, - 0x9d , - 0x017E, - 0x0178, - }; - return map0x80_0x9f[z - 128]; - } - return z; - } - else if (z <= 223) - { - y = *string++; - if (y < 128 || y >= 192) - { - // not an UTF-8 sequence so return the first byte unchanged - string--; - } - else - { - z = (z - 192) * 64 + (y - 128); - } - } - else if (z >= 224 && z <= 239) - { - y = *string++; - if (y < 128 || y >= 192) - { - // not an UTF-8 sequence so return the first byte unchanged - string--; - } - else - { - x = *string++; - if (x < 128 || x >= 192) - { - // not an UTF-8 sequence so return the first byte unchanged - string -= 2; - } - else - { - z = (z - 224) * 4096 + (y - 128) * 64 + (x - 128); - } - } - } - else if (z >= 240) - { - y = *string++; - if (y < 128 || y >= 192) - { - // not an UTF-8 sequence so return the first byte unchanged - string--; - } - else - { - // we do not support 4-Byte UTF-8 here - string += 2; - return '?'; - } - } - return z; -} - //========================================================================== // // DrawChar