From 64685705d02dda246dcfc969b9f2ff7d07873256 Mon Sep 17 00:00:00 2001
From: Christoph Oelckers <coelckers@users.noreply.github.com>
Date: Sat, 16 Feb 2019 13:05:19 +0100
Subject: [PATCH] - made the console Unicode-capable.

This also necessitated some character remapping in the console font to move the Windows-1252 extra characters to their proper Unicode code points.
---
 src/CMakeLists.txt                |   1 +
 src/c_console.cpp                 | 293 ++++++++++++++++--------------
 src/c_consolebuffer.cpp           |   3 +-
 src/posix/sdl/i_input.cpp         |   4 +-
 src/scripting/backend/codegen.cpp |  20 +-
 src/serializer.cpp                | 113 ++----------
 src/utility/utf8.cpp              | 249 +++++++++++++++++++++++++
 src/utility/utf8.h                |   8 +
 src/v_font.cpp                    |  18 +-
 src/v_text.cpp                    | 113 ------------
 10 files changed, 454 insertions(+), 368 deletions(-)
 create mode 100644 src/utility/utf8.cpp
 create mode 100644 src/utility/utf8.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1184dd150..b0789d91e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1271,6 +1271,7 @@ set (PCH_SOURCES
 	utility/name.cpp
 	utility/s_playlist.cpp
 	utility/v_collection.cpp
+	utility/utf8.cpp
 	utility/zstrformat.cpp
 )
 
diff --git a/src/c_console.cpp b/src/c_console.cpp
index 03687cf83..3185267f2 100644
--- a/src/c_console.cpp
+++ b/src/c_console.cpp
@@ -61,6 +61,7 @@
 #include "c_consolebuffer.h"
 #include "g_levellocals.h"
 #include "vm.h"
+#include "utf8.h"
 
 
 #include "gi.h"
@@ -167,17 +168,19 @@ struct History
 
 struct FCommandBuffer
 {
+private:
 	FString Text;	// The actual command line text
-	unsigned CursorPos;
-	unsigned StartPos;	// First character to display
+	unsigned CursorPos = 0;
+	unsigned StartPos = 0;	// First character to display
+	unsigned CursorPosChars = 0;
+	unsigned StartPosChars = 0;
 
 	FString YankBuffer;	// Deleted text buffer
-	bool AppendToYankBuffer;	// Append consecutive deletes to buffer
 
-	FCommandBuffer()
-	{
-		CursorPos = StartPos = 0;
-	}
+public:
+	bool AppendToYankBuffer = false;	// Append consecutive deletes to buffer
+
+	FCommandBuffer() = default;
 
 	FCommandBuffer(const FCommandBuffer &o)
 	{
@@ -186,6 +189,16 @@ struct FCommandBuffer
 		StartPos = o.StartPos;
 	}
 
+	FString GetText() const	// Read-only access for outside code: hands out a copy of the (now private) command line text.
+	{
+		return Text;
+	}
+
+	size_t TextLength() const	// Length as reported by Text.Len() (the unit CursorPos uses), not Text.CharacterCount().
+	{
+		return Text.Len();
+	}
+
 	void Draw(int x, int y, int scale, bool cursor)
 	{
 		if (scale == 1)
@@ -197,7 +210,7 @@ struct FCommandBuffer
 			if (cursor)
 			{
 				screen->DrawChar(ConFont, CR_YELLOW,
-					x + ConFont->GetCharWidth(0x1c) + (CursorPos - StartPos) * ConFont->GetCharWidth(0xb),
+					x + ConFont->GetCharWidth(0x1c) + (CursorPosChars - StartPosChars) * ConFont->GetCharWidth(0xb),
 					y, '\xb', TAG_DONE);
 			}
 		}
@@ -217,7 +230,7 @@ struct FCommandBuffer
 			if (cursor)
 			{
 				screen->DrawChar(ConFont, CR_YELLOW,
-					x + ConFont->GetCharWidth(0x1c) + (CursorPos - StartPos) * ConFont->GetCharWidth(0xb),
+					x + ConFont->GetCharWidth(0x1c) + (CursorPosChars - StartPosChars) * ConFont->GetCharWidth(0xb),
 					y, '\xb',
 					DTA_VirtualWidth, screen->GetWidth() / scale,
 					DTA_VirtualHeight, screen->GetHeight() / scale,
@@ -226,108 +239,127 @@ struct FCommandBuffer
 		}
 	}
 
+	unsigned BytesForChars(unsigned chars)
+	{
+		// Returns the byte offset at which character number 'chars' starts, clamped to the end of the text.
+		const unsigned len = (unsigned)Text.Len();
+		unsigned bytes = 0;
+		for (; bytes < len; bytes++)	// A lead byte starts a character; continuation bytes (10xxxxxx) belong to the previous one.
+			if ((Text[bytes] & 0xc0) != 0x80 && chars-- == 0) break;	// Stop ON the wanted lead byte, never inside a sequence.
+		return bytes;
+	}
+
 	void MakeStartPosGood()
 	{
-		int n = StartPos;
+		int n = StartPosChars;
 		unsigned cols = ConCols / active_con_scale();
 
-		if (StartPos >= Text.Len())
+		if (StartPosChars >= Text.CharacterCount())
 		{ // Start of visible line is beyond end of line
-			n = CursorPos - cols + 2;
+			n = CursorPosChars - cols + 2;
 		}
-		if ((CursorPos - StartPos) >= cols - 2)
+		if ((CursorPosChars - StartPosChars) >= cols - 2)
 		{ // The cursor is beyond the visible part of the line
-			n = CursorPos - cols + 2;
+			n = CursorPosChars - cols + 2;
 		}
-		if (StartPos > CursorPos)
+		if (StartPosChars > CursorPosChars)
 		{ // The cursor is in front of the visible part of the line
-			n = CursorPos;
+			n = CursorPosChars;
 		}
-		StartPos = MAX(0, n);
-	}
-
-	unsigned WordBoundaryRight()
-	{
-		unsigned index = CursorPos;
-		while (index < Text.Len() && Text[index] == ' ') {
-			index++;
-		}
-		while (index < Text.Len() && Text[index] != ' ') {
-			index++;
-		}
-		return index;
-	}
-
-	unsigned WordBoundaryLeft()
-	{
-		int index = CursorPos - 1;
-		while (index > -1 && Text[index] == ' ') {
-			index--;
-		}
-		while (index > -1 && Text[index] != ' ') {
-			index--;
-		}
-		return (unsigned)index + 1;
+		StartPosChars = MAX(0, n);
+		StartPos = BytesForChars(StartPosChars);
 	}
 
 	void CursorStart()
 	{
 		CursorPos = 0;
 		StartPos = 0;
+		CursorPosChars = 0;
+		StartPosChars = 0;
 	}
 
 	void CursorEnd()
 	{
 		CursorPos = (unsigned)Text.Len();
-		StartPos = 0;
+		CursorPosChars = (unsigned)Text.CharacterCount();
+		StartPosChars = 0;
 		MakeStartPosGood();
 	}
 
+private:
+	void MoveCursorLeft()	// Moves the cursor back by one character. Unchecked: callers test that the cursor is not at the start.
+	{
+		CursorPosChars--;
+		do CursorPos--;
+		while ((Text[CursorPos] & 0xc0) == 0x80);	// Step back to the last non-continuation byte.
+	}
+
+	void MoveCursorRight()	// Moves the cursor forward by one character. Unchecked: callers test that the cursor is not at the end.
+	{
+		CursorPosChars++;
+		do CursorPos++;
+		while ((Text[CursorPos] & 0xc0) == 0x80);	// Step forward past any continuation bytes to the start of the next character.
+	}
+
+public:
 	void CursorLeft()
 	{
-		if (CursorPos > 0)
+		if (CursorPosChars > 0)
 		{
-			CursorPos--;
+			MoveCursorLeft();
 			MakeStartPosGood();
 		}
 	}
 
 	void CursorRight()
 	{
-		if (CursorPos < Text.Len())
+		if (CursorPosChars < Text.CharacterCount())
 		{
-			CursorPos++;
+			MoveCursorRight();
 			MakeStartPosGood();
 		}
 	}
 
 	void CursorWordLeft()
 	{
-		CursorPos = WordBoundaryLeft();
-		MakeStartPosGood();
+		if (CursorPosChars > 0)
+		{
+			do MoveCursorLeft();
+			while (CursorPosChars > 0 && Text[CursorPos - 1] != ' ');
+			MakeStartPosGood();
+		}
 	}
 
 	void CursorWordRight()
 	{
-		CursorPos = WordBoundaryRight();
-		MakeStartPosGood();
+		if (CursorPosChars < Text.CharacterCount())
+		{
+			do MoveCursorRight();
+			while (CursorPosChars < Text.CharacterCount() && Text[CursorPos] != ' ');
+			MakeStartPosGood();
+		}
 	}
 
 	void DeleteLeft()
 	{
 		if (CursorPos > 0)
 		{
-			Text.Remove(CursorPos - 1, 1);
-			CursorPos--;
+			auto now = CursorPos;
+			MoveCursorLeft();
+			Text.Remove(CursorPos, now - CursorPos);
 			MakeStartPosGood();
 		}
 	}
 
 	void DeleteRight()
 	{
-		if (CursorPos < Text.Len())
+		if (CursorPosChars < Text.CharacterCount())
 		{
-			Text.Remove(CursorPos, 1);
+			auto now = CursorPos;
+			MoveCursorRight();
+			Text.Remove(now, CursorPos - now);
+			CursorPos = now;
+			CursorPosChars--;
 			MakeStartPosGood();
 		}
 	}
@@ -336,14 +368,16 @@ struct FCommandBuffer
 	{
 		if (CursorPos > 0)
 		{
-			unsigned index = WordBoundaryLeft();
+			auto now = CursorPos;
+
+			CursorWordLeft();
+
 			if (AppendToYankBuffer) {
-				YankBuffer = FString(&Text[index], CursorPos - index) + YankBuffer;
+				YankBuffer = FString(&Text[CursorPos], now - CursorPos) + YankBuffer;
 			} else {
-				YankBuffer = FString(&Text[index], CursorPos - index);
+				YankBuffer = FString(&Text[CursorPos], now - CursorPos);
 			}
-			Text.Remove(index, CursorPos - index);
-			CursorPos = index;
+			Text.Remove(CursorPos, now - CursorPos);
 			MakeStartPosGood();
 		}
 	}
@@ -378,18 +412,23 @@ struct FCommandBuffer
 
 	void AddChar(int character)
 	{
-		///FIXME: Not Unicode-aware
-		if (CursorPos == Text.Len())
+		uint8_t encoded[5];
+		int size;
+		if (utf8_encode(character, encoded, &size) == 0)
 		{
-			Text += char(character);
+			encoded[size] = 0;
+			if (Text.IsEmpty())
+			{
+				Text = (char*)encoded;
+			}
+			else
+			{
+				Text.Insert(CursorPos, (char*)encoded);
+			}
+			CursorPos += size;
+			CursorPosChars++;
+			MakeStartPosGood();
 		}
-		else
-		{
-			char foo = char(character);
-			Text.Insert(CursorPos, &foo, 1);
-		}
-		CursorPos++;
-		MakeStartPosGood();
 	}
 
 	void AddString(FString clip)
@@ -401,6 +440,7 @@ struct FCommandBuffer
 			if (brk >= 0)
 			{
 				clip.Truncate(brk);
 			}
+			clip = MakeUTF8(clip.GetChars());	// Always make sure that we actually have UTF-8 text, not only when a line break was cut off.
 			if (Text.IsEmpty())
 			{
@@ -411,16 +451,22 @@ struct FCommandBuffer
 				Text.Insert(CursorPos, clip);
 			}
 			CursorPos += (unsigned)clip.Len();
+			CursorPosChars += (unsigned)clip.CharacterCount();
 			MakeStartPosGood();
 		}
 	}
 
-	void SetString(FString str)
+	void SetString(const FString &str)
 	{
-		Text = str;
-		CursorPos = (unsigned)Text.Len();
+		Text = MakeUTF8(str);
+		CursorEnd();
 		MakeStartPosGood();
 	}
+
+	void AddYankBuffer()	// Inserts the yank buffer at the cursor via AddString(); needed because YankBuffer is private.
+	{
+		AddString(YankBuffer);
+	}
 };
 static FCommandBuffer CmdLine;
 
@@ -798,33 +844,6 @@ void FNotifyBuffer::AddString(int printlevel, FString source)
 	TopGoal = 0;
 }
 
-/* Adds a string to the console and also to the notify buffer */
-int utf8_encode(int32_t codepoint, char *buffer, int *size);
-static TArray<char> UTF8String;
-
-const char *MakeUTF8(const char *outline, int *numchars = nullptr)
-{
-	UTF8String.Clear();
-	const uint8_t *in = (const uint8_t*)outline;
-
-	if (numchars) *numchars = 0;
-	while (int chr = GetCharFromString(in))
-	{
-		int size = 0;
-		char encode[4];
-		if (!utf8_encode(chr, encode, &size))
-		{
-			for (int i = 0; i < size; i++)
-			{
-				UTF8String.Push(encode[i]);
-			}
-		}
-		if (numchars) *numchars++;
-	}
-	UTF8String.Push(0);
-	return UTF8String.Data();
-}
-
 void AddToConsole (int printlevel, const char *text)
 {
 	conbuffer->AddText(printlevel, MakeUTF8(text), Logfile);
@@ -844,7 +863,7 @@ int PrintString (int printlevel, const char *outline)
 
 		if (printlevel != PRINT_LOG)
 		{
-			I_PrintStr(UTF8String.Data());
+			I_PrintStr(outline);
 
 			conbuffer->AddText(printlevel, outline, Logfile);
 			if (vidactive && screen && SmallFont)
@@ -1242,10 +1261,7 @@ void C_DrawConsole ()
 		{
 			if (gamestate != GS_STARTUP)
 			{
-				// Make a copy of the command line, in case an input event is handled
-				// while we draw the console and it changes.
-				FCommandBuffer command(CmdLine);
-				command.Draw(left, bottomline, textScale, cursoron);
+				CmdLine.Draw(left, bottomline, textScale, cursoron);
 			}
 			if (RowAdjust && ConBottom >= ConFont->GetHeight()*7/2)
 			{
@@ -1527,7 +1543,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 			break;
 
 		case 'D':
-			if (ev->data3 & GKM_CTRL && buffer.Text.Len() == 0)
+			if (ev->data3 & GKM_CTRL && buffer.TextLength() == 0)
 			{ // Control-D pressed on an empty line
 				if (strlen(con_ctrl_d) == 0)
 				{
@@ -1542,16 +1558,18 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 			// Intentional fall-through for command(s) added with Ctrl-D
 
 		case '\r':
+		{
 			// Execute command line (ENTER)
+			FString bufferText = buffer.GetText();
 
-			buffer.Text.StripLeftRight();
-			Printf(127, TEXTCOLOR_WHITE "]%s\n", buffer.Text.GetChars());
+			bufferText.StripLeftRight();
+			Printf(127, TEXTCOLOR_WHITE "]%s\n", bufferText.GetChars());
 
-			if (buffer.Text.Len() == 0)
+			if (bufferText.Len() == 0)
 			{
-				 // Command line is empty, so do nothing to the history
+				// Command line is empty, so do nothing to the history
 			}
-			else if (HistHead && HistHead->String.CompareNoCase(buffer.Text) == 0)
+			else if (HistHead && HistHead->String.CompareNoCase(bufferText) == 0)
 			{
 				// Command line was the same as the previous one,
 				// so leave the history list alone
@@ -1563,7 +1581,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 				// so add it to the history list.
 
 				History *temp = new History;
-				temp->String = buffer.Text;
+				temp->String = bufferText;
 				temp->Older = HistHead;
 				if (HistHead)
 				{
@@ -1589,16 +1607,12 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 				}
 			}
 			HistPos = NULL;
-			{
-				// Work with a copy of command to avoid side effects caused by
-				// exception raised during execution, like with 'error' CCMD.
-				FString copy = buffer.Text;
-				buffer.SetString("");
-				AddCommandString(copy);
-			}
+			buffer.SetString("");
+			AddCommandString(bufferText);
 			TabbedLast = false;
 			TabbedList = false;
 			break;
+		}
 		
 		case '`':
 			// Check to see if we have ` bound to the console before accepting
@@ -1639,9 +1653,9 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 			{
 				if (data1 == 'C')
 				{ // copy to clipboard
-					if (buffer.Text.IsNotEmpty())
+					if (buffer.TextLength() > 0)
 					{
-						I_PutInClipboard(buffer.Text);
+						I_PutInClipboard(buffer.GetText());
 					}
 				}
 				else
@@ -1696,7 +1710,7 @@ static bool C_HandleKey (event_t *ev, FCommandBuffer &buffer)
 		case 'Y':
 			if (ev->data3 & GKM_CTRL)
 			{
-				buffer.AddString(buffer.YankBuffer);
+				buffer.AddYankBuffer();
 				TabbedLast = false;
 				TabbedList = false;
 				HistPos = NULL;
@@ -1939,25 +1953,27 @@ static void C_TabComplete (bool goForward)
 	unsigned i;
 	int diffpoint;
 
+	auto CmdLineText = CmdLine.GetText();
 	if (!TabbedLast)
 	{
 		bool cancomplete;
 
+
 		// Skip any spaces at beginning of command line
-		for (i = 0; i < CmdLine.Text.Len(); ++i)
+		for (i = 0; i < CmdLineText.Len(); ++i)
 		{
-			if (CmdLine.Text[i] != ' ')
+			if (CmdLineText[i] != ' ')
 				break;
 		}
-		if (i == CmdLine.Text.Len())
+		if (i == CmdLineText.Len())
 		{ // Line was nothing but spaces
 			return;
 		}
 		TabStart = i;
 
-		TabSize = (int)CmdLine.Text.Len() - TabStart;
+		TabSize = (int)CmdLineText.Len() - TabStart;
 
-		if (!FindTabCommand(&CmdLine.Text[TabStart], &TabPos, TabSize))
+		if (!FindTabCommand(&CmdLineText[TabStart], &TabPos, TabSize))
 			return;		// No initial matches
 
 		// Show a list of possible completions, if more than one.
@@ -1980,7 +1996,7 @@ static void C_TabComplete (bool goForward)
 		{ // Find the last matching tab, then go one past it.
 			while (++TabPos < (int)TabCommands.Size())
 			{
-				if (FindDiffPoint(TabCommands[TabPos].TabName, &CmdLine.Text[TabStart]) < TabSize)
+				if (FindDiffPoint(TabCommands[TabPos].TabName, &CmdLineText[TabStart]) < TabSize)
 				{
 					break;
 				}
@@ -1997,25 +2013,25 @@ static void C_TabComplete (bool goForward)
 		(!goForward && --TabPos < 0))
 	{
 		TabbedLast = false;
-		CmdLine.Text.Truncate(TabSize);
+		CmdLineText.Truncate(TabSize);
 	}
 	else
 	{
-		diffpoint = FindDiffPoint(TabCommands[TabPos].TabName, &CmdLine.Text[TabStart]);
+		diffpoint = FindDiffPoint(TabCommands[TabPos].TabName, &CmdLineText[TabStart]);
 
 		if (diffpoint < TabSize)
 		{
 			// No more matches
 			TabbedLast = false;
-			CmdLine.Text.Truncate(TabSize - TabStart);
+			CmdLineText.Truncate(TabSize - TabStart);
 		}
 		else
 		{
-			CmdLine.Text.Truncate(TabStart);
-			CmdLine.Text << TabCommands[TabPos].TabName << ' ';
+			CmdLineText.Truncate(TabStart);
+			CmdLineText << TabCommands[TabPos].TabName << ' ';
 		}
 	}
-	CmdLine.CursorPos = (unsigned)CmdLine.Text.Len();
+	CmdLine.SetString(CmdLineText);
 	CmdLine.MakeStartPosGood();
 }
 
@@ -2028,9 +2044,10 @@ static bool C_TabCompleteList ()
 	nummatches = 0;
 	maxwidth = 0;
 
+	auto CmdLineText = CmdLine.GetText();
 	for (i = TabPos; i < (int)TabCommands.Size(); ++i)
 	{
-		if (FindDiffPoint (TabCommands[i].TabName, &CmdLine.Text[TabStart]) < TabSize)
+		if (FindDiffPoint (TabCommands[i].TabName, &CmdLineText[TabStart]) < TabSize)
 		{
 			break;
 		}
@@ -2055,7 +2072,7 @@ static bool C_TabCompleteList ()
 	{
 		size_t x = 0;
 		maxwidth += 3;
-		Printf (TEXTCOLOR_BLUE "Completions for %s:\n", CmdLine.Text.GetChars());
+		Printf (TEXTCOLOR_BLUE "Completions for %s:\n", CmdLineText.GetChars());
 		for (i = TabPos; nummatches > 0; ++i, --nummatches)
 		{
 			// [Dusk] Print console commands blue, CVars green, aliases red.
@@ -2087,9 +2104,9 @@ static bool C_TabCompleteList ()
 		if (TabSize != commonsize)
 		{
 			TabSize = commonsize;
-			CmdLine.Text.Truncate(TabStart);
-			CmdLine.Text.AppendCStrPart(TabCommands[TabPos].TabName.GetChars(), commonsize);
-			CmdLine.CursorPos = (unsigned)CmdLine.Text.Len();
+			CmdLineText.Truncate(TabStart);
+			CmdLineText.AppendCStrPart(TabCommands[TabPos].TabName.GetChars(), commonsize);
+			CmdLine.SetString(CmdLineText);
 		}
 		return false;
 	}
diff --git a/src/c_consolebuffer.cpp b/src/c_consolebuffer.cpp
index 9868bc6c3..0d0d0bc03 100644
--- a/src/c_consolebuffer.cpp
+++ b/src/c_consolebuffer.cpp
@@ -270,8 +270,7 @@ void FConsoleBuffer::FormatText(FFont *formatfont, int displaywidth)
 	unsigned brokensize = m_BrokenConsoleText.Size();
 	if (brokensize == mConsoleText.Size())
 	{
-		// The last line got text appended. We have to wait until here to format it because
-		// it is possible that during display new text will be added from the NetUpdate calls in the software version of DrawTextureV.
+		// The last line got text appended. 
 		if (mLastLineNeedsUpdate)
 		{
 			brokensize--;
diff --git a/src/posix/sdl/i_input.cpp b/src/posix/sdl/i_input.cpp
index 6ea1cb70d..10c46a4a4 100644
--- a/src/posix/sdl/i_input.cpp
+++ b/src/posix/sdl/i_input.cpp
@@ -46,6 +46,7 @@
 #include "events.h"
 #include "g_game.h"
 #include "g_levellocals.h"
+#include "utf8.h"
 
 
 static void I_CheckGUICapture ();
@@ -471,10 +472,9 @@ void MessagePump (const SDL_Event &sev)
 	case SDL_TEXTINPUT:
 		if (GUICapture)
 		{
-			int utf8_decode(const char *src, int *size);
 			int size;
 			
-			int unichar = utf8_decode(sev.text.text, &size);
+			int unichar = utf8_decode((const uint8_t*)sev.text.text, &size);
 			if (size != 4)
 			{
 				event.type = EV_GUI_Event;
diff --git a/src/scripting/backend/codegen.cpp b/src/scripting/backend/codegen.cpp
index 878c7beea..7c8f207d6 100644
--- a/src/scripting/backend/codegen.cpp
+++ b/src/scripting/backend/codegen.cpp
@@ -49,10 +49,10 @@
 #include "doomstat.h"
 #include "g_levellocals.h"
 #include "v_video.h"
+#include "utf8.h"
 
 extern FRandom pr_exrandom;
 FMemArena FxAlloc(65536);
-int utf8_decode(const char *src, int *size);
 
 struct FLOP
 {
@@ -318,19 +318,13 @@ static FxExpression *StringConstToChar(FxExpression *basex)
 	// This serves as workaround for not being able to use single quoted literals because those are taken for names.
 	ExpVal constval = static_cast<FxConstant *>(basex)->GetValue();
 	FString str = constval.GetString();
-	if (str.Len() == 1)
+	int position = 0;
+	int chr = str.GetNextCharacter(position);
+
+	// Only succeed if the full string is consumed, i.e. it contains only one code point.
+	if (position == str.Len())
 	{
-		return new FxConstant(str[0], basex->ScriptPosition);
-	}
-	else if (str.Len() > 1)
-	{
-		// If the string is UTF-8, allow a single character UTF-8 sequence.
-		int size;
-		int c = utf8_decode(str.GetChars(), &size);
-		if (c >= 0 && size_t(size) == str.Len())
-		{
-			return new FxConstant(c, basex->ScriptPosition);
-		}
+		return new FxConstant(chr, basex->ScriptPosition);
 	}
 	return nullptr;
 }
diff --git a/src/serializer.cpp b/src/serializer.cpp
index 7fccd7475..e8a84f353 100644
--- a/src/serializer.cpp
+++ b/src/serializer.cpp
@@ -59,114 +59,31 @@
 #include "v_text.h"
 #include "cmdlib.h"
 #include "g_levellocals.h"
+#include "utf8.h"
 
 char nulspace[1024 * 1024 * 4];
 bool save_full = false;	// for testing. Should be removed afterward.
 
-int utf8_encode(int32_t codepoint, char *buffer, int *size)
-{
-	if (codepoint < 0)
-		return -1;
-	else if (codepoint < 0x80)
-	{
-		buffer[0] = (char)codepoint;
-		*size = 1;
-	}
-	else if (codepoint < 0x800)
-	{
-		buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
-		buffer[1] = 0x80 + ((codepoint & 0x03F));
-		*size = 2;
-	}
-	else if (codepoint < 0x10000)
-	{
-		buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
-		buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
-		buffer[2] = 0x80 + ((codepoint & 0x003F));
-		*size = 3;
-	}
-	else if (codepoint <= 0x10FFFF)
-	{
-		buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
-		buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
-		buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
-		buffer[3] = 0x80 + ((codepoint & 0x00003F));
-		*size = 4;
-	}
-	else
-		return -1;
-
-	return 0;
-}
-
-int utf8_decode(const char *src, int *size) 
-{
-	int c = src[0] & 255;
-	int r;
-
-	*size = 1;
-	if ((c & 0x80) == 0)
-	{
-		return c;
-	}
-
-	int c1 = src[1] & 255;
-
-	if ((c & 0xE0) == 0xC0) 
-	{
-		r = ((c & 0x1F) << 6) | c1;
-		if (r >= 128) 
-		{
-			*size = 2;
-			return r;
-		}
-		return -1;
-	}
-
-	int c2 = src[2] & 255;
-
-	if ((c & 0xF0) == 0xE0) 
-	{
-		r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
-		if (r >= 2048 && (r < 55296 || r > 57343)) 
-		{
-			*size = 3;
-			return r;
-		}
-		return -1;
-	}
-	
-	int c3 = src[3] & 255;
-
-	if ((c & 0xF8) == 0xF0) 
-	{
-		r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
-		if (r >= 65536 && r <= 1114111) 
-		{
-			*size = 4;
-			return r;
-		}
-	}
-	return -1;
-}
+//==========================================================================
+//
+// This will double-encode already existing UTF-8 content.
+// The reason for this behavior is to preserve any original data coming through here, no matter what it is.
+// If these are script-based strings, exact preservation in the serializer is very important.
+//
+//==========================================================================
 
 static TArray<char> out;
 static const char *StringToUnicode(const char *cc, int size = -1)
 {
 	int ch;
-	const char *c = cc;
+	const uint8_t *c = (const uint8_t*)cc;
 	int count = 0;
 	int count1 = 0;
 	out.Clear();
 	while ((ch = (*c++) & 255))
 	{
 		count1++;
-		if (ch >= 128)
-		{
-			if (ch < 0x800) count += 2;
-			else count += 3;
-			// The source cannot contain 4-byte chars.
-		}
+		if (ch >= 128) count += 2;
 		else count++;
 		if (count1 == size && size > 0) break;
 	}
@@ -174,11 +91,11 @@ static const char *StringToUnicode(const char *cc, int size = -1)
 	// we need to convert
 	out.Resize(count + 1);
 	out.Last() = 0;
-	c = cc;
+	c = (const uint8_t*)cc;
 	int i = 0;
-	while ((ch = (*c++) & 255))
+	while ((ch = (*c++)))
 	{
-		utf8_encode(ch, &out[i], &count1);
+		utf8_encode(ch, (uint8_t*)&out[i], &count1);
 		i += count1;
 	}
 	return &out[0];
@@ -191,8 +108,8 @@ static const char *UnicodeToString(const char *cc)
 	while (*cc != 0)
 	{
 		int size;
-		int c = utf8_decode(cc, &size);
-		if (c < 0 || c > 255) c = '?';
+		int c = utf8_decode((const uint8_t*)cc, &size);
+		if (c < 0 || c > 255) c = '?';	// This should never happen because all content was encoded with StringToUnicode which only produces code points 0-255.
 		out[ndx++] = c;
 		cc += size;
 	}
diff --git a/src/utility/utf8.cpp b/src/utility/utf8.cpp
new file mode 100644
index 000000000..d6e3e7edf
--- /dev/null
+++ b/src/utility/utf8.cpp
@@ -0,0 +1,249 @@
+/*
+** utf8.cpp
+** UTF-8 utilities
+**
+**---------------------------------------------------------------------------
+** Copyright 2019 Christoph Oelckers
+** All rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without
+** modification, are permitted provided that the following conditions
+** are met:
+**
+** 1. Redistributions of source code must retain the above copyright
+**    notice, this list of conditions and the following disclaimer.
+** 2. Redistributions in binary form must reproduce the above copyright
+**    notice, this list of conditions and the following disclaimer in the
+**    documentation and/or other materials provided with the distribution.
+** 3. The name of the author may not be used to endorse or promote products
+**    derived from this software without specific prior written permission.
+**
+** THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+** OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+** IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+** INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+** NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+** DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+** THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+** (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+** THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**---------------------------------------------------------------------------
+**
+*/
+#include <stdint.h>
+#include "tarray.h"
+
+
+//==========================================================================
+//
+// Encodes one code point as UTF-8 into buffer (1 to 4 bytes, not terminated).
+// Returns 0 and stores the byte count in *size, or returns -1 (buffer and
+// *size untouched) if the value is not an encodable code point.
+//
+//==========================================================================
+
+int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size)
+{
+	// Negative values, values above U+10FFFF and UTF-16 surrogates have no UTF-8 form.
+	// utf8_decode refuses such sequences, so they must not be produced here either.
+	if (codepoint < 0 || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF))
+		return -1;
+	if (codepoint < 0x80)
+	{
+		buffer[0] = (uint8_t)codepoint;
+		*size = 1;
+	}
+	else if (codepoint < 0x800)
+	{
+		buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
+		buffer[1] = 0x80 + ((codepoint & 0x03F));
+		*size = 2;
+	}
+	else if (codepoint < 0x10000)
+	{
+		buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
+		buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
+		buffer[2] = 0x80 + ((codepoint & 0x003F));
+		*size = 3;
+	}
+	else
+	{
+		buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
+		buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
+		buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
+		buffer[3] = 0x80 + ((codepoint & 0x00003F));
+		*size = 4;
+	}
+	return 0;
+}
+
+//==========================================================================
+//
+// Decodes one UTF-8 sequence; returns the code point, its byte length in *size.
+// Malformed, overlong, surrogate or out-of-range input yields -1 and *size = 1.
+//
+//==========================================================================
+
+int utf8_decode(const uint8_t *src, int *size)
+{
+	int c = src[0];
+	int r;
+
+	*size = 1;
+	if ((c & 0x80) == 0)
+	{
+		return c;
+	}
+
+	// A continuation byte is 10xxxxxx and carries 6 payload bits. Validating each one before
+	// reading the next also keeps us from running past the terminator of a truncated string.
+	int c1 = src[1];
+	if ((c1 & 0xC0) != 0x80) return -1;
+	c1 &= 0x3F;
+	if ((c & 0xE0) == 0xC0)
+	{
+		r = ((c & 0x1F) << 6) | c1;
+		if (r < 128) return -1;	// overlong encoding
+		*size = 2;
+		return r;
+	}
+
+	int c2 = src[2];
+	if ((c2 & 0xC0) != 0x80) return -1;
+	c2 &= 0x3F;
+	if ((c & 0xF0) == 0xE0)
+	{
+		r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
+		if (r < 2048 || (r >= 55296 && r <= 57343)) return -1;	// overlong encoding or UTF-16 surrogate
+		*size = 3;
+		return r;
+	}
+
+	int c3 = src[3];
+	if ((c3 & 0xC0) != 0x80) return -1;
+	c3 &= 0x3F;
+	if ((c & 0xF8) == 0xF0)
+	{
+		r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+		if (r >= 65536 && r <= 1114111)
+		{
+			*size = 4;
+			return r;
+		}
+	}
+	return -1;
+}
+
+//==========================================================================
+//
+// Unicode mapping for the 0x80-0x9f range of the Windows 1252 code page
+//
+//==========================================================================
+
+uint16_t win1252map[] = {	// Indexed by (byte - 0x80); unassigned Windows-1252 slots map to themselves.
+	0x20AC,	// 0x80: euro sign
+	0x81  ,	// 0x81: unassigned
+	0x201A,	// 0x82: single low-9 quotation mark
+	0x0192,	// 0x83: latin small letter f with hook
+	0x201E,	// 0x84: double low-9 quotation mark
+	0x2026,	// 0x85: horizontal ellipsis
+	0x2020,	// 0x86: dagger
+	0x2021,	// 0x87: double dagger
+	0x02C6,	// 0x88: modifier letter circumflex accent
+	0x2030,	// 0x89: per mille sign
+	0x0160,	// 0x8a: latin capital letter S with caron
+	0x2039,	// 0x8b: single left-pointing angle quotation mark
+	0x0152,	// 0x8c: latin capital ligature OE
+	0x8d  ,	// 0x8d: unassigned
+	0x017D,	// 0x8e: latin capital letter Z with caron
+	0x8f  ,	// 0x8f: unassigned
+	0x90  ,	// 0x90: unassigned
+	0x2018,	// 0x91: left single quotation mark
+	0x2019,	// 0x92: right single quotation mark
+	0x201C,	// 0x93: left double quotation mark
+	0x201D,	// 0x94: right double quotation mark
+	0x2022,	// 0x95: bullet
+	0x2013,	// 0x96: en dash
+	0x2014,	// 0x97: em dash
+	0x02DC,	// 0x98: small tilde
+	0x2122,	// 0x99: trade mark sign
+	0x0161,	// 0x9a: latin small letter s with caron
+	0x203A,	// 0x9b: single right-pointing angle quotation mark
+	0x0153,	// 0x9c: latin small ligature oe
+	0x9d  ,	// 0x9d: unassigned
+	0x017E,	// 0x9e: latin small letter z with caron
+	0x0178,	// 0x9f: latin capital letter Y with diaeresis
+};
+
+//==========================================================================
+//
+// reads one character from the string.
+// This can handle both ISO 8859-1/Windows-1252 and UTF-8, as well as mixed strings
+// between both encodings, which may happen if inconsistent encoding is 
+// used between different files in a mod.
+//
+//==========================================================================
+
+int GetCharFromString(const uint8_t *&string)
+{
+	const int first = *string;
+
+	if (first >= 192)
+	{
+		// Possible lead byte of a multi-byte UTF-8 sequence:
+		// consume the whole sequence if it decodes cleanly.
+		int length = 0;
+		const int decoded = utf8_decode(string, &length);
+		if (decoded >= 0)
+		{
+			string += length;
+			return decoded;
+		}
+		// Not decodable as UTF-8, so hand the byte back
+		// unchanged and advance past it.
+		string++;
+		return first;
+	}
+
+	// Everything below 192 is consumed as a single byte.
+	string++;
+	if (first >= 128 && first < 160)
+	{
+		// Handle Windows 1252 characters
+		return win1252map[first - 128];
+	}
+	return first;
+}
+
+//==========================================================================
+//
+// convert a potentially mixed-encoded string to pure UTF-8
+// this returns a pointer to a static buffer, 
+// assuming that its caller will immediately process the result. 
+//
+//==========================================================================
+
+static TArray<char> UTF8String;
+
+const char *MakeUTF8(const char *outline, int *numchars = nullptr)
+{
+	UTF8String.Clear();	// The static buffer is reused, so any previously returned pointer becomes stale here.
+	const uint8_t *in = (const uint8_t*)outline;
+
+	if (numchars) *numchars = 0;
+	while (int chr = GetCharFromString(in))	// Stops at the terminating 0.
+	{
+		int size = 0;
+		uint8_t encode[4];
+		if (!utf8_encode(chr, encode, &size))
+		{
+			for (int i = 0; i < size; i++)
+			{
+				UTF8String.Push(encode[i]);
+			}
+		}
+		if (numchars) (*numchars)++;	// Parenthesized: '*numchars++' would advance the pointer and leave the count at 0.
+	}
+	UTF8String.Push(0);
+	return UTF8String.Data();
+}
diff --git a/src/utility/utf8.h b/src/utility/utf8.h
new file mode 100644
index 000000000..60531b12f
--- /dev/null
+++ b/src/utility/utf8.h
@@ -0,0 +1,8 @@
+#pragma once
+
+int utf8_encode(int32_t codepoint, uint8_t *buffer, int *size);
+int utf8_decode(const uint8_t *src, int *size);
+int GetCharFromString(const uint8_t *&string);
+const char *MakeUTF8(const char *outline, int *numchars = nullptr);	// returns a pointer to a static buffer, assuming that its caller will immediately process the result. 
+
+extern uint16_t win1252map[];
diff --git a/src/v_font.cpp b/src/v_font.cpp
index 0faa55a8c..b1ce4812a 100644
--- a/src/v_font.cpp
+++ b/src/v_font.cpp
@@ -91,6 +91,7 @@ The FON2 header is followed by variable length data:
 #include "v_text.h"
 #include "vm.h"
 #include "image.h"
+#include "utf8.h"
 #include "textures/formats/fontchars.h"
 
 // MACROS ------------------------------------------------------------------
@@ -1844,7 +1845,10 @@ void FSingleLumpFont::LoadFON1 (int lump, const uint8_t *data)
 {
 	int w, h;
 
-	Chars.Resize(256);
+	// The default console font is for Windows-1252 and fills the 0x80-0x9f range with valid glyphs.
+	// Since now all internal text is processed as Unicode, these have to be remapped to their proper places.
+	// The highest valid character in this range is 0x2122, so we need 0x2123 entries in our character table.
+	Chars.Resize(0x2123);
 
 	w = data[4] + data[5]*256;
 	h = data[6] + data[7]*256;
@@ -1853,10 +1857,20 @@ void FSingleLumpFont::LoadFON1 (int lump, const uint8_t *data)
 	FontHeight = h;
 	SpaceWidth = w;
 	FirstChar = 0;
-	LastChar = 255;
+	LastChar = 255;	// This is to allow LoadTranslations to function. The way this is all set up really needs to be changed.
 	GlobalKerning = 0;
 	translateUntranslated = true;
 	LoadTranslations();
+	LastChar = 0x2122;
+
+	// Move the Windows-1252 characters to their proper place.
+	for (int i = 0x80; i < 0xa0; i++)
+	{
+		if (win1252map[i-0x80] != i && Chars[i].TranslatedPic != nullptr && Chars[win1252map[i - 0x80]].TranslatedPic == nullptr)
+		{ 
+			std::swap(Chars[i], Chars[win1252map[i - 0x80]]);
+		}
+	}
 }
 
 //==========================================================================
diff --git a/src/v_text.cpp b/src/v_text.cpp
index 313abb275..e7634fa5e 100644
--- a/src/v_text.cpp
+++ b/src/v_text.cpp
@@ -49,119 +49,6 @@
 
 int ListGetInt(VMVa_List &tags);
 
-//==========================================================================
-//
-// reads one character from the string.
-// This can handle both ISO 8859-1 and UTF-8, as well as mixed strings
-// between both encodings, which may happen if inconsistent encoding is 
-// used between different files in a mod.
-// The long term goal should be to convert all text to UTF-8 on loading and
-// make this require pure UTF-8 input.
-//
-//==========================================================================
-
-int GetCharFromString(const uint8_t *&string)
-{
-	int z, y, x;
-
-	z = *string++;
-
-	if (z < 192)
-	{
-		// Handle Windows 1252 characters
-		if (z >= 128 && z < 160)
-		{
-			static const uint16_t map0x80_0x9f[] = {
-				0x20AC,
-				0x81  ,
-				0x201A,
-				0x0192,
-				0x201E,
-				0x2026,
-				0x2020,
-				0x2021,
-				0x02C6,
-				0x2030,
-				0x0160,
-				0x2039,
-				0x0152,
-				0x8d  ,
-				0x017D,
-				0x8f  ,
-				0x90  ,
-				0x2018,
-				0x2019,
-				0x201C,
-				0x201D,
-				0x2022,
-				0x2013,
-				0x2014,
-				0x02DC,
-				0x2122,
-				0x0161,
-				0x203A,
-				0x0153,
-				0x9d  ,
-				0x017E,
-				0x0178,
-			};
-			return map0x80_0x9f[z - 128];
-		}
-		return z;
-	}
-	else if (z <= 223)
-	{
-		y = *string++;
-		if (y < 128 || y >= 192)
-		{
-			// not an UTF-8 sequence so return the first byte unchanged
-			string--;
-		}
-		else
-		{
-			z = (z - 192) * 64 + (y - 128);
-		}
-	}
-	else if (z >= 224 && z <= 239)
-	{
-		y = *string++;
-		if (y < 128 || y >= 192)
-		{
-			// not an UTF-8 sequence so return the first byte unchanged
-			string--;
-		}
-		else
-		{
-			x = *string++;
-			if (x < 128 || x >= 192)
-			{
-				// not an UTF-8 sequence so return the first byte unchanged
-				string -= 2;
-			}
-			else
-			{
-				z = (z - 224) * 4096 + (y - 128) * 64 + (x - 128);
-			}
-		}
-	}
-	else if (z >= 240)
-	{
-		y = *string++;
-		if (y < 128 || y >= 192)
-		{
-			// not an UTF-8 sequence so return the first byte unchanged
-			string--;
-		}
-		else
-		{
-			// we do not support 4-Byte UTF-8 here
-			string += 2;
-			return '?';
-		}
-	}
-	return z;
-}
-
 //==========================================================================
 //
 // DrawChar