gtkradiant/libs/convert.h

306 lines
8.1 KiB
C++

/*
Copyright (C) 2001-2006, William Joseph.
All Rights Reserved.
This file is part of GtkRadiant.
GtkRadiant is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
GtkRadiant is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GtkRadiant; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#if !defined(INCLUDED_CONVERT_H)
#define INCLUDED_CONVERT_H
/// \file
/// \brief Character encoding conversion.
#include "debugging/debugging.h"
#include <algorithm>
#include <glib/gunicode.h>
#include <glib/gconvert.h>
#include "character.h"
/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
inline std::size_t utf8_character_length(const char* character)
{
if((*character & 0xE0) == 0xC0) // 110xxxxx
{
return 2;
}
else if((*character & 0xF0) == 0xE0) // 1110xxxx
{
return 3;
}
else if((*character & 0xF8) == 0xF0) // 11110xxx
{
return 4;
}
else if((*character & 0xFC) == 0xF8) // 111110xx
{
return 5;
}
else if((*character & 0xFE) == 0xFC) // 1111110x
{
return 6;
}
ERROR_MESSAGE("");
return 0;
}
struct UTF8Character
{
const char* buffer;
std::size_t length;
UTF8Character() : buffer(0), length(0)
{
}
UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))
{
}
};
inline bool operator<(const UTF8Character& self, const UTF8Character& other)
{
return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);
}
/// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
{
for(const char* p = c.buffer; p != c.buffer + c.length; ++p)
{
ostream << HexChar(*p);
}
return ostream;
}
/// \brief The character-set encoding for the current C locale.
///
/// Obtain the global instance with globalCharacterSet().
class CharacterSet
{
const char* m_charSet;
public:
CharacterSet()
{
if(g_get_charset(&m_charSet) != FALSE)
{
m_charSet = 0;
}
}
bool isUTF8() const
{
return m_charSet == 0;
}
const char* get() const
{
return m_charSet;
}
};
typedef LazyStatic<CharacterSet> GlobalCharacterSet;
/// \brief Returns the global instance of CharacterSet.
inline CharacterSet& globalCharacterSet()
{
return GlobalCharacterSet::instance();
}
class UTF8CharacterToExtendedASCII
{
public:
UTF8Character m_utf8;
char m_c;
UTF8CharacterToExtendedASCII() : m_c('\0')
{
}
UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)
{
}
};
inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
{
return self.m_utf8 < other.m_utf8;
}
inline std::size_t extended_ascii_to_index(char c)
{
return static_cast<std::size_t>(c & 0x7F);
}
inline char extended_ascii_for_index(std::size_t i)
{
return static_cast<char>(i | 0x80);
}
/// \brief The active extended-ascii character set encoding.
/// Performs UTF-8 encoding and decoding of extended-ascii characters.
///
/// Obtain the global instance with globalExtendedASCIICharacterSet().
class ExtendedASCIICharacterSet
{
typedef char UTF8CharBuffer[6];
UTF8CharBuffer m_converted[128];
UTF8Character m_decodeMap[128];
UTF8CharacterToExtendedASCII m_encodeMap[128];
public:
ExtendedASCIICharacterSet()
{
if(!globalCharacterSet().isUTF8())
{
GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
for(std::size_t i = 1; i < 128; ++i)
{
char c = extended_ascii_for_index(i);
char* inbuf = &c;
std::size_t inbytesleft = 1;
char* outbuf = m_converted[i];
std::size_t outbytesleft = 6;
if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
{
UTF8Character utf8(m_converted[i]);
m_decodeMap[i] = utf8;
m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
}
}
g_iconv_close(descriptor);
std::sort(m_encodeMap, m_encodeMap + 128);
}
}
/// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
/// Useful for debugging.
void print() const
{
globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
for(std::size_t i = 1; i < 128; ++i)
{
if(m_decodeMap[i].buffer != 0)
{
globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
}
}
}
/// \brief Returns \p c decoded from extended-ascii to UTF-8.
/// \p c must be an extended-ascii character.
const UTF8Character& decode(char c) const
{
ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
return m_decodeMap[extended_ascii_to_index(c)];
}
/// \brief Returns \p c encoded to extended-ascii from UTF-8.
/// \p c must map to an extended-ascii character.
char encode(const UTF8Character& c) const
{
ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
= std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
return (*range.first).m_c;
}
};
typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
/// \brief Returns the global instance of ExtendedASCIICharacterSet.
inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
{
return GlobalExtendedASCIICharacterSet::instance();
}
class ConvertUTF8ToLocale
{
public:
StringRange m_range;
ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))
{
}
ConvertUTF8ToLocale(const StringRange& range) : m_range(range)
{
}
};
/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
{
if(globalCharacterSet().isUTF8())
{
return ostream << convert.m_range;
}
for(const char* p = convert.m_range.first; p != convert.m_range.last;)
{
if(!char_is_ascii(*p))
{
UTF8Character c(p);
ostream << globalExtendedASCIICharacterSet().encode(c);
p += c.length;
}
else
{
ostream << *p++;
}
}
return ostream;
}
class ConvertLocaleToUTF8
{
public:
StringRange m_range;
ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))
{
}
ConvertLocaleToUTF8(const StringRange& range) : m_range(range)
{
}
};
/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
{
if(globalCharacterSet().isUTF8())
{
return ostream << convert.m_range;
}
for(const char* p = convert.m_range.first; p != convert.m_range.last; ++p)
{
if(!char_is_ascii(*p))
{
UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
ostream.write(c.buffer, c.length);
}
else
{
ostream << *p;
}
}
return ostream;
}
#endif