gtkradiant/libs/convert.h

/*
Copyright (C) 2001-2006, William Joseph.
All Rights Reserved.

This file is part of GtkRadiant.

GtkRadiant is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GtkRadiant is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GtkRadiant; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/

#if !defined(INCLUDED_CONVERT_H)
#define INCLUDED_CONVERT_H

/// \file
/// \brief Character encoding conversion.

#include "debugging/debugging.h"
#include <algorithm>
#include <glib/gunicode.h>
#include <glib/gconvert.h>

#include "character.h"

/// \brief Returns the byte length of the multi-byte UTF-8 sequence introduced by the lead byte at \p character.
///
/// Only the first byte is examined. Lead bytes announcing sequences of 2 to 6 bytes are
/// recognised; any other byte is reported through ERROR_MESSAGE and yields 0.
inline std::size_t utf8_character_length(const char* character)
{
  // Each row is { mask selecting the prefix bits, expected prefix, sequence length }.
  static const int leadBytePatterns[5][3] = {
    { 0xE0, 0xC0, 2 }, // 110xxxxx
    { 0xF0, 0xE0, 3 }, // 1110xxxx
    { 0xF8, 0xF0, 4 }, // 11110xxx
    { 0xFC, 0xF8, 5 }, // 111110xx
    { 0xFE, 0xFC, 6 }, // 1111110x
  };

  const int lead = *character;
  for(std::size_t row = 0; row != 5; ++row)
  {
    if((lead & leadBytePatterns[row][0]) == leadBytePatterns[row][1])
    {
      return static_cast<std::size_t>(leadBytePatterns[row][2]);
    }
  }

  // Not a recognised multi-byte lead byte.
  ERROR_MESSAGE("");
  return 0;
}

/// \brief A non-owning reference to the bytes of a single UTF-8 encoded character.
///
/// \c buffer points at the first byte and \c length holds the byte count; the bytes are not copied.
struct UTF8Character
{
  const char* buffer;
  std::size_t length;

  /// \brief Constructs an empty character: null \c buffer and zero \c length.
  UTF8Character()
    : buffer(0),
      length(0)
  {
  }

  /// \brief Refers to the character starting at \p bytes.
  /// The length is derived from the first byte by utf8_character_length().
  UTF8Character(const char* bytes)
    : buffer(bytes),
      length(utf8_character_length(bytes))
  {
  }
};

/// \brief Orders two characters by lexicographical comparison of their byte sequences.
inline bool operator<(const UTF8Character& self, const UTF8Character& other)
{
  const char* const selfEnd = self.buffer + self.length;
  const char* const otherEnd = other.buffer + other.length;
  return std::lexicographical_compare(self.buffer, selfEnd, other.buffer, otherEnd);
}

/// \brief Writes each byte of \p c to \p ostream as a HexChar, in order. Useful for debugging.
/// Returns \p ostream.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)
{
  for(std::size_t i = 0; i != c.length; ++i)
  {
    ostream << HexChar(c.buffer[i]);
  }
  return ostream;
}


/// \brief The character-set encoding for the current C locale.
///
/// The charset name is obtained once, at construction, from g_get_charset(). When that call
/// returns a value other than FALSE the stored name is reset to null; isUTF8() tests for
/// exactly that null state.
///
/// Obtain the global instance with globalCharacterSet().
class CharacterSet
{
  const char* m_charSet;
public:
  CharacterSet()
  {
    const bool reportedUTF8 = g_get_charset(&m_charSet) != FALSE;
    if(reportedUTF8)
    {
      // A null name is the marker for "no conversion needed".
      m_charSet = 0;
    }
  }

  /// \brief Returns true if no charset name is held.
  bool isUTF8() const
  {
    return 0 == m_charSet;
  }

  /// \brief Returns the stored charset name; null when isUTF8() is true.
  const char* get() const
  {
    return m_charSet;
  }
};

/// Holder type through which the shared CharacterSet instance is obtained.
typedef LazyStatic<CharacterSet> GlobalCharacterSet;

/// \brief Returns the global instance of CharacterSet.
inline CharacterSet& globalCharacterSet()
{
  CharacterSet& shared = GlobalCharacterSet::instance();
  return shared;
}


/// \brief Pairs a UTF-8 character with the single extended-ascii byte it maps to.
class UTF8CharacterToExtendedASCII
{
public:
  UTF8Character m_utf8;
  char m_c;

  /// \brief Constructs an entry with a default-constructed UTF-8 character and a NUL byte.
  UTF8CharacterToExtendedASCII()
    : m_utf8(),
      m_c('\0')
  {
  }

  /// \brief Constructs an entry associating \p utf8 with the byte \p c.
  UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c)
    : m_utf8(utf8),
      m_c(c)
  {
  }
};

/// \brief Orders two mapping entries by their UTF-8 characters only; the extended-ascii byte is ignored.
inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
{
  const UTF8Character& lhs = self.m_utf8;
  const UTF8Character& rhs = other.m_utf8;
  return lhs < rhs;
}

/// \brief Returns the low seven bits of \p c as an index in the range [0, 127].
inline std::size_t extended_ascii_to_index(char c)
{
  const int lowSevenBits = c & 0x7F; // drop the high bit
  return static_cast<std::size_t>(lowSevenBits);
}

/// \brief Returns \p i with bit 0x80 set, converted to a char.
/// This is the inverse of extended_ascii_to_index() for indices in [0, 127].
inline char extended_ascii_for_index(std::size_t i)
{
  const std::size_t withHighBit = i | 0x80;
  return static_cast<char>(withHighBit);
}

/// \brief The active extended-ascii character set encoding.
/// Performs UTF-8 encoding and decoding of extended-ascii characters.
///
/// Obtain the global instance with globalExtendedASCIICharacterSet().
class ExtendedASCIICharacterSet
{
  typedef char UTF8CharBuffer[6];
  UTF8CharBuffer m_converted[128];
  UTF8Character m_decodeMap[128];
  UTF8CharacterToExtendedASCII m_encodeMap[128];
public:
  ExtendedASCIICharacterSet()
  {
    if(!globalCharacterSet().isUTF8())
    {
      GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
      for(std::size_t i = 1; i < 128; ++i)
      {
        char c = extended_ascii_for_index(i);
        char* inbuf = &c;
        std::size_t inbytesleft = 1;
        char* outbuf = m_converted[i];
        std::size_t outbytesleft = 6;
        if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))
        {
          UTF8Character utf8(m_converted[i]);
          m_decodeMap[i] = utf8;
          m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
        }
      }
      g_iconv_close(descriptor);
      std::sort(m_encodeMap, m_encodeMap + 128);
    }
  }
  /// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
  /// Useful for debugging.
  void print() const
  {
    globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
    for(std::size_t i = 1; i < 128; ++i)
    {
      if(m_decodeMap[i].buffer != 0)
      {
        globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
      }
    }
  }
  /// \brief Returns \p c decoded from extended-ascii to UTF-8.
  /// \p c must be an extended-ascii character.
  const UTF8Character& decode(char c) const
  {
    ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
    ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
    ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
    return m_decodeMap[extended_ascii_to_index(c)];
  }
  /// \brief Returns \p c encoded to extended-ascii from UTF-8.
  /// \p c must map to an extended-ascii character.
  char encode(const UTF8Character& c) const
  {
    ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
    ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
    std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
      = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
    ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
    return (*range.first).m_c;
  }
};

/// Holder type through which the shared ExtendedASCIICharacterSet instance is obtained.
typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;

/// \brief Returns the global instance of ExtendedASCIICharacterSet.
inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()
{
  ExtendedASCIICharacterSet& shared = GlobalExtendedASCIICharacterSet::instance();
  return shared;
}

/// \brief Marks a range of characters for conversion when it is written to a stream.
///
/// Holds only the range; the conversion is performed by the ostream_write() overload
/// taking a ConvertUTF8ToLocale.
class ConvertUTF8ToLocale
{
public:
  StringRange m_range;

  /// \brief Wraps the null-terminated \p string; the range excludes the terminator.
  ConvertUTF8ToLocale(const char* string)
    : m_range(StringRange(string, string + strlen(string)))
  {
  }

  /// \brief Wraps a copy of \p range.
  ConvertUTF8ToLocale(const StringRange& range)
    : m_range(range)
  {
  }
};

/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
///
/// When the locale is UTF-8 the range is written unchanged. Otherwise ascii bytes are copied
/// through and each multi-byte UTF-8 character is replaced by the byte returned from
/// ExtendedASCIICharacterSet::encode().
///
/// Malformed input cannot stall or overrun the loop: a byte that is not a valid lead byte
/// (length 0) is skipped, and a sequence whose declared length extends past the end of the
/// range stops the conversion.
/// Returns \p ostream.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
{
  if(globalCharacterSet().isUTF8())
  {
    return ostream << convert.m_range;
  }

  for(const char* p = convert.m_range.begin; p != convert.m_range.end;)
  {
    if(char_is_ascii(*p))
    {
      ostream << *p++;
      continue;
    }

    UTF8Character c(p);
    if(c.length == 0)
    {
      // Not a valid lead byte: advancing by the reported length would never make progress.
      ++p;
      continue;
    }

    const std::size_t remaining = static_cast<std::size_t>(convert.m_range.end - p);
    if(c.length > remaining)
    {
      // Truncated sequence: encoding it would read past the end, and stepping over it
      // would move p beyond the end so the loop condition could never become false.
      break;
    }

    ostream << globalExtendedASCIICharacterSet().encode(c);
    p += c.length;
  }
  return ostream;
}


/// \brief Marks a range of characters for conversion when it is written to a stream.
///
/// Holds only the range; the conversion is performed by the ostream_write() overload
/// taking a ConvertLocaleToUTF8.
class ConvertLocaleToUTF8
{
public:
  StringRange m_range;

  /// \brief Wraps the null-terminated \p string; the range excludes the terminator.
  ConvertLocaleToUTF8(const char* string)
    : m_range(StringRange(string, string + strlen(string)))
  {
  }

  /// \brief Wraps a copy of \p range.
  ConvertLocaleToUTF8(const StringRange& range)
    : m_range(range)
  {
  }
};

/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
{
  if(globalCharacterSet().isUTF8())
  {
    return ostream << convert.m_range;
  }

  for(const char* p = convert.m_range.begin; p != convert.m_range.end; ++p)
  {
    if(!char_is_ascii(*p))
    {
      UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
      ostream.write(c.buffer, c.length);
    }
    else
    {
      ostream << *p;
    }
  }
  return ostream; 
}


#endif
ok git-svn-id: svn://svn.icculus.org/gtkradiant/GtkRadiant@1 8a3a26a2-13c4-0310-b231-cf6edde360e5 2006-02-10 22:01:20 +00:00			`/*`
			`Copyright (C) 2001-2006, William Joseph.`
			`All Rights Reserved.`

			`This file is part of GtkRadiant.`

			`GtkRadiant is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`GtkRadiant is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with GtkRadiant; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#if !defined(INCLUDED_CONVERT_H)`
			`#define INCLUDED_CONVERT_H`

			`/// \file`
			`/// \brief Character encoding conversion.`

			`#include "debugging/debugging.h"`
			`#include <algorithm>`
			`#include <glib/gunicode.h>`
			`#include <glib/gconvert.h>`

			`#include "character.h"`

			`/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.`
			`inline std::size_t utf8_character_length(const char* character)`
			`{`
			`if((*character & 0xE0) == 0xC0) // 110xxxxx`
			`{`
			`return 2;`
			`}`
			`else if((*character & 0xF0) == 0xE0) // 1110xxxx`
			`{`
			`return 3;`
			`}`
			`else if((*character & 0xF8) == 0xF0) // 11110xxx`
			`{`
			`return 4;`
			`}`
			`else if((*character & 0xFC) == 0xF8) // 111110xx`
			`{`
			`return 5;`
			`}`
			`else if((*character & 0xFE) == 0xFC) // 1111110x`
			`{`
			`return 6;`
			`}`
			`ERROR_MESSAGE("");`
			`return 0;`
			`}`

			`struct UTF8Character`
			`{`
			`const char* buffer;`
			`std::size_t length;`
			`UTF8Character() : buffer(0), length(0)`
			`{`
			`}`
			`UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes))`
			`{`
			`}`
			`};`

			`inline bool operator<(const UTF8Character& self, const UTF8Character& other)`
			`{`
			`return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length);`
			`}`

			`/// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c)`
			`{`
			`for(const char* p = c.buffer; p != c.buffer + c.length; ++p)`
			`{`
			`ostream << HexChar(*p);`
			`}`
			`return ostream;`
			`}`



			`/// \brief The character-set encoding for the current C locale.`
			`///`
			`/// Obtain the global instance with globalCharacterSet().`
			`class CharacterSet`
			`{`
			`const char* m_charSet;`
			`public:`
			`CharacterSet()`
			`{`
			`if(g_get_charset(&m_charSet) != FALSE)`
			`{`
			`m_charSet = 0;`
			`}`
			`}`
			`bool isUTF8() const`
			`{`
			`return m_charSet == 0;`
			`}`
			`const char* get() const`
			`{`
			`return m_charSet;`
			`}`
			`};`

			`typedef LazyStatic<CharacterSet> GlobalCharacterSet;`

			`/// \brief Returns the global instance of CharacterSet.`
			`inline CharacterSet& globalCharacterSet()`
			`{`
			`return GlobalCharacterSet::instance();`
			`}`


			`class UTF8CharacterToExtendedASCII`
			`{`
			`public:`
			`UTF8Character m_utf8;`
			`char m_c;`
			`UTF8CharacterToExtendedASCII() : m_c('\0')`
			`{`
			`}`
			`UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c)`
			`{`
			`}`
			`};`

			`inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)`
			`{`
			`return self.m_utf8 < other.m_utf8;`
			`}`

			`inline std::size_t extended_ascii_to_index(char c)`
			`{`
			`return static_cast<std::size_t>(c & 0x7F);`
			`}`

			`inline char extended_ascii_for_index(std::size_t i)`
			`{`
			`return static_cast<char>(i \| 0x80);`
			`}`

			`/// \brief The active extended-ascii character set encoding.`
			`/// Performs UTF-8 encoding and decoding of extended-ascii characters.`
			`///`
			`/// Obtain the global instance with globalExtendedASCIICharacterSet().`
			`class ExtendedASCIICharacterSet`
			`{`
			`typedef char UTF8CharBuffer[6];`
			`UTF8CharBuffer m_converted[128];`
			`UTF8Character m_decodeMap[128];`
			`UTF8CharacterToExtendedASCII m_encodeMap[128];`
			`public:`
			`ExtendedASCIICharacterSet()`
			`{`
			`if(!globalCharacterSet().isUTF8())`
			`{`
			`GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());`
			`for(std::size_t i = 1; i < 128; ++i)`
			`{`
			`char c = extended_ascii_for_index(i);`
			`char* inbuf = &c;`
			`std::size_t inbytesleft = 1;`
			`char* outbuf = m_converted[i];`
			`std::size_t outbytesleft = 6;`
			`if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1))`
			`{`
			`UTF8Character utf8(m_converted[i]);`
			`m_decodeMap[i] = utf8;`
			`m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);`
			`}`
			`}`
			`g_iconv_close(descriptor);`
			`std::sort(m_encodeMap, m_encodeMap + 128);`
			`}`
			`}`
			`/// \brief Prints the (up to) 128 characters in the current extended-ascii character set.`
			`/// Useful for debugging.`
			`void print() const`
			`{`
			`globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";`
			`for(std::size_t i = 1; i < 128; ++i)`
			`{`
			`if(m_decodeMap[i].buffer != 0)`
			`{`
			`globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";`
			`}`
			`}`
			`}`
			`/// \brief Returns \p c decoded from extended-ascii to UTF-8.`
			`/// \p c must be an extended-ascii character.`
			`const UTF8Character& decode(char c) const`
			`{`
			`ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");`
			`ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");`
			`ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));`
			`return m_decodeMap[extended_ascii_to_index(c)];`
			`}`
			`/// \brief Returns \p c encoded to extended-ascii from UTF-8.`
			`/// \p c must map to an extended-ascii character.`
			`char encode(const UTF8Character& c) const`
			`{`
			`ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");`
			`ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");`
			`std::pair<const UTF8CharacterToExtendedASCII, const UTF8CharacterToExtendedASCII> range`
			`= std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));`
			`ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);`
			`return (*range.first).m_c;`
			`}`
			`};`

			`typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;`

			`/// \brief Returns the global instance of ExtendedASCIICharacterSet.`
			`inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet()`
			`{`
			`return GlobalExtendedASCIICharacterSet::instance();`
			`}`

			`class ConvertUTF8ToLocale`
			`{`
			`public:`
			`StringRange m_range;`
			`ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string)))`
			`{`
			`}`
			`ConvertUTF8ToLocale(const StringRange& range) : m_range(range)`
			`{`
			`}`
			`};`

			`/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)`
			`{`
			`if(globalCharacterSet().isUTF8())`
			`{`
			`return ostream << convert.m_range;`
			`}`

			`for(const char* p = convert.m_range.begin; p != convert.m_range.end;)`
			`{`
			`if(!char_is_ascii(*p))`
			`{`
			`UTF8Character c(p);`
			`ostream << globalExtendedASCIICharacterSet().encode(c);`
			`p += c.length;`
			`}`
			`else`
			`{`
			`ostream << *p++;`
			`}`
			`}`
			`return ostream;`
			`}`


			`class ConvertLocaleToUTF8`
			`{`
			`public:`
			`StringRange m_range;`
			`ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string)))`
			`{`
			`}`
			`ConvertLocaleToUTF8(const StringRange& range) : m_range(range)`
			`{`
			`}`
			`};`

			`/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)`
			`{`
			`if(globalCharacterSet().isUTF8())`
			`{`
			`return ostream << convert.m_range;`
			`}`

			`for(const char* p = convert.m_range.begin; p != convert.m_range.end; ++p)`
			`{`
			`if(!char_is_ascii(*p))`
			`{`
			`UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));`
			`ostream.write(c.buffer, c.length);`
			`}`
			`else`
			`{`
			`ostream << *p;`
			`}`
			`}`
			`return ostream;`
			`}`


			`#endif`