#if !defined(INCLUDED_CONVERT_H) #define INCLUDED_CONVERT_H /// \file /// \brief Character encoding conversion. #include "debugging/debugging.h" #include #include #include #include "character.h" /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding. inline std::size_t utf8_character_length(const char* character) { if((*character & 0xE0) == 0xC0) // 110xxxxx { return 2; } else if((*character & 0xF0) == 0xE0) // 1110xxxx { return 3; } else if((*character & 0xF8) == 0xF0) // 11110xxx { return 4; } else if((*character & 0xFC) == 0xF8) // 111110xx { return 5; } else if((*character & 0xFE) == 0xFC) // 1111110x { return 6; } ERROR_MESSAGE(""); return 0; } struct UTF8Character { const char* buffer; std::size_t length; UTF8Character() : buffer(0), length(0) { } UTF8Character(const char* bytes) : buffer(bytes), length(utf8_character_length(bytes)) { } }; inline bool operator<(const UTF8Character& self, const UTF8Character& other) { return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length); } /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging. template inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const UTF8Character& c) { for(const char* p = c.buffer; p != c.buffer + c.length; ++p) { ostream << HexChar(*p); } return ostream; } /// \brief The character-set encoding for the current C locale. /// /// Obtain the global instance with globalCharacterSet(). class CharacterSet { const char* m_charSet; public: CharacterSet() { if(g_get_charset(&m_charSet) != FALSE) { m_charSet = 0; } } bool isUTF8() const { return m_charSet == 0; } const char* get() const { return m_charSet; } }; typedef LazyStatic GlobalCharacterSet; /// \brief Returns the global instance of CharacterSet. inline CharacterSet& globalCharacterSet() { return GlobalCharacterSet::instance(); } class UTF8CharacterToExtendedASCII { public: UTF8Character m_utf8; char m_c; UTF8CharacterToExtendedASCII() : m_c('\0') { } UTF8CharacterToExtendedASCII(const UTF8Character& utf8, char c) : m_utf8(utf8), m_c(c) { } }; inline bool operator<(const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other) { return self.m_utf8 < other.m_utf8; } inline std::size_t extended_ascii_to_index(char c) { return static_cast(c & 0x7F); } inline char extended_ascii_for_index(std::size_t i) { return static_cast(i | 0x80); } /// \brief The active extended-ascii character set encoding. /// Performs UTF-8 encoding and decoding of extended-ascii characters. /// /// Obtain the global instance with globalExtendedASCIICharacterSet(). class ExtendedASCIICharacterSet { typedef char UTF8CharBuffer[6]; UTF8CharBuffer m_converted[128]; UTF8Character m_decodeMap[128]; UTF8CharacterToExtendedASCII m_encodeMap[128]; public: ExtendedASCIICharacterSet() { if(!globalCharacterSet().isUTF8()) { GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get()); for(std::size_t i = 1; i < 128; ++i) { char c = extended_ascii_for_index(i); char* inbuf = &c; std::size_t inbytesleft = 1; char* outbuf = m_converted[i]; std::size_t outbytesleft = 6; if(g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)(-1)) { UTF8Character utf8(m_converted[i]); m_decodeMap[i] = utf8; m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c); } } g_iconv_close(descriptor); std::sort(m_encodeMap, m_encodeMap + 128); } } /// \brief Prints the (up to) 128 characters in the current extended-ascii character set. /// Useful for debugging. void print() const { globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n"; for(std::size_t i = 1; i < 128; ++i) { if(m_decodeMap[i].buffer != 0) { globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n"; } } } /// \brief Returns \p c decoded from extended-ascii to UTF-8. /// \p c must be an extended-ascii character. const UTF8Character& decode(char c) const { ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required"); ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character"); ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c)); return m_decodeMap[extended_ascii_to_index(c)]; } /// \brief Returns \p c encoded to extended-ascii from UTF-8. /// \p c must map to an extended-ascii character. char encode(const UTF8Character& c) const { ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required"); ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character"); std::pair range = std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0)); ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c); return (*range.first).m_c; } }; typedef LazyStatic GlobalExtendedASCIICharacterSet; /// \brief Returns the global instance of ExtendedASCIICharacterSet. inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet() { return GlobalExtendedASCIICharacterSet::instance(); } class ConvertUTF8ToLocale { public: StringRange m_range; ConvertUTF8ToLocale(const char* string) : m_range(StringRange(string, string + strlen(string))) { } ConvertUTF8ToLocale(const StringRange& range) : m_range(range) { } }; /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8. template inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert) { if(globalCharacterSet().isUTF8()) { return ostream << convert.m_range; } for(const char* p = convert.m_range.begin; p != convert.m_range.end;) { if(!char_is_ascii(*p)) { UTF8Character c(p); ostream << globalExtendedASCIICharacterSet().encode(c); p += c.length; } else { ostream << *p++; } } return ostream; } class ConvertLocaleToUTF8 { public: StringRange m_range; ConvertLocaleToUTF8(const char* string) : m_range(StringRange(string, string + strlen(string))) { } ConvertLocaleToUTF8(const StringRange& range) : m_range(range) { } }; /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8. template inline TextOutputStreamType& ostream_write(TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert) { if(globalCharacterSet().isUTF8()) { return ostream << convert.m_range; } for(const char* p = convert.m_range.begin; p != convert.m_range.end; ++p) { if(!char_is_ascii(*p)) { UTF8Character c(globalExtendedASCIICharacterSet().decode(*p)); ostream.write(c.buffer, c.length); } else { ostream << *p; } } return ostream; } #endif