#if !defined( INCLUDED_CONVERT_H ) #define INCLUDED_CONVERT_H /// \file /// \brief Character encoding conversion. #include "debugging/debugging.h" #include #include #include #include "character.h" /// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding. inline std::size_t utf8_character_length( const char* character ){ if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx return 2; } else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx return 3; } else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx return 4; } else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx return 5; } else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x return 6; } ERROR_MESSAGE( "" ); return 0; } struct UTF8Character { const char* buffer; std::size_t length; UTF8Character() : buffer( 0 ), length( 0 ){ } UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){ } }; inline bool operator<( const UTF8Character& self, const UTF8Character& other ){ return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length ); } /// \brief Writes \p c to \p ostream in Hex form. Useful for debugging. template inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){ for ( const char* p = c.buffer; p != c.buffer + c.length; ++p ) { ostream << HexChar( *p ); } return ostream; } /// \brief The character-set encoding for the current C locale. /// /// Obtain the global instance with globalCharacterSet(). class CharacterSet { const char* m_charSet; public: CharacterSet(){ if ( g_get_charset( &m_charSet ) != FALSE ) { m_charSet = 0; } } bool isUTF8() const { return m_charSet == 0; } const char* get() const { return m_charSet; } }; typedef LazyStatic GlobalCharacterSet; /// \brief Returns the global instance of CharacterSet. inline CharacterSet& globalCharacterSet(){ return GlobalCharacterSet::instance(); } class UTF8CharacterToExtendedASCII { public: UTF8Character m_utf8; char m_c; UTF8CharacterToExtendedASCII() : m_c( '\0' ){ } UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){ } }; inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){ return self.m_utf8 < other.m_utf8; } inline std::size_t extended_ascii_to_index( char c ){ return static_cast( c & 0x7F ); } inline char extended_ascii_for_index( std::size_t i ){ return static_cast( i | 0x80 ); } /// \brief The active extended-ascii character set encoding. /// Performs UTF-8 encoding and decoding of extended-ascii characters. /// /// Obtain the global instance with globalExtendedASCIICharacterSet(). class ExtendedASCIICharacterSet { typedef char UTF8CharBuffer[6]; UTF8CharBuffer m_converted[128]; UTF8Character m_decodeMap[128]; UTF8CharacterToExtendedASCII m_encodeMap[128]; public: ExtendedASCIICharacterSet(){ if ( !globalCharacterSet().isUTF8() ) { GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() ); for ( std::size_t i = 1; i < 128; ++i ) { char c = extended_ascii_for_index( i ); char* inbuf = &c; std::size_t inbytesleft = 1; char* outbuf = m_converted[i]; std::size_t outbytesleft = 6; if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) { UTF8Character utf8( m_converted[i] ); m_decodeMap[i] = utf8; m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c ); } } g_iconv_close( descriptor ); std::sort( m_encodeMap, m_encodeMap + 128 ); } } /// \brief Prints the (up to) 128 characters in the current extended-ascii character set. /// Useful for debugging. void print() const { globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n"; for ( std::size_t i = 1; i < 128; ++i ) { if ( m_decodeMap[i].buffer != 0 ) { globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n"; } } } /// \brief Returns \p c decoded from extended-ascii to UTF-8. /// \p c must be an extended-ascii character. const UTF8Character& decode( char c ) const { ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" ); ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" ); ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) ); return m_decodeMap[extended_ascii_to_index( c )]; } /// \brief Returns \p c encoded to extended-ascii from UTF-8. /// \p c must map to an extended-ascii character. char encode( const UTF8Character& c ) const { ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" ); ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" ); std::pair range = std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) ); ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c ); return ( *range.first ).m_c; } }; typedef LazyStatic GlobalExtendedASCIICharacterSet; /// \brief Returns the global instance of ExtendedASCIICharacterSet. inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){ return GlobalExtendedASCIICharacterSet::instance(); } class ConvertUTF8ToLocale { public: StringRange m_range; ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){ } ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){ } }; /// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8. template inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){ if ( globalCharacterSet().isUTF8() ) { return ostream << convert.m_range; } for ( const char* p = convert.m_range.begin; p != convert.m_range.end; ) { if ( !char_is_ascii( *p ) ) { UTF8Character c( p ); ostream << globalExtendedASCIICharacterSet().encode( c ); p += c.length; } else { ostream << *p++; } } return ostream; } class ConvertLocaleToUTF8 { public: StringRange m_range; ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){ } ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){ } }; /// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8. template inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){ if ( globalCharacterSet().isUTF8() ) { return ostream << convert.m_range; } for ( const char* p = convert.m_range.begin; p != convert.m_range.end; ++p ) { if ( !char_is_ascii( *p ) ) { UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) ); ostream.write( c.buffer, c.length ); } else { ostream << *p; } } return ostream; } #endif