worldspawn/libs/convert.h

/*
   Copyright (C) 2001-2006, William Joseph.
   All Rights Reserved.

   This file is part of GtkRadiant.

   GtkRadiant is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   GtkRadiant is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GtkRadiant; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#if !defined( INCLUDED_CONVERT_H )
#define INCLUDED_CONVERT_H

/// \file
/// \brief Character encoding conversion.

#include "debugging/debugging.h"
#include <algorithm>
#include <glib.h>

#include "character.h"

/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
///
/// Only the first byte at \p character is inspected, and it must be the lead byte of a
/// multi-byte sequence. Lead bytes of 2- to 6-byte sequences are recognised; the 5- and
/// 6-byte forms come from the original UTF-8 definition and are still accepted here.
///
/// \param character Pointer to the lead byte; must not be null.
/// \return The sequence length (2 to 6). For any other byte - 7-bit ASCII (0xxxxxxx),
/// a continuation byte (10xxxxxx), 0xFE or 0xFF - ERROR_MESSAGE is raised and 0 is returned.
inline std::size_t utf8_character_length( const char* character ){
	const char lead = *character;
	if ( ( lead & 0xE0 ) == 0xC0 ) { // 110xxxxx
		return 2;
	}
	if ( ( lead & 0xF0 ) == 0xE0 ) { // 1110xxxx
		return 3;
	}
	if ( ( lead & 0xF8 ) == 0xF0 ) { // 11110xxx
		return 4;
	}
	if ( ( lead & 0xFC ) == 0xF8 ) { // 111110xx
		return 5;
	}
	if ( ( lead & 0xFE ) == 0xFC ) { // 1111110x
		return 6;
	}
	// Not a multi-byte lead byte: say so instead of raising an empty error.
	ERROR_MESSAGE( "utf8_character_length: invalid UTF-8 lead byte" );
	return 0;
}

/// \brief A pointer to the first byte of a UTF-8 encoded character plus its length in bytes.
///
/// The struct only stores the pointer it is given; the length comes from
/// utf8_character_length() applied to that pointer.
struct UTF8Character
{
	const char* buffer;
	std::size_t length;

	/// \brief Constructs an empty character: null buffer, zero length.
	UTF8Character()
		: buffer( 0 ),
		length( 0 ){
	}

	/// \brief Wraps the character starting at \p bytes; its length is measured from the lead byte.
	UTF8Character( const char* bytes )
		: buffer( bytes ),
		length( utf8_character_length( bytes ) ){
	}
};

/// \brief Orders two UTF-8 characters by lexicographical comparison of their bytes.
inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
	const char* const selfEnd = self.buffer + self.length;
	const char* const otherEnd = other.buffer + other.length;
	return std::lexicographical_compare( self.buffer, selfEnd, other.buffer, otherEnd );
}

/// \brief Writes each byte of \p c to \p ostream wrapped in HexChar. Useful for debugging.
/// \return \p ostream, to allow chaining.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
	const char* const end = c.buffer + c.length;
	const char* byte = c.buffer;
	while ( byte != end )
	{
		ostream << HexChar( *byte );
		++byte;
	}
	return ostream;
}


/// \brief The character-set encoding for the current C locale.
///
/// Obtain the global instance with globalCharacterSet().
///
/// The charset name is queried once, at construction, through g_get_charset().
/// A null name is stored to mean "the charset is UTF-8".
class CharacterSet
{
const char* m_charSet;
public:
/// \brief Queries the charset; the name is dropped when g_get_charset() reports UTF-8.
CharacterSet(){
	const bool localeIsUTF8 = g_get_charset( &m_charSet ) != FALSE;
	if ( localeIsUTF8 ) {
		m_charSet = 0;
	}
}
/// \brief Returns true when no charset name is stored, i.e. the charset was reported as UTF-8.
bool isUTF8() const {
	return 0 == m_charSet;
}
/// \brief Returns the stored charset name; null when isUTF8() is true.
const char* get() const {
	return m_charSet;
}
};

/// \brief Holder type for the global CharacterSet instance.
/// LazyStatic is declared outside this header; presumably it constructs the instance on first use - confirm.
typedef LazyStatic<CharacterSet> GlobalCharacterSet;

/// \brief Returns the global instance of CharacterSet.
///
/// Forwards to GlobalCharacterSet::instance().
inline CharacterSet& globalCharacterSet(){
	return GlobalCharacterSet::instance();
}


/// \brief Pairs a UTF-8 character with the single extended-ascii byte it corresponds to.
class UTF8CharacterToExtendedASCII
{
public:
UTF8Character m_utf8;
char m_c;

/// \brief Constructs an empty entry: default UTF8Character and a '\0' byte.
UTF8CharacterToExtendedASCII()
	: m_utf8(),
	m_c( '\0' ){
}

/// \brief Constructs an entry mapping \p utf8 to the byte \p c.
UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c )
	: m_utf8( utf8 ),
	m_c( c ){
}
};

/// \brief Orders mapping entries by their UTF-8 character only; the extended-ascii byte is ignored.
inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
	const UTF8Character& lhs = self.m_utf8;
	const UTF8Character& rhs = other.m_utf8;
	return lhs < rhs;
}

/// \brief Returns the low seven bits of \p c as a table index in the range 0-127.
inline std::size_t extended_ascii_to_index( char c ){
	const int lowSevenBits = c & 0x7F;
	return static_cast<std::size_t>( lowSevenBits );
}

/// \brief Returns \p i with bit 0x80 set, converted to char; the inverse of extended_ascii_to_index() for 0-127.
inline char extended_ascii_for_index( std::size_t i ){
	const std::size_t withHighBit = i | 0x80;
	return static_cast<char>( withHighBit );
}

/// \brief The active extended-ascii character set encoding.
/// Performs UTF-8 encoding and decoding of extended-ascii characters.
///
/// Obtain the global instance with globalExtendedASCIICharacterSet().
class ExtendedASCIICharacterSet
{
typedef char UTF8CharBuffer[6];
UTF8CharBuffer m_converted[128];
UTF8Character m_decodeMap[128];
UTF8CharacterToExtendedASCII m_encodeMap[128];
public:
ExtendedASCIICharacterSet(){
	if ( !globalCharacterSet().isUTF8() ) {
		GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
		for ( std::size_t i = 1; i < 128; ++i )
		{
			char c = extended_ascii_for_index( i );
			char* inbuf = &c;
			gsize inbytesleft = 1;
			char* outbuf = m_converted[i];
			gsize outbytesleft = 6;
			if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
				UTF8Character utf8( m_converted[i] );
				m_decodeMap[i] = utf8;
				m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
			}
		}
		g_iconv_close( descriptor );
		std::sort( m_encodeMap, m_encodeMap + 128 );
	}
}
/// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
/// Useful for debugging.
void print() const {
	globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
	for ( std::size_t i = 1; i < 128; ++i )
	{
		if ( m_decodeMap[i].buffer != 0 ) {
			globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
		}
	}
}
/// \brief Returns \p c decoded from extended-ascii to UTF-8.
/// \p c must be an extended-ascii character.
const UTF8Character& decode( char c ) const {
	ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
	ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
	ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
	return m_decodeMap[extended_ascii_to_index( c )];
}
/// \brief Returns \p c encoded to extended-ascii from UTF-8.
/// \p c must map to an extended-ascii character.
char encode( const UTF8Character& c ) const {
	ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
	ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
	std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
		= std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
	ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
	return ( *range.first ).m_c;
}
};

/// \brief Holder type for the global ExtendedASCIICharacterSet instance.
/// LazyStatic is declared outside this header; presumably it constructs the instance on first use - confirm.
typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;

/// \brief Returns the global instance of ExtendedASCIICharacterSet.
///
/// Forwards to GlobalExtendedASCIICharacterSet::instance().
inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
	return GlobalExtendedASCIICharacterSet::instance();
}

/// \brief Tags a UTF-8 string range so that streaming it selects the UTF-8 to
/// extended-ascii ostream_write() overload.
class ConvertUTF8ToLocale
{
public:
StringRange m_range;

/// \brief Wraps the null-terminated \p string; its end is located with strlen().
ConvertUTF8ToLocale( const char* string )
	: m_range( string, string + strlen( string ) ){
}

/// \brief Wraps an existing \p range.
ConvertUTF8ToLocale( const StringRange& range )
	: m_range( range ){
}
};

/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
///
/// When the locale charset is UTF-8 the range is written unchanged. Otherwise ASCII
/// bytes are copied through and every multi-byte UTF-8 character is replaced by the
/// byte returned from ExtendedASCIICharacterSet::encode().
///
/// Malformed input cannot stall or overrun the loop: a byte that is not a valid lead
/// byte (length reported as 0) is skipped, and a sequence cut off by the end of the
/// range ends the conversion.
/// \return \p ostream, to allow chaining.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
	if ( globalCharacterSet().isUTF8() ) {
		return ostream << convert.m_range;
	}

	const char* const end = convert.m_range.last;
	for ( const char* p = convert.m_range.first; p != end; )
	{
		if ( char_is_ascii( *p ) ) {
			ostream << *p++;
			continue;
		}

		UTF8Character c( p );
		if ( c.length == 0 ) {
			// Not a lead byte: advancing by c.length would never move p, so step over it.
			++p;
			continue;
		}
		if ( static_cast<std::size_t>( end - p ) < c.length ) {
			// Truncated sequence: encoding it would read past the end of the range,
			// and p would jump beyond 'end' so the loop condition could never hold.
			break;
		}

		ostream << globalExtendedASCIICharacterSet().encode( c );
		p += c.length;
	}
	return ostream;
}


/// \brief Tags an extended-ascii string range so that streaming it selects the
/// extended-ascii to UTF-8 ostream_write() overload.
class ConvertLocaleToUTF8
{
public:
StringRange m_range;

/// \brief Wraps the null-terminated \p string; its end is located with strlen().
ConvertLocaleToUTF8( const char* string )
	: m_range( string, string + strlen( string ) ){
}

/// \brief Wraps an existing \p range.
ConvertLocaleToUTF8( const StringRange& range )
	: m_range( range ){
}
};

/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
template<typename TextOutputStreamType>
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
	if ( globalCharacterSet().isUTF8() ) {
		return ostream << convert.m_range;
	}

	for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
	{
		if ( !char_is_ascii( *p ) ) {
			UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
			ostream.write( c.buffer, c.length );
		}
		else
		{
			ostream << *p;
		}
	}
	return ostream;
}


#endif
Initial commit 2020-11-17 11:16:16 +00:00			`/*`
			`Copyright (C) 2001-2006, William Joseph.`
			`All Rights Reserved.`

			`This file is part of GtkRadiant.`

			`GtkRadiant is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`GtkRadiant is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with GtkRadiant; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#if !defined( INCLUDED_CONVERT_H )`
			`#define INCLUDED_CONVERT_H`

			`/// \file`
			`/// \brief Character encoding conversion.`

			`#include "debugging/debugging.h"`
			`#include <algorithm>`
			`#include <glib.h>`

			`#include "character.h"`

			`/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.`
			`inline std::size_t utf8_character_length( const char* character ){`
			`if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx`
			`return 2;`
			`}`
			`else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx`
			`return 3;`
			`}`
			`else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx`
			`return 4;`
			`}`
			`else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx`
			`return 5;`
			`}`
			`else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x`
			`return 6;`
			`}`
			`ERROR_MESSAGE( "" );`
			`return 0;`
			`}`

			`struct UTF8Character`
			`{`
			`const char* buffer;`
			`std::size_t length;`
			`UTF8Character() : buffer( 0 ), length( 0 ){`
			`}`
			`UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){`
			`}`
			`};`

			`inline bool operator<( const UTF8Character& self, const UTF8Character& other ){`
			`return std::lexicographical_compare( self.buffer, self.buffer + self.length, other.buffer, other.buffer + other.length );`
			`}`

			`/// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){`
			`for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )`
			`{`
			`ostream << HexChar( *p );`
			`}`
			`return ostream;`
			`}`



			`/// \brief The character-set encoding for the current C locale.`
			`///`
			`/// Obtain the global instance with globalCharacterSet().`
			`class CharacterSet`
			`{`
			`const char* m_charSet;`
			`public:`
			`CharacterSet(){`
			`if ( g_get_charset( &m_charSet ) != FALSE ) {`
			`m_charSet = 0;`
			`}`
			`}`
			`bool isUTF8() const {`
			`return m_charSet == 0;`
			`}`
			`const char* get() const {`
			`return m_charSet;`
			`}`
			`};`

			`typedef LazyStatic<CharacterSet> GlobalCharacterSet;`

			`/// \brief Returns the global instance of CharacterSet.`
			`inline CharacterSet& globalCharacterSet(){`
			`return GlobalCharacterSet::instance();`
			`}`


			`class UTF8CharacterToExtendedASCII`
			`{`
			`public:`
			`UTF8Character m_utf8;`
			`char m_c;`
			`UTF8CharacterToExtendedASCII() : m_c( '\0' ){`
			`}`
			`UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){`
			`}`
			`};`

			`inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){`
			`return self.m_utf8 < other.m_utf8;`
			`}`

			`inline std::size_t extended_ascii_to_index( char c ){`
			`return static_cast<std::size_t>( c & 0x7F );`
			`}`

			`inline char extended_ascii_for_index( std::size_t i ){`
			`return static_cast<char>( i \| 0x80 );`
			`}`

			`/// \brief The active extended-ascii character set encoding.`
			`/// Performs UTF-8 encoding and decoding of extended-ascii characters.`
			`///`
			`/// Obtain the global instance with globalExtendedASCIICharacterSet().`
			`class ExtendedASCIICharacterSet`
			`{`
			`typedef char UTF8CharBuffer[6];`
			`UTF8CharBuffer m_converted[128];`
			`UTF8Character m_decodeMap[128];`
			`UTF8CharacterToExtendedASCII m_encodeMap[128];`
			`public:`
			`ExtendedASCIICharacterSet(){`
			`if ( !globalCharacterSet().isUTF8() ) {`
			`GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );`
			`for ( std::size_t i = 1; i < 128; ++i )`
			`{`
			`char c = extended_ascii_for_index( i );`
			`char* inbuf = &c;`
			`gsize inbytesleft = 1;`
			`char* outbuf = m_converted[i];`
			`gsize outbytesleft = 6;`
			`if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {`
			`UTF8Character utf8( m_converted[i] );`
			`m_decodeMap[i] = utf8;`
			`m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );`
			`}`
			`}`
			`g_iconv_close( descriptor );`
			`std::sort( m_encodeMap, m_encodeMap + 128 );`
			`}`
			`}`
			`/// \brief Prints the (up to) 128 characters in the current extended-ascii character set.`
			`/// Useful for debugging.`
			`void print() const {`
			`globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";`
			`for ( std::size_t i = 1; i < 128; ++i )`
			`{`
			`if ( m_decodeMap[i].buffer != 0 ) {`
			`globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";`
			`}`
			`}`
			`}`
			`/// \brief Returns \p c decoded from extended-ascii to UTF-8.`
			`/// \p c must be an extended-ascii character.`
			`const UTF8Character& decode( char c ) const {`
			`ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );`
			`ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );`
			`ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );`
			`return m_decodeMap[extended_ascii_to_index( c )];`
			`}`
			`/// \brief Returns \p c encoded to extended-ascii from UTF-8.`
			`/// \p c must map to an extended-ascii character.`
			`char encode( const UTF8Character& c ) const {`
			`ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );`
			`ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );`
			`std::pair<const UTF8CharacterToExtendedASCII, const UTF8CharacterToExtendedASCII> range`
			`= std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );`
			`ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );`
			`return ( *range.first ).m_c;`
			`}`
			`};`

			`typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;`

			`/// \brief Returns the global instance of ExtendedASCIICharacterSet.`
			`inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){`
			`return GlobalExtendedASCIICharacterSet::instance();`
			`}`

			`class ConvertUTF8ToLocale`
			`{`
			`public:`
			`StringRange m_range;`
			`ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){`
			`}`
			`ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){`
			`}`
			`};`

			`/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){`
			`if ( globalCharacterSet().isUTF8() ) {`
			`return ostream << convert.m_range;`
			`}`

			`for ( const char* p = convert.m_range.first; p != convert.m_range.last; )`
			`{`
			`if ( !char_is_ascii( *p ) ) {`
			`UTF8Character c( p );`
			`ostream << globalExtendedASCIICharacterSet().encode( c );`
			`p += c.length;`
			`}`
			`else`
			`{`
			`ostream << *p++;`
			`}`
			`}`
			`return ostream;`
			`}`


			`class ConvertLocaleToUTF8`
			`{`
			`public:`
			`StringRange m_range;`
			`ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){`
			`}`
			`ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){`
			`}`
			`};`

			`/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.`
			`template<typename TextOutputStreamType>`
			`inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){`
			`if ( globalCharacterSet().isUTF8() ) {`
			`return ostream << convert.m_range;`
			`}`

			`for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )`
			`{`
			`if ( !char_is_ascii( *p ) ) {`
			`UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );`
			`ostream.write( c.buffer, c.length );`
			`}`
			`else`
			`{`
			`ostream << *p;`
			`}`
			`}`
			`return ostream;`
			`}`


			`#endif`