267 lines
7.9 KiB
C
267 lines
7.9 KiB
C
|
/*
|
||
|
Copyright (C) 2001-2006, William Joseph.
|
||
|
All Rights Reserved.
|
||
|
|
||
|
This file is part of GtkRadiant.
|
||
|
|
||
|
GtkRadiant is free software; you can redistribute it and/or modify
|
||
|
it under the terms of the GNU General Public License as published by
|
||
|
the Free Software Foundation; either version 2 of the License, or
|
||
|
(at your option) any later version.
|
||
|
|
||
|
GtkRadiant is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
GNU General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU General Public License
|
||
|
along with GtkRadiant; if not, write to the Free Software
|
||
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||
|
*/
|
||
|
|
||
|
#if !defined( INCLUDED_CONVERT_H )
|
||
|
#define INCLUDED_CONVERT_H
|
||
|
|
||
|
/// \file
|
||
|
/// \brief Character encoding conversion.
|
||
|
|
||
|
#include "debugging/debugging.h"
|
||
|
#include <algorithm>
|
||
|
#include <glib.h>
|
||
|
|
||
|
#include "character.h"
|
||
|
|
||
|
/// \brief Returns the number of bytes required to represent \p character in UTF-8 encoding.
|
||
|
inline std::size_t utf8_character_length( const char* character ){
|
||
|
if ( ( *character & 0xE0 ) == 0xC0 ) { // 110xxxxx
|
||
|
return 2;
|
||
|
}
|
||
|
else if ( ( *character & 0xF0 ) == 0xE0 ) { // 1110xxxx
|
||
|
return 3;
|
||
|
}
|
||
|
else if ( ( *character & 0xF8 ) == 0xF0 ) { // 11110xxx
|
||
|
return 4;
|
||
|
}
|
||
|
else if ( ( *character & 0xFC ) == 0xF8 ) { // 111110xx
|
||
|
return 5;
|
||
|
}
|
||
|
else if ( ( *character & 0xFE ) == 0xFC ) { // 1111110x
|
||
|
return 6;
|
||
|
}
|
||
|
ERROR_MESSAGE( "" );
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
struct UTF8Character
|
||
|
{
|
||
|
const char* buffer;
|
||
|
std::size_t length;
|
||
|
UTF8Character() : buffer( 0 ), length( 0 ){
|
||
|
}
|
||
|
UTF8Character( const char* bytes ) : buffer( bytes ), length( utf8_character_length( bytes ) ){
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// \brief Orders two UTF-8 characters by lexicographical comparison of their bytes.
inline bool operator<( const UTF8Character& self, const UTF8Character& other ){
	const char* const selfEnd = self.buffer + self.length;
	const char* const otherEnd = other.buffer + other.length;
	return std::lexicographical_compare( self.buffer, selfEnd, other.buffer, otherEnd );
}
|
||
|
|
||
|
/// \brief Writes \p c to \p ostream in Hex form. Useful for debugging.
|
||
|
template<typename TextOutputStreamType>
|
||
|
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const UTF8Character& c ){
|
||
|
for ( const char* p = c.buffer; p != c.buffer + c.length; ++p )
|
||
|
{
|
||
|
ostream << HexChar( *p );
|
||
|
}
|
||
|
return ostream;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/// \brief The character-set encoding for the current C locale.
|
||
|
///
|
||
|
/// Obtain the global instance with globalCharacterSet().
|
||
|
class CharacterSet
|
||
|
{
|
||
|
const char* m_charSet;
|
||
|
public:
|
||
|
CharacterSet(){
|
||
|
if ( g_get_charset( &m_charSet ) != FALSE ) {
|
||
|
m_charSet = 0;
|
||
|
}
|
||
|
}
|
||
|
bool isUTF8() const {
|
||
|
return m_charSet == 0;
|
||
|
}
|
||
|
const char* get() const {
|
||
|
return m_charSet;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
typedef LazyStatic<CharacterSet> GlobalCharacterSet;
|
||
|
|
||
|
/// \brief Returns the global instance of CharacterSet.
|
||
|
inline CharacterSet& globalCharacterSet(){
|
||
|
return GlobalCharacterSet::instance();
|
||
|
}
|
||
|
|
||
|
|
||
|
class UTF8CharacterToExtendedASCII
|
||
|
{
|
||
|
public:
|
||
|
UTF8Character m_utf8;
|
||
|
char m_c;
|
||
|
UTF8CharacterToExtendedASCII() : m_c( '\0' ){
|
||
|
}
|
||
|
UTF8CharacterToExtendedASCII( const UTF8Character& utf8, char c ) : m_utf8( utf8 ), m_c( c ){
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// \brief Orders two entries by their UTF-8 characters only; the extended-ascii byte is ignored.
inline bool operator<( const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other ){
	const UTF8Character& lhs = self.m_utf8;
	const UTF8Character& rhs = other.m_utf8;
	return lhs < rhs;
}
|
||
|
|
||
|
/// \brief Returns the table index for \p c: its low seven bits, in the range 0..127.
inline std::size_t extended_ascii_to_index( char c ){
	const int lowSevenBits = c & 0x7F;
	return static_cast<std::size_t>( lowSevenBits );
}
|
||
|
|
||
|
/// \brief Returns the character for table index \p i: \p i with bit 0x80 set, converted to char.
inline char extended_ascii_for_index( std::size_t i ){
	const std::size_t withHighBit = i | 0x80;
	return static_cast<char>( withHighBit );
}
|
||
|
|
||
|
/// \brief The active extended-ascii character set encoding.
|
||
|
/// Performs UTF-8 encoding and decoding of extended-ascii characters.
|
||
|
///
|
||
|
/// Obtain the global instance with globalExtendedASCIICharacterSet().
|
||
|
class ExtendedASCIICharacterSet
|
||
|
{
|
||
|
typedef char UTF8CharBuffer[6];
|
||
|
UTF8CharBuffer m_converted[128];
|
||
|
UTF8Character m_decodeMap[128];
|
||
|
UTF8CharacterToExtendedASCII m_encodeMap[128];
|
||
|
public:
|
||
|
ExtendedASCIICharacterSet(){
|
||
|
if ( !globalCharacterSet().isUTF8() ) {
|
||
|
GIConv descriptor = g_iconv_open( "UTF-8", globalCharacterSet().get() );
|
||
|
for ( std::size_t i = 1; i < 128; ++i )
|
||
|
{
|
||
|
char c = extended_ascii_for_index( i );
|
||
|
char* inbuf = &c;
|
||
|
gsize inbytesleft = 1;
|
||
|
char* outbuf = m_converted[i];
|
||
|
gsize outbytesleft = 6;
|
||
|
if ( g_iconv( descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft ) != (size_t)( -1 ) ) {
|
||
|
UTF8Character utf8( m_converted[i] );
|
||
|
m_decodeMap[i] = utf8;
|
||
|
m_encodeMap[i] = UTF8CharacterToExtendedASCII( utf8, c );
|
||
|
}
|
||
|
}
|
||
|
g_iconv_close( descriptor );
|
||
|
std::sort( m_encodeMap, m_encodeMap + 128 );
|
||
|
}
|
||
|
}
|
||
|
/// \brief Prints the (up to) 128 characters in the current extended-ascii character set.
|
||
|
/// Useful for debugging.
|
||
|
void print() const {
|
||
|
globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
|
||
|
for ( std::size_t i = 1; i < 128; ++i )
|
||
|
{
|
||
|
if ( m_decodeMap[i].buffer != 0 ) {
|
||
|
globalOutputStream() << extended_ascii_for_index( i ) << " = " << m_decodeMap[i] << "\n";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
/// \brief Returns \p c decoded from extended-ascii to UTF-8.
|
||
|
/// \p c must be an extended-ascii character.
|
||
|
const UTF8Character& decode( char c ) const {
|
||
|
ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
|
||
|
ASSERT_MESSAGE( !char_is_ascii( c ), "decode: ascii character" );
|
||
|
ASSERT_MESSAGE( m_decodeMap[extended_ascii_to_index( c )].buffer != 0, "decode: invalid character: " << HexChar( c ) );
|
||
|
return m_decodeMap[extended_ascii_to_index( c )];
|
||
|
}
|
||
|
/// \brief Returns \p c encoded to extended-ascii from UTF-8.
|
||
|
/// \p c must map to an extended-ascii character.
|
||
|
char encode( const UTF8Character& c ) const {
|
||
|
ASSERT_MESSAGE( !globalCharacterSet().isUTF8(), "locale is utf8, no conversion required" );
|
||
|
ASSERT_MESSAGE( !char_is_ascii( *c.buffer ), "encode: ascii character" );
|
||
|
std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range
|
||
|
= std::equal_range( m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII( c, 0 ) );
|
||
|
ASSERT_MESSAGE( range.first != range.second, "encode: invalid character: " << c );
|
||
|
return ( *range.first ).m_c;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
|
||
|
|
||
|
/// \brief Returns the global instance of ExtendedASCIICharacterSet.
|
||
|
inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet(){
|
||
|
return GlobalExtendedASCIICharacterSet::instance();
|
||
|
}
|
||
|
|
||
|
class ConvertUTF8ToLocale
|
||
|
{
|
||
|
public:
|
||
|
StringRange m_range;
|
||
|
ConvertUTF8ToLocale( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
|
||
|
}
|
||
|
ConvertUTF8ToLocale( const StringRange& range ) : m_range( range ){
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// \brief Writes \p convert to \p ostream after encoding each character to extended-ascii from UTF-8.
|
||
|
template<typename TextOutputStreamType>
|
||
|
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert ){
|
||
|
if ( globalCharacterSet().isUTF8() ) {
|
||
|
return ostream << convert.m_range;
|
||
|
}
|
||
|
|
||
|
for ( const char* p = convert.m_range.first; p != convert.m_range.last; )
|
||
|
{
|
||
|
if ( !char_is_ascii( *p ) ) {
|
||
|
UTF8Character c( p );
|
||
|
ostream << globalExtendedASCIICharacterSet().encode( c );
|
||
|
p += c.length;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ostream << *p++;
|
||
|
}
|
||
|
}
|
||
|
return ostream;
|
||
|
}
|
||
|
|
||
|
|
||
|
class ConvertLocaleToUTF8
|
||
|
{
|
||
|
public:
|
||
|
StringRange m_range;
|
||
|
ConvertLocaleToUTF8( const char* string ) : m_range( StringRange( string, string + strlen( string ) ) ){
|
||
|
}
|
||
|
ConvertLocaleToUTF8( const StringRange& range ) : m_range( range ){
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/// \brief Writes \p convert to \p ostream after decoding each character from extended-ascii to UTF-8.
|
||
|
template<typename TextOutputStreamType>
|
||
|
inline TextOutputStreamType& ostream_write( TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert ){
|
||
|
if ( globalCharacterSet().isUTF8() ) {
|
||
|
return ostream << convert.m_range;
|
||
|
}
|
||
|
|
||
|
for ( const char* p = convert.m_range.first; p != convert.m_range.last; ++p )
|
||
|
{
|
||
|
if ( !char_is_ascii( *p ) ) {
|
||
|
UTF8Character c( globalExtendedASCIICharacterSet().decode( *p ) );
|
||
|
ostream.write( c.buffer, c.length );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ostream << *p;
|
||
|
}
|
||
|
}
|
||
|
return ostream;
|
||
|
}
|
||
|
|
||
|
|
||
|
#endif
|