Improve transliteration in lossy characterset conversion.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@22399 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
rfm 2006-02-01 06:18:41 +00:00
parent ae06020b9f
commit e112662ba7
2 changed files with 163 additions and 75 deletions

View file

@ -1,3 +1,10 @@
2006-02-01 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/Unicode.m: Try to use iconv transliteration for
lossy conversion. When handling unconvertible characters in lossy
conversion, substitute a question mark rather than an asterisk for
consistency with iconv and other implementations.
2006-01-27 Richard Frith-Macdonald <rfm@gnu.org> 2006-01-27 Richard Frith-Macdonald <rfm@gnu.org>
* Source/NSDistributedLock.m: ([description]) implemented * Source/NSDistributedLock.m: ([description]) implemented

View file

@ -158,6 +158,7 @@ struct _strenc_ {
* A negative means unsupported. * A negative means unsupported.
* A zero means not yet checked. * A zero means not yet checked.
*/ */
const char *lossy; /* Iconv name for lossy encoding */
}; };
/* /*
@ -165,46 +166,82 @@ struct _strenc_ {
* encoding information we might need. It gets modified at runtime. * encoding information we might need. It gets modified at runtime.
*/ */
static struct _strenc_ str_encoding_table[] = { static struct _strenc_ str_encoding_table[] = {
{NSASCIIStringEncoding,"NSASCIIStringEncoding","ASCII",1,1}, {NSASCIIStringEncoding,
{NSNEXTSTEPStringEncoding,"NSNEXTSTEPStringEncoding","NEXTSTEP",1,1}, "NSASCIIStringEncoding","ASCII",1,1,0},
{NSJapaneseEUCStringEncoding, "NSJapaneseEUCStringEncoding","EUC-JP",0,0}, {NSNEXTSTEPStringEncoding,
{NSUTF8StringEncoding,"NSUTF8StringEncoding","UTF-8",0,1}, "NSNEXTSTEPStringEncoding","NEXTSTEP",1,1,0},
{NSISOLatin1StringEncoding,"NSISOLatin1StringEncoding","ISO-8859-1",1,1}, {NSJapaneseEUCStringEncoding,
{NSSymbolStringEncoding,"NSSymbolStringEncoding","",0,0}, "NSJapaneseEUCStringEncoding","EUC-JP",0,0,0},
{NSNonLossyASCIIStringEncoding,"NSNonLossyASCIIStringEncoding","",1,1}, {NSUTF8StringEncoding,
{NSShiftJISStringEncoding,"NSShiftJISStringEncoding","SHIFT-JIS",0,0}, "NSUTF8StringEncoding","UTF-8",0,1,0},
{NSISOLatin2StringEncoding,"NSISOLatin2StringEncoding","ISO-8859-2",1,1}, {NSISOLatin1StringEncoding,
{NSUnicodeStringEncoding, "NSUnicodeStringEncoding","",0,1}, "NSISOLatin1StringEncoding","ISO-8859-1",1,1,0},
{NSWindowsCP1251StringEncoding,"NSWindowsCP1251StringEncoding","CP1251",0,0}, {NSSymbolStringEncoding,
{NSWindowsCP1252StringEncoding,"NSWindowsCP1252StringEncoding","CP1252",0,0}, "NSSymbolStringEncoding","",0,0,0},
{NSWindowsCP1253StringEncoding,"NSWindowsCP1253StringEncoding","CP1253",0,0}, {NSNonLossyASCIIStringEncoding,
{NSWindowsCP1254StringEncoding,"NSWindowsCP1254StringEncoding","CP1254",0,0}, "NSNonLossyASCIIStringEncoding","",1,1,0},
{NSWindowsCP1250StringEncoding,"NSWindowsCP1250StringEncoding","CP1250",0,0}, {NSShiftJISStringEncoding,
{NSISO2022JPStringEncoding,"NSISO2022JPStringEncoding","ISO-2022-JP",0,0}, "NSShiftJISStringEncoding","SHIFT-JIS",0,0,0},
{NSMacOSRomanStringEncoding, "NSMacOSRomanStringEncoding","MACINTOSH",0,0}, {NSISOLatin2StringEncoding,
{NSProprietaryStringEncoding, "NSProprietaryStringEncoding","",0,0}, "NSISOLatin2StringEncoding","ISO-8859-2",1,1,0},
{NSUnicodeStringEncoding,
"NSUnicodeStringEncoding","",0,1,0},
{NSWindowsCP1251StringEncoding,
"NSWindowsCP1251StringEncoding","CP1251",0,0,0},
{NSWindowsCP1252StringEncoding,
"NSWindowsCP1252StringEncoding","CP1252",0,0,0},
{NSWindowsCP1253StringEncoding,
"NSWindowsCP1253StringEncoding","CP1253",0,0,0},
{NSWindowsCP1254StringEncoding,
"NSWindowsCP1254StringEncoding","CP1254",0,0,0},
{NSWindowsCP1250StringEncoding,
"NSWindowsCP1250StringEncoding","CP1250",0,0,0},
{NSISO2022JPStringEncoding,
"NSISO2022JPStringEncoding","ISO-2022-JP",0,0,0},
{NSMacOSRomanStringEncoding,
"NSMacOSRomanStringEncoding","MACINTOSH",0,0,0},
{NSProprietaryStringEncoding,
"NSProprietaryStringEncoding","",0,0,0},
// GNUstep additions // GNUstep additions
{NSISOCyrillicStringEncoding,"NSISOCyrillicStringEncoding","ISO-8859-5",0,1}, {NSISOCyrillicStringEncoding,
{NSKOI8RStringEncoding, "NSKOI8RStringEncoding","KOI8-R",0,0}, "NSISOCyrillicStringEncoding","ISO-8859-5",0,1,0},
{NSISOLatin3StringEncoding, "NSISOLatin3StringEncoding","ISO-8859-3",0,0}, {NSKOI8RStringEncoding,
{NSISOLatin4StringEncoding, "NSISOLatin4StringEncoding","ISO-8859-4",0,0}, "NSKOI8RStringEncoding","KOI8-R",0,0,0},
{NSISOArabicStringEncoding, "NSISOArabicStringEncoding","ISO-8859-6",0,0}, {NSISOLatin3StringEncoding,
{NSISOGreekStringEncoding, "NSISOGreekStringEncoding","ISO-8859-7",0,0}, "NSISOLatin3StringEncoding","ISO-8859-3",0,0,0},
{NSISOHebrewStringEncoding, "NSISOHebrewStringEncoding","ISO-8859-8",0,0}, {NSISOLatin4StringEncoding,
{NSISOLatin5StringEncoding, "NSISOLatin5StringEncoding","ISO-8859-9",0,0}, "NSISOLatin4StringEncoding","ISO-8859-4",0,0,0},
{NSISOLatin6StringEncoding, "NSISOLatin6StringEncoding","ISO-8859-10",0,0}, {NSISOArabicStringEncoding,
{NSISOThaiStringEncoding, "NSISOThaiStringEncoding","ISO-8859-11",1,1}, "NSISOArabicStringEncoding","ISO-8859-6",0,0,0},
{NSISOLatin7StringEncoding, "NSISOLatin7StringEncoding","ISO-8859-13",0,0}, {NSISOGreekStringEncoding,
{NSISOLatin8StringEncoding, "NSISOLatin8StringEncoding","ISO-8859-14",0,0}, "NSISOGreekStringEncoding","ISO-8859-7",0,0,0},
{NSISOLatin9StringEncoding, "NSISOLatin9StringEncoding","ISO-8859-15",1,1}, {NSISOHebrewStringEncoding,
{NSUTF7StringEncoding, "NSUTF7StringEncoding","UTF-7",0,0}, "NSISOHebrewStringEncoding","ISO-8859-8",0,0,0},
{NSGB2312StringEncoding, "NSGB2312StringEncoding","EUC-CN",0,0}, {NSISOLatin5StringEncoding,
{NSGSM0338StringEncoding, "NSGSM0338StringEncoding","",0,1}, "NSISOLatin5StringEncoding","ISO-8859-9",0,0,0},
{NSBIG5StringEncoding, "NSBIG5StringEncoding","BIG5",0,0}, {NSISOLatin6StringEncoding,
{NSKoreanEUCStringEncoding, "NSKoreanEUCStringEncoding", "EUC-KR",0,0}, "NSISOLatin6StringEncoding","ISO-8859-10",0,0,0},
{NSISOThaiStringEncoding,
"NSISOThaiStringEncoding","ISO-8859-11",1,1,0},
{NSISOLatin7StringEncoding,
"NSISOLatin7StringEncoding","ISO-8859-13",0,0,0},
{NSISOLatin8StringEncoding,
"NSISOLatin8StringEncoding","ISO-8859-14",0,0,0},
{NSISOLatin9StringEncoding,
"NSISOLatin9StringEncoding","ISO-8859-15",1,1,0},
{NSUTF7StringEncoding,
"NSUTF7StringEncoding","UTF-7",0,0,0},
{NSGB2312StringEncoding,
"NSGB2312StringEncoding","EUC-CN",0,0,0},
{NSGSM0338StringEncoding,
"NSGSM0338StringEncoding","",0,1,0},
{NSBIG5StringEncoding,
"NSBIG5StringEncoding","BIG5",0,0,0},
{NSKoreanEUCStringEncoding,
"NSKoreanEUCStringEncoding","EUC-KR",0,0,0},
{0,"Unknown encoding","",0,0} {0,"Unknown encoding","",0,0,0}
}; };
static struct _strenc_ **encodingTable = 0; static struct _strenc_ **encodingTable = 0;
@ -260,6 +297,30 @@ static void GSSetupEncodingTable(void)
if (tmp < MAX_ENCODING) if (tmp < MAX_ENCODING)
{ {
encTable[tmp] = &str_encoding_table[i]; encTable[tmp] = &str_encoding_table[i];
#ifdef HAVE_ICONV
              /*
               * Work out, once at table setup time, whether iconv can
               * transliterate when converting *to* this encoding.  If it
               * can, remember the "<name>//TRANSLIT" form in the 'lossy'
               * field so that lossy conversion from unicode can use it.
               *
               * Entries with an empty iconv name are skipped: a bare
               * "//TRANSLIT" is not a real encoding name, and some iconv
               * implementations wrongly accept empty names.
               */
              if (encTable[tmp]->iconv != 0 && *(encTable[tmp]->iconv) != '\0')
                {
                  static const char     suffix[] = "//TRANSLIT";
                  size_t                l = strlen(encTable[tmp]->iconv);
                  char                  *lossy;

                  /* sizeof(suffix) includes the terminating nul. */
                  lossy = malloc(l + sizeof(suffix));
                  if (lossy != 0)
                    {
                      iconv_t   c;

                      memcpy(lossy, encTable[tmp]->iconv, l);
                      memcpy(lossy + l, suffix, sizeof(suffix));
                      /*
                       * Probe with the transliterating name itself, as
                       * the conversion target and with unicode as the
                       * source, since that is the direction in which the
                       * lossy name is used.  Probing the plain name would
                       * succeed even where //TRANSLIT is unsupported.
                       */
                      c = iconv_open(lossy, UNICODE_ENC);
                      if (c == (iconv_t)-1)
                        {
                          /* No transliteration; 'lossy' stays unset. */
                          free(lossy);
                        }
                      else
                        {
                          /* Kept for the lifetime of the process. */
                          encTable[tmp]->lossy = lossy;
                          iconv_close(c);
                        }
                    }
                }
#endif
} }
} }
encodingTable = encTable; encodingTable = encTable;
@ -492,7 +553,7 @@ GSEncodingFromLocale(const char *clocale)
if (strchr (clocale, '.') != NULL) if (strchr (clocale, '.') != NULL)
{ {
/* Locale contains the 'codeset' section. Parse it and see /* Locale contains the 'codeset' section. Parse it and see
if we know what encoding this corresponds to */ if we know what encoding this corresponds to */
NSString *registry; NSString *registry;
NSArray *array; NSArray *array;
char *s; char *s;
@ -764,18 +825,6 @@ GetEncodingName(NSStringEncoding encoding)
} }
#endif #endif
#ifdef HAVE_ICONV
static const char *
iconv_stringforencoding(NSStringEncoding encoding)
{
if (GSEncodingSupported(encoding) == NO)
{
return 0;
}
return encodingTable[encoding]->iconv;
}
#endif
/** /**
* <strong>deprecated</strong> * <strong>deprecated</strong>
* See GSToUnicode() and GSFromUnicode() * See GSToUnicode() and GSFromUnicode()
@ -1050,14 +1099,15 @@ if (dst == 0) \
* adjust the offset into the local buffer on the \ * adjust the offset into the local buffer on the \
* stack and pretend the buffer has grown. \ * stack and pretend the buffer has grown. \
*/ \ */ \
ptr = buf - dpos; \
if (extra == 0) \ if (extra == 0) \
{ \ { \
bsize = dpos + BUFSIZ; \ ptr -= BUFSIZ; \
bsize += BUFSIZ; \
} \ } \
else \ else \
{ \ { \
bsize = dpos + BUFSIZ - 1; \ ptr -= BUFSIZ-1; \
bsize += BUFSIZ-1; \
} \ } \
} \ } \
else if (zone == 0) \ else if (zone == 0) \
@ -1415,9 +1465,13 @@ tables:
size_t outbytesleft; size_t outbytesleft;
size_t rval; size_t rval;
iconv_t cd; iconv_t cd;
const char *estr = iconv_stringforencoding(enc); const char *estr = 0;
BOOL done = NO; BOOL done = NO;
if (GSEncodingSupported(enc) == YES)
{
estr = encodingTable[enc]->iconv;
}
/* explicitly check for empty encoding name since some systems /* explicitly check for empty encoding name since some systems
* have buggy iconv_open() code which succeeds on an empty name. * have buggy iconv_open() code which succeeds on an empty name.
*/ */
@ -1549,6 +1603,9 @@ tables:
NSZoneFree(zone, ptr); NSZoneFree(zone, ptr);
} }
if (dst)
NSCAssert(*dst != buf, @"attempted to pass out pointer to internal buffer");
return result; return result;
} }
@ -1563,14 +1620,15 @@ if (dst == 0) \
* adjust the offset into the local buffer on the \ * adjust the offset into the local buffer on the \
* stack and pretend the buffer has grown. \ * stack and pretend the buffer has grown. \
*/ \ */ \
ptr = buf - dpos; \
if (extra == 0) \ if (extra == 0) \
{ \ { \
bsize = dpos + BUFSIZ; \ ptr -= BUFSIZ; \
bsize += BUFSIZ; \
} \ } \
else \ else \
{ \ { \
bsize = dpos + BUFSIZ - 1; \ ptr -= BUFSIZ-1; \
bsize += BUFSIZ-1; \
} \ } \
} \ } \
else if (zone == 0) \ else if (zone == 0) \
@ -1757,6 +1815,15 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
bsize = *size; bsize = *size;
} }
#ifdef HAVE_ICONV
if (strict == NO
&& enc != NSUTF8StringEncoding
&& enc != NSGSM0338StringEncoding)
{
goto iconv_start; // For lossy conversion
}
#endif
switch (enc) switch (enc)
{ {
case NSUTF8StringEncoding: case NSUTF8StringEncoding:
@ -1832,12 +1899,14 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
sl = 6; sl = 6;
} }
/* make sure we have enough space for it */
while (dpos + sl >= bsize)
{
GROW();
}
if (sl == 1) if (sl == 1)
{ {
if (dpos >= bsize)
{
GROW();
}
ptr[dpos++] = u & 0x7f; ptr[dpos++] = u & 0x7f;
} }
else else
@ -1852,18 +1921,10 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
u = u >> 6; u = u >> 6;
} }
if (dpos >= bsize)
{
GROW();
}
ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff); ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
/* add bytes into the output sequence */ /* add bytes into the output sequence */
for (i = sl - 2; i >= 0; i--) for (i = sl - 2; i >= 0; i--)
{ {
if (dpos >= bsize)
{
GROW();
}
ptr[dpos++] = reversed[i] | 0x80; ptr[dpos++] = reversed[i] | 0x80;
} }
} }
@ -1903,7 +1964,7 @@ bases:
} }
else else
{ {
ptr[dpos++] = '*'; ptr[dpos++] = '?';
} }
} }
} }
@ -2043,9 +2104,9 @@ tables:
else if (strict == NO) else if (strict == NO)
{ {
/* /*
* The default lossy mapping generates an asterisk. * The default lossy mapping generates a question mark.
*/ */
ptr[dpos++] = '*'; ptr[dpos++] = '?';
} }
else else
{ {
@ -2061,6 +2122,7 @@ tables:
default: default:
#ifdef HAVE_ICONV #ifdef HAVE_ICONV
iconv_start:
{ {
iconv_t cd; iconv_t cd;
unsigned char *inbuf; unsigned char *inbuf;
@ -2068,9 +2130,25 @@ tables:
size_t inbytesleft; size_t inbytesleft;
size_t outbytesleft; size_t outbytesleft;
size_t rval; size_t rval;
const char *estr = iconv_stringforencoding(enc); const char *estr = 0;
BOOL done = NO; BOOL done = NO;
if (GSEncodingSupported(enc) == YES)
{
if (strict == NO)
{
/*
* Try to transliterate where no direct conversion
* is available.
*/
estr = encodingTable[enc]->lossy;
}
if (estr == 0)
{
estr = encodingTable[enc]->iconv;
}
}
/* explicitly check for empty encoding name since some systems /* explicitly check for empty encoding name since some systems
* have buggy iconv_open() code which succeeds on an empty name. * have buggy iconv_open() code which succeeds on an empty name.
*/ */
@ -2131,11 +2209,11 @@ tables:
} }
/* /*
* If we are allowing lossy conversion, we replace any * If we are allowing lossy conversion, we replace any
* unconvertable character with an asterisk. * unconvertible character with a question mark.
*/ */
if (outbytesleft > 0) if (outbytesleft > 0)
{ {
*outbuf++ = '*'; *outbuf++ = '?';
outbytesleft--; outbytesleft--;
inbuf += sizeof(unichar); inbuf += sizeof(unichar);
inbytesleft -= sizeof(unichar); inbytesleft -= sizeof(unichar);
@ -2235,6 +2313,9 @@ tables:
NSZoneFree(zone, ptr); NSZoneFree(zone, ptr);
} }
if (dst)
NSCAssert(*dst != buf, @"attempted to pass out pointer to internal buffer");
return result; return result;
} }