/** Support functions for Unicode implementation Function to determine default c string encoding for GNUstep based on GNUSTEP_STRING_ENCODING environment variable. Copyright (C) 1997 Free Software Foundation, Inc. Written by: Stevo Crvenkovski < stevo@btinternet.com > Date: March 1997 Merged with GetDefEncoding.m and iconv by: Fred Kiefer Date: September 2000 Rewrite by: Richard Frith-Macdonald This file is part of the GNUstep Base Library. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111 USA. */ #include "config.h" #include #include #include #include #include #include #include "GSCompatibility.h" #include #include #include #include typedef struct {unichar from; unsigned char to;} _ucc_; #include "unicode/cyrillic.h" #include "unicode/latin2.h" #include "unicode/latin9.h" #include "unicode/nextstep.h" #include "unicode/caseconv.h" #include "unicode/cop.h" #include "unicode/decomp.h" #include "unicode/gsm0338.h" #include "unicode/thai.h" #ifdef HAVE_ICONV #ifdef HAVE_GICONV_H #include #else #include #endif #include /* * The whole of the GNUstep code stores UNICODE in internal byte order, * so we do the same. This should be UCS-2-INTERNAL for libiconv */ #ifdef WORDS_BIGENDIAN #define UNICODE_INT "UNICODEBIG" #else #define UNICODE_INT "UNICODELITTLE" #endif #define UNICODE_ENC ((unicode_enc) ? unicode_enc : internal_unicode_enc()) static const char *unicode_enc = NULL; /* Check to see what type of internal unicode format the library supports */ static const char * internal_unicode_enc(void) { iconv_t conv; unicode_enc = UNICODE_INT; conv = iconv_open(unicode_enc, "ASCII"); if (conv != (iconv_t)-1) { iconv_close(conv); return unicode_enc; } unicode_enc = "UCS-2-INTERNAL"; conv = iconv_open(unicode_enc, "ASCII"); if (conv != (iconv_t)-1) { iconv_close(conv); return unicode_enc; } unicode_enc = "UCS-2"; /* This had better work */ return unicode_enc; } #endif typedef unsigned char unc; static NSStringEncoding defEnc = GSUndefinedEncoding; static NSStringEncoding *_availableEncodings = 0; struct _strenc_ { NSStringEncoding enc; // Constant representing the encoding. const char *ename; // ASCII string representation of name. const char *iconv; /* Iconv name of encoding. If this * is the empty string, we cannot use * iconv perform conversions to/from * this encoding. * NB. do not put a null pointer in this * field in the table, use "" instread. */ BOOL eightBit; /* Flag to say whether this encoding * can be stored in a byte array ... * ie whether the encoding consists * entirely of single byte characters * and the first 128 are identical to * the ASCII character set. */ char supported; /* Is this supported? Some encodings * have builtin conversion to/from * unicode, but for others we must * check with iconv to see if it * supports them on this platform. * A one means supported. * A negative means unsupported. * A zero means not yet checked. */ }; /* * The str_encoding_table is a compact representation of all the string * encoding information we might need. It gets modified at runtime. */ static struct _strenc_ str_encoding_table[] = { {NSASCIIStringEncoding,"NSASCIIStringEncoding","ASCII",1,1}, {NSNEXTSTEPStringEncoding,"NSNEXTSTEPStringEncoding","NEXTSTEP",1,1}, {NSJapaneseEUCStringEncoding, "NSJapaneseEUCStringEncoding","EUC-JP",0,0}, {NSUTF8StringEncoding,"NSUTF8StringEncoding","UTF-8",0,1}, {NSISOLatin1StringEncoding,"NSISOLatin1StringEncoding","ISO-8859-1",1,1}, {NSSymbolStringEncoding,"NSSymbolStringEncoding","",0,0}, {NSNonLossyASCIIStringEncoding,"NSNonLossyASCIIStringEncoding","",1,1}, {NSShiftJISStringEncoding,"NSShiftJISStringEncoding","SHIFT-JIS",0,0}, {NSISOLatin2StringEncoding,"NSISOLatin2StringEncoding","ISO-8859-2",1,1}, {NSUnicodeStringEncoding, "NSUnicodeStringEncoding","",0,1}, {NSWindowsCP1251StringEncoding,"NSWindowsCP1251StringEncoding","CP1251",0,0}, {NSWindowsCP1252StringEncoding,"NSWindowsCP1252StringEncoding","CP1252",0,0}, {NSWindowsCP1253StringEncoding,"NSWindowsCP1253StringEncoding","CP1253",0,0}, {NSWindowsCP1254StringEncoding,"NSWindowsCP1254StringEncoding","CP1254",0,0}, {NSWindowsCP1250StringEncoding,"NSWindowsCP1250StringEncoding","CP1250",0,0}, {NSISO2022JPStringEncoding,"NSISO2022JPStringEncoding","ISO-2022-JP",0,0}, {NSMacOSRomanStringEncoding, "NSMacOSRomanStringEncoding","MACINTOSH",0,0}, {NSProprietaryStringEncoding, "NSProprietaryStringEncoding","",0,0}, // GNUstep additions {NSISOCyrillicStringEncoding,"NSISOCyrillicStringEncoding","ISO-8859-5",0,1}, {NSKOI8RStringEncoding, "NSKOI8RStringEncoding","KOI8-R",0,0}, {NSISOLatin3StringEncoding, "NSISOLatin3StringEncoding","ISO-8859-3",0,0}, {NSISOLatin4StringEncoding, "NSISOLatin4StringEncoding","ISO-8859-4",0,0}, {NSISOArabicStringEncoding, "NSISOArabicStringEncoding","ISO-8859-6",0,0}, {NSISOGreekStringEncoding, "NSISOGreekStringEncoding","ISO-8859-7",0,0}, {NSISOHebrewStringEncoding, "NSISOHebrewStringEncoding","ISO-8859-8",0,0}, {NSISOLatin5StringEncoding, "NSISOLatin5StringEncoding","ISO-8859-9",0,0}, {NSISOLatin6StringEncoding, "NSISOLatin6StringEncoding","ISO-8859-10",0,0}, {NSISOThaiStringEncoding, "NSISOThaiStringEncoding","ISO-8859-11",1,1}, {NSISOLatin7StringEncoding, "NSISOLatin7StringEncoding","ISO-8859-13",0,0}, {NSISOLatin8StringEncoding, "NSISOLatin8StringEncoding","ISO-8859-14",0,0}, {NSISOLatin9StringEncoding, "NSISOLatin9StringEncoding","ISO-8859-15",1,1}, {NSUTF7StringEncoding, "NSUTF7StringEncoding","",0,0}, {NSGB2312StringEncoding, "NSGB2312StringEncoding","EUC-CN",0,0}, {NSGSM0338StringEncoding, "NSGSM0338StringEncoding","",0,1}, {NSBIG5StringEncoding, "NSBIG5StringEncoding","BIG5",0,0}, {0,"Unknown encoding","",0,0} }; static struct _strenc_ **encodingTable = 0; static unsigned encTableSize = 0; static void GSSetupEncodingTable(void) { if (encodingTable == 0) { [gnustep_global_lock lock]; if (encodingTable == 0) { static struct _strenc_ **encTable = 0; unsigned count; unsigned i; /* * We want to store pointers to our string encoding info in a * large table so we can do efficient lookup by encoding value. */ #define MAX_ENCODING 128 count = sizeof(str_encoding_table) / sizeof(struct _strenc_); /* * First determine the largest encoding value and create a * large enough table of pointers. */ encTableSize = 0; for (i = 0; i < count; i++) { unsigned tmp = str_encoding_table[i].enc; if (tmp >= MAX_ENCODING) { fprintf(stderr, "ERROR ... illegal NSStringEncoding " "value in str_encoding_table. Ignored\n"); } else if (tmp > encTableSize) { encTableSize = tmp; } } encTable = malloc((encTableSize+1)*sizeof(struct _strenc_ *)); memset(encTable, 0, (encTableSize+1)*sizeof(struct _strenc_ *)); /* * Now set up the pointers at the correct location in the table. */ for (i = 0; i < count; i++) { unsigned tmp = str_encoding_table[i].enc; if (tmp < MAX_ENCODING) { encTable[tmp] = &str_encoding_table[i]; } } encodingTable = encTable; } [gnustep_global_lock unlock]; } } static BOOL GSEncodingSupported(NSStringEncoding enc) { GSSetupEncodingTable(); if (enc == 0 || enc > encTableSize || encodingTable[enc] == 0) { return NO; } #ifdef HAVE_ICONV if (encodingTable[enc]->iconv != 0 && encodingTable[enc]->supported == 0) { if (enc == NSUnicodeStringEncoding) { encodingTable[enc]->iconv = UNICODE_ENC; encodingTable[enc]->supported = 1; } else if (encodingTable[enc]->iconv[0] == 0) { /* explicitly check for empty encoding name since some systems * have buggy iconv_open() code which succeeds on an empty name. */ encodingTable[enc]->supported = -1; } else { iconv_t c; c = iconv_open(UNICODE_ENC, encodingTable[enc]->iconv); if (c == (iconv_t)-1) { encodingTable[enc]->supported = -1; } else { iconv_close(c); c = iconv_open(encodingTable[enc]->iconv, UNICODE_ENC); if (c == (iconv_t)-1) { encodingTable[enc]->supported = -1; } else { iconv_close(c); encodingTable[enc]->supported = 1; } } } } #endif if (encodingTable[enc]->supported == 1) { return YES; } return NO; } /** * Returns a nul terminated array of the available string encodings. */ NSStringEncoding * GetAvailableEncodings() { if (_availableEncodings == 0) { [gnustep_global_lock lock]; if (_availableEncodings == 0) { NSStringEncoding *encodings; unsigned pos; unsigned i; /* * Now build up a list of supported encodings ... in the * format needed to support [NSString+availableStringEncodings] * Check to see what iconv support we have as we go along. * This is also the place where we determine the name we use * for iconv to support unicode. */ GSSetupEncodingTable(); encodings = objc_malloc(sizeof(NSStringEncoding) * encTableSize); pos = 0; for (i = 0; i < encTableSize; i++) { if (GSEncodingSupported(i) == YES) { encodings[pos++] = i; } } encodings[pos] = 0; _availableEncodings = encodings; } [gnustep_global_lock unlock]; } return _availableEncodings; } /** Returns the NSStringEncoding that matches the specified * character set registry and encoding information. For instance, * for the iso8859-5 character set, the registry is iso8859 and * the encoding is 5, and the returned NSStringEncoding is * NSISOCyrillicStringEncoding. If there is no specific encoding, * use @"0". Returns GSUndefinedEncoding if there is no match. */ NSStringEncoding GSEncodingForRegistry (NSString *registry, NSString *encoding) { if ([registry isEqualToString: @"iso8859"]) { if ([encoding isEqualToString: @"1"]) return NSISOLatin1StringEncoding; else if ([encoding isEqualToString: @"2"]) return NSISOLatin2StringEncoding; else if ([encoding isEqualToString: @"3"]) return NSISOLatin3StringEncoding; else if ([encoding isEqualToString: @"4"]) return NSISOLatin4StringEncoding; else if ([encoding isEqualToString: @"5"]) return NSISOCyrillicStringEncoding; else if ([encoding isEqualToString: @"6"]) return NSISOArabicStringEncoding; else if ([encoding isEqualToString: @"7"]) return NSISOGreekStringEncoding; else if ([encoding isEqualToString: @"8"]) return NSISOHebrewStringEncoding; else if ([encoding isEqualToString: @"11"]) return NSISOThaiStringEncoding; else if ([encoding isEqualToString: @"15"]) return NSISOLatin9StringEncoding; // Other latin encodings are currently not supported } else if ([registry isEqualToString: @"iso10646"]) { if ([encoding isEqualToString: @"1"]) return NSUnicodeStringEncoding; } else if ([registry isEqualToString: @"microsoft"]) { if ([encoding isEqualToString: @"symbol"]) return NSSymbolStringEncoding; else if ([encoding isEqualToString: @"cp1250"]) return NSWindowsCP1250StringEncoding; else if ([encoding isEqualToString: @"cp1251"]) return NSWindowsCP1251StringEncoding; else if ([encoding isEqualToString: @"cp1252"]) return NSWindowsCP1252StringEncoding; else if ([encoding isEqualToString: @"cp1253"]) return NSWindowsCP1253StringEncoding; else if ([encoding isEqualToString: @"cp1254"]) return NSWindowsCP1254StringEncoding; } else if ([registry isEqualToString: @"apple"]) { if ([encoding isEqualToString: @"roman"]) return NSMacOSRomanStringEncoding; } else if ([registry isEqualToString: @"jisx0201.1976"]) { if ([encoding isEqualToString: @"0"]) return NSShiftJISStringEncoding; } else if ([registry isEqualToString: @"iso646.1991"]) { if ([encoding isEqualToString: @"irv"]) return NSASCIIStringEncoding; } else if ([registry isEqualToString: @"koi8"]) { if ([encoding isEqualToString: @"r"]) return NSKOI8RStringEncoding; } else if ([registry isEqualToString: @"gb2312.1980"]) { if ([encoding isEqualToString: @"0"]) return NSGB2312StringEncoding; } else if ([registry isEqualToString: @"big5"]) { if ([encoding isEqualToString: @"0"]) return NSBIG5StringEncoding; } else if ([registry isEqualToString:@"utf8"] || [registry isEqualToString:@"utf-8"] ) { return NSUTF8StringEncoding; } return GSUndefinedEncoding; } /** Try to deduce the string encoding from the locale string * clocale. This function looks in the Locale.encodings file * installed as part of GNUstep Base if the encoding cannot be * deduced from the clocale string itself. If clocale isn't set or * no match can be found, returns GSUndefinedEncoding. */ /* It would be really nice if this could be used in GetDefEncoding, but * there are too many dependancies on other parts of the library to * make this practical (even if everything possible was written in C, * we'd still need some way to find the Locale.encodings file). */ NSStringEncoding GSEncodingFromLocale(const char *clocale) { NSStringEncoding encoding; NSString *encodstr; if (clocale == NULL || strcmp(clocale, "C") == 0 || strcmp(clocale, "POSIX") == 0) { /* Don't make any assumptions. Let caller handle that */ return GSUndefinedEncoding; } if (strchr (clocale, '.') != NULL) { /* Locale contains the 'codeset' section. Parse it and see if we know what encoding this cooresponds to */ NSString *registry; NSArray *array; char *s; s = strchr (clocale, '.'); registry = [[NSString stringWithCString: s+1] lowercaseString]; array = [registry componentsSeparatedByString: @"-"]; registry = [array objectAtIndex: 0]; if ([array count] > 1) { encodstr = [array lastObject]; } else { encodstr = @"0"; } encoding = GSEncodingForRegistry(registry, encodstr); } else { /* Look up the locale in our table of encodings */ NSString *table; table = [NSBundle pathForGNUstepResource: @"Locale" ofType: @"encodings" inDirectory: @"Resources/Languages"]; if (table != nil) { int count; NSDictionary *dict; dict = [NSDictionary dictionaryWithContentsOfFile: table]; encodstr = [dict objectForKey: [NSString stringWithCString: clocale]]; if (encodstr == nil) return GSUndefinedEncoding; /* Find the matching encoding */ count = 0; while (str_encoding_table[count].enc && strcmp(str_encoding_table[count].ename, [encodstr lossyCString])) { count++; } if (str_encoding_table[count].enc) { encoding = str_encoding_table[count].enc; } if (encoding == GSUndefinedEncoding) { NSLog(@"No known GNUstep encoding for %s = %@", clocale, encodstr); } } } return encoding; } /** * Return the default encoding */ NSStringEncoding GetDefEncoding() { if (defEnc == GSUndefinedEncoding) { char *encoding; unsigned int count; [gnustep_global_lock lock]; if (defEnc != GSUndefinedEncoding) { [gnustep_global_lock unlock]; return defEnc; } GSSetupEncodingTable(); encoding = getenv("GNUSTEP_STRING_ENCODING"); if (encoding != 0) { count = 0; while (str_encoding_table[count].enc && strcmp(str_encoding_table[count].ename, encoding)) { count++; } if (str_encoding_table[count].enc) { defEnc = str_encoding_table[count].enc; } else { fprintf(stderr, "WARNING: %s - encoding not supported.\n", encoding); fprintf(stderr, " NSISOLatin1StringEncoding set as default.\n"); defEnc = NSISOLatin1StringEncoding; } } if (defEnc == GSUndefinedEncoding) { /* Encoding not set */ defEnc = NSISOLatin1StringEncoding; } else if (GSEncodingSupported(defEnc) == NO) { fprintf(stderr, "WARNING: %s - encoding not implemented as " "default c string encoding.\n", encoding); fprintf(stderr, " NSISOLatin1StringEncoding set as default.\n"); defEnc = NSISOLatin1StringEncoding; } [gnustep_global_lock unlock]; } return defEnc; } BOOL GSIsByteEncoding(NSStringEncoding encoding) { if (GSEncodingSupported(encoding) == NO) { return NO; } return encodingTable[encoding]->eightBit; } /** * Returns the standard name for the specified encoding. */ NSString* GSEncodingName(NSStringEncoding encoding) { if (GSEncodingSupported(encoding) == NO) { return @"Unknown encoding"; } return [NSString stringWithCString: encodingTable[encoding]->ename]; } /** * deprecated Use GSEncodingName() */ NSString* GetEncodingName(NSStringEncoding encoding) { return GSEncodingName(encoding); } static const char * iconv_stringforencoding(NSStringEncoding encoding) { if (GSEncodingSupported(encoding) == NO) { return 0; } return encodingTable[encoding]->iconv; } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ unichar encode_chartouni(unsigned char c, NSStringEncoding enc) { BOOL result; unsigned int size = 1; unichar u = 0; unichar *dst = &u; result = GSToUnicode(&dst, &size, &c, 1, enc, 0, 0); if (result == NO) { return 0; } return u; } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ unsigned char encode_unitochar(unichar u, NSStringEncoding enc) { BOOL result; unsigned int size = 1; unsigned char c = 0; unsigned char *dst = &c; result = GSFromUnicode(&dst, &size, &u, 1, enc, 0, 0); if (result == NO) { return 0; } return c; } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ unsigned encode_unitochar_strict(unichar u, NSStringEncoding enc) { BOOL result; unsigned int size = 1; unsigned char c = 0; unsigned char *dst = &c; result = GSFromUnicode(&dst, &size, &u, 1, enc, 0, GSUniStrict); if (result == NO) { return 0; } return c; } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ unichar chartouni(unsigned char c) { if (defEnc == GSUndefinedEncoding) { defEnc = GetDefEncoding(); } return encode_chartouni(c, defEnc); } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ unsigned char unitochar(unichar u) { if (defEnc == GSUndefinedEncoding) { defEnc = GetDefEncoding(); } return encode_unitochar(u, defEnc); } /** * Uses direct access into a two-level table to map cases.
* The two-level table method is less space efficient (but still not bad) than * a single table and a linear search, but it reduces the number of * conditional statements to just one. */ unichar uni_tolower(unichar ch) { unichar result = gs_tolower_map[ch / 256][ch % 256]; return result ? result : ch; } /** * Uses direct access into a two-level table to map cases.
* The two-level table method is less space efficient (but still not bad) than * a single table and a linear search, but it reduces the number of * conditional statements to just one. */ unichar uni_toupper(unichar ch) { unichar result = gs_toupper_map[ch / 256][ch % 256]; return result ? result : ch; } unsigned char uni_cop(unichar u) { if (u < uni_cop_table[0].code) { return 0; // Special case for latin1 } else { unichar code; unichar count = 0; unichar first = 0; unichar last = uni_cop_table_size; while (first <= last) { if (first != last) { count = (first + last) / 2; code = uni_cop_table[count].code; if (code < u) { first = count+1; } else if (code > u) { last = count-1; } else { return uni_cop_table[count].cop; } } else /* first == last */ { if (u == uni_cop_table[first].code) { return uni_cop_table[first].cop; } return 0; } } return 0; } } BOOL uni_isnonsp(unichar u) { // FIXME check is uni_cop good for this if (uni_cop(u)) return YES; else return NO; } unichar* uni_is_decomp(unichar u) { if (u < uni_dec_table[0].code) { return 0; // Special case for latin1 } else { unichar code; unichar count = 0; unichar first = 0; unichar last = uni_dec_table_size; while (first <= last) { if (first != last) { count = (first + last) / 2; code = uni_dec_table[count].code; if (code < u) { first = count+1; } else if (code > u) { last = count-1; } else { return uni_dec_table[count].decomp; } } else /* first == last */ { if (u == uni_dec_table[first].code) { return uni_dec_table[first].decomp; } return 0; } } return 0; } } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ int encode_ustrtocstr(char *dst, int dl, const unichar *src, int sl, NSStringEncoding enc, BOOL strict) { BOOL result; unsigned int options = (strict == YES) ? GSUniStrict : 0; unsigned int old = dl; result = GSFromUnicode((unsigned char**)&dst, (unsigned int*)&dl, src, sl, enc, 0, options); if (result == NO) { return 0; } return old - dl; // Number of characters. } /** * deprecated * See GSToUnicode() and GSFromUnicode() */ int encode_cstrtoustr(unichar *dst, int dl, const char *src, int sl, NSStringEncoding enc) { BOOL result; unsigned int old = dl; result = GSToUnicode(&dst, &dl, src, sl, enc, 0, 0); if (result == NO) { return 0; } return old - dl; } #define GROW() \ if (dst == 0) \ { \ /* \ * Data is just being discarded anyway, so we can \ * adjust the offset into the local buffer on the \ * stack and pretend the buffer has grown. \ */ \ ptr -= BUFSIZ; \ bsize += BUFSIZ; \ } \ else if (zone == 0) \ { \ result = NO; /* No buffer growth possible ... fail. */ \ break; \ } \ else \ { \ unsigned grow = slen; \ \ if (grow < bsize + BUFSIZ) \ { \ grow = bsize + BUFSIZ; \ } \ grow *= sizeof(unichar); \ \ if (ptr == buf || ptr == *dst) \ { \ unichar *tmp; \ \ tmp = NSZoneMalloc(zone, grow + extra); \ if (tmp != 0) \ { \ memcpy(tmp, ptr, bsize * sizeof(unichar)); \ } \ ptr = tmp; \ } \ else \ { \ ptr = NSZoneRealloc(zone, ptr, grow + extra); \ } \ if (ptr == 0) \ { \ result = NO; /* Not enough memory */ \ break; \ } \ bsize = grow / sizeof(unichar); \ } /** * Function to convert from 8-bit character data to 16-bit unicode. *

The dst argument is a pointer to a pointer to a buffer in which the * converted string is to be stored. If it is a null pointer, this function * discards converted data, and is used only to determine the length of the * converted string. If the zone argument is non-nul, the function is free * to allocate a larger buffer if necessary, and store this new buffer in * the dst argument. It will *NOT* deallocate the original buffer! *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be * altered to the new size. This is measured in characters, not bytes. *

*

The src argument is a pointer to the 8-bit character string which is * to be converted to 16-bit unicode. *

*

The slen argument is the length (bytes) of the 8-bit character string * which is to be converted to 16-bit unicode. * This is measured in characters, not bytes. *

*

The end argument specifies the encoding type of the 8-bit character * string which is to be converted to 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may * allocate a buffer to return data in. * If this is nul, the function will fail if the originally supplied buffer * is not big enough (unless dst is a null pointer ... indicating that * converted data is to be discarded). *

* The options argument controls some special behavior. * * If GSUniTerminate is set, the function is expected to null terminate * the output string, and will assume that it is safe to place the nul * just beyond the ned of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra * termination character. * If GSUniTemporary is set, the function will return the results in * an autoreleased buffer rather than in a buffer that the caller must * release. * If GSUniBOM is set, the function will write the first unicode * character as a byte order marker. * If GSUniShortOk is set, the function will return a buffer containing * any decoded characters even if the whole conversion fails. * *

On return, the function result is a flag indicating success (YES) * or failure (NO), and on success, the value stored in size is the number * of characters in the converted string. The converted string itsself is * stored in the location gioven by dst.
* NB. If the value stored in dst has been changed, it is a pointer to * allocated memory which the caller is responsible for freeing, and the * caller is still responsible for freeing the original buffer. *

*/ BOOL GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src, unsigned int slen, NSStringEncoding enc, NSZone *zone, unsigned int options) { unichar buf[BUFSIZ]; unichar *ptr; unsigned bsize; unsigned dpos = 0; // Offset into destination buffer. unsigned spos = 0; // Offset into source buffer. unsigned extra = (options & GSUniTerminate) ? sizeof(unichar) : 0; unichar base = 0; unichar *table = 0; BOOL result = YES; /* * Ensure we have an initial buffer set up to decode data into. */ if (dst == 0 || *size == 0) { ptr = buf; bsize = (extra != 0) ? BUFSIZ - 1 : BUFSIZ; } else { ptr = *dst; bsize = *size; } if (options & GSUniBOM) { while (dpos >= bsize) { GROW(); } ptr[dpos++] = (unichar)0xFEFF; // Insert byte order marker. } switch (enc) { case NSUTF8StringEncoding: { result = YES; while (spos < slen) { unsigned char c = src[spos++]; unichar u = c; if (c > 0x7f) { unsigned char c1; if (spos == slen) { result = NO; // Second byte is missing. break; } if (c < 0xe0) { if (c < 0xc2) { /* * Either we are inside a multibyte sequence or * we have a bad multibyte character count. */ result = NO; break; } c1 = src[spos++]; if (!((c1 ^ 0x80) < 0x40)) { /* * Second byte in sequence is not a legal * continuation. */ result = NO; break; } u = ((c & 0x1f) << 6) | (c1 ^ 0x80); } else if (c < 0xf0) { unsigned char c1; unsigned char c2; c1 = src[spos++]; if (spos == slen) { result = NO; // Third byte is missing. break; } c2 = src[spos++]; if (!(((c1 ^ 0x80) < 0x40) && ((c2 ^ 0x80) < 0x40) && (c1 >= 0xa0))) { result = NO; // Invalid sequence. break; } u = ((c & 0x0f) << 12) | ((c1 ^ 0x80) << 6) | (c2 ^ 0x80); } else { /* * Sequence not legal or too long for conversion to * two byte unicode. */ result = NO; break; } } if (dpos >= bsize) { GROW(); } ptr[dpos++] = u; } } break; case NSNonLossyASCIIStringEncoding: case NSASCIIStringEncoding: case NSISOLatin1StringEncoding: case NSUnicodeStringEncoding: while (spos < slen) { if (dpos >= bsize) { GROW(); } ptr[dpos++] = (unichar)((unc)src[spos++]); } break; case NSNEXTSTEPStringEncoding: base = Next_conv_base; table = Next_char_to_uni_table; goto tables; case NSISOCyrillicStringEncoding: base = Cyrillic_conv_base; table = Cyrillic_char_to_uni_table; goto tables; case NSISOLatin2StringEncoding: base = Latin2_conv_base; table = Latin2_char_to_uni_table; goto tables; case NSISOLatin9StringEncoding: base = Latin9_conv_base; table = Latin9_char_to_uni_table; goto tables; case NSISOThaiStringEncoding: base = Thai_conv_base; table = Thai_char_to_uni_table; goto tables; #if 0 case NSSymbolStringEncoding: base = Symbol_conv_base; table = Symbol_char_to_uni_table; goto tables; #endif tables: while (spos < slen) { unc c = (unc)src[spos]; if (dpos >= bsize) { GROW(); } if (c < base) { ptr[dpos++] = c; } else { ptr[dpos++] = table[c - base]; } spos++; } break; case NSGSM0338StringEncoding: while (spos < slen) { unc c = (unc)src[spos]; if (dpos >= bsize) { GROW(); } ptr[dpos] = GSM0338_char_to_uni_table[c]; if (c == 0x1b && spos < slen) { unsigned i = 0; c = (unc)src[spos+1]; while (i < sizeof(GSM0338_escapes)/sizeof(GSM0338_escapes[0])) { if (GSM0338_escapes[i].to == c) { ptr[dpos] = GSM0338_escapes[i].from; spos++; break; } i++; } } dpos++; spos++; } break; default: #ifdef HAVE_ICONV { char *inbuf; char *outbuf; size_t inbytesleft; size_t outbytesleft; size_t rval; iconv_t cd; const char *estr = iconv_stringforencoding(enc); /* explicitly check for empty encoding name since some systems * have buggy iconv_open() code which succeeds on an empty name. */ if (estr == 0) { NSLog(@"No iconv for encoding x%02x", enc); result = NO; break; } cd = iconv_open(UNICODE_ENC, estr); if (cd == (iconv_t)-1) { NSLog(@"No iconv for encoding %@ tried to use %s", GetEncodingName(enc), estr); result = NO; break; } inbuf = (char*)src; inbytesleft = slen; outbuf = (char*)ptr; outbytesleft = bsize * sizeof(unichar); while (inbytesleft > 0) { if (dpos >= bsize) { unsigned old = bsize; GROW(); outbuf = (char*)&ptr[dpos]; outbytesleft += (bsize - old) * sizeof(unichar); } rval = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); dpos = (bsize * sizeof(unichar) - outbytesleft) / sizeof(unichar); if (rval == (size_t)-1) { if (errno == E2BIG) { unsigned old = bsize; GROW(); outbuf = (char*)&ptr[dpos]; outbytesleft += (bsize - old) * sizeof(unichar); } else { result = NO; break; } } } // close the converter iconv_close(cd); } #else result = NO; #endif } /* * Post conversion ... set output values. */ if (extra != 0) { ptr[dpos] = (unichar)0; } *size = dpos; if (dst != 0 && (result == YES || (options & GSUniShortOk))) { if (options & GSUniTemporary) { unsigned bytes = dpos * sizeof(unichar) + extra; void *r; /* * Temporary string was requested ... make one. */ r = _fastMallocBuffer(bytes); memcpy(r, ptr, bytes); if (ptr != buf && (dst == 0 || ptr != *dst)) { NSZoneFree(zone, ptr); } ptr = r; } else if (zone != 0 && (ptr == buf || bsize > dpos)) { unsigned bytes = dpos * sizeof(unichar) + extra; /* * Resizing is permitted, try ensure we return a buffer which * is just big enough to hold the converted string. */ if (ptr == buf || ptr == *dst) { unichar *tmp; tmp = NSZoneMalloc(zone, bytes); if (tmp != 0) { memcpy(tmp, ptr, bytes); } ptr = tmp; } else { ptr = NSZoneRealloc(zone, ptr, bytes); } } *dst = ptr; } else if (ptr != buf && dst != 0 && ptr != *dst) { NSZoneFree(zone, ptr); } return result; } #undef GROW #define GROW() \ if (dst == 0) \ { \ /* \ * Data is just being discarded anyway, so we can \ * adjust the offset into the local buffer on the \ * stack and pretend the buffer has grown. \ */ \ ptr -= BUFSIZ; \ bsize += BUFSIZ; \ } \ else if (zone == 0) \ { \ result = NO; /* No buffer growth possible ... fail. */ \ break; \ } \ else \ { \ unsigned grow = slen; \ \ if (grow < bsize + BUFSIZ) \ { \ grow = bsize + BUFSIZ; \ } \ \ if (ptr == buf || ptr == *dst) \ { \ unsigned char *tmp; \ \ tmp = NSZoneMalloc(zone, grow + extra); \ if (tmp != 0) \ { \ memcpy(tmp, ptr, bsize); \ } \ ptr = tmp; \ } \ else \ { \ ptr = NSZoneRealloc(zone, ptr, grow + extra); \ } \ if (ptr == 0) \ { \ result = NO; /* Not enough memory */ \ break; \ } \ bsize = grow; \ } static inline int chop(unichar c, _ucc_ *table, int hi) { int lo = 0; while (hi > lo) { int i = (hi + lo) / 2; unichar from = table[i].from; if (from < c) { lo = i + 1; } else if (from > c) { hi = i; } else { return i; // Found } } return -1; // Not found } /** * Function to convert from 16-bit unicode to 8-bit character data. *

The dst argument is a pointer to a pointer to a buffer in which the * converted string is to be stored. If it is a null pointer, this function * discards converted data, and is used only to determine the length of the * converted string. If the zone argument is non-nul, the function is free * to allocate a larger buffer if necessary, and store this new buffer in * the dst argument. It will *NOT* deallocate the original buffer! *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be * altered to the new size. This is measured in characters, not bytes. *

*

The src argument is a pointer to the 16-bit unicode string which is * to be converted to 8-bit data. *

*

The slen argument is the length (bytes) of the 16-bit unicode string * which is to be converted to 8-bit data. * This is measured in characters, not bytes. *

*

The end argument specifies the encoding type of the 8-bit character * string which is to be produced from the 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may * allocate a buffer to return data in. * If this is nul, the function will fail if the originally supplied buffer * is not big enough (unless dst is a null pointer ... indicating that * converted data is to be discarded). *

* The options argument controls some special behavior. * * If GSUniStrict is set, the function will fail if a character is * encountered which can't be displayed in the source. Otherwise, some * approximation or marker will be placed in the destination. * If GSUniTerminate is set, the function is expected to null terminate * the output string, and will assume that it is safe to place the nul * just beyond the ned of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra * termination character. * If GSUniTemporary is set, the function will return the results in * an autoreleased buffer rather than in a buffer that the caller must * release. * If GSUniBOM is set, the function will read the first unicode * character as a byte order marker. * If GSUniShortOk is set, the function will return a buffer containing * any decoded characters even if the whole conversion fails. * *

On return, the function result is a flag indicating success (YES) * or failure (NO), and on success, the value stored in size is the number * of characters in the converted string. The converted string itsself is * stored in the location gioven by dst.
* NB. If the value stored in dst has been changed, it is a pointer to * allocated memory which the caller is responsible for freeing, and the * caller is still responsible for freeing the original buffer. *

*/ BOOL GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src, unsigned int slen, NSStringEncoding enc, NSZone *zone, unsigned int options) { unsigned char buf[BUFSIZ]; unsigned char *ptr; unsigned bsize; unsigned dpos = 0; // Offset into destination buffer. unsigned spos = 0; // Offset into source buffer. unsigned extra = (options & GSUniTerminate) ? 1 : 0; BOOL strict = (options & GSUniStrict) ? YES : NO; unichar base = 0; _ucc_ *table = 0; unsigned tsize = 0; unsigned char escape = 0; _ucc_ *etable = 0; unsigned etsize = 0; _ucc_ *ltable = 0; unsigned ltsize = 0; BOOL swapped = NO; BOOL result = YES; if (options & GSUniBOM) { if (slen == 0) { *size = 0; result = NO; // Missing byte order marker. } else { unichar c; c = *src++; slen--; if (c != 0xFEFF) { if (c == 0xFFFE) { swapped = YES; } else { *size = 0; result = NO; // Illegal byte order marker. } } } } /* * Ensure we have an initial buffer set up to decode data into. */ if (dst == 0 || *size == 0) { ptr = buf; bsize = (extra != 0) ? BUFSIZ - 1 : BUFSIZ; } else { ptr = *dst; bsize = *size; } switch (enc) { case NSUTF8StringEncoding: { while (spos < slen) { unichar u = src[spos++]; unsigned multi; if (swapped == YES) { u = ((u & 0xff00 >> 8) + ((u & 0x00ff) << 8)); } if (u < 0x0080) { multi = 0; } else if (u < 0x0800) { multi = 1; } else { multi = 2; } if (dpos + multi >= bsize) { GROW(); } if (u < 0x80) { ptr[dpos++] = u; } else if (u < 0x800) { ptr[dpos++] = (u >> 6) | 0xc0; ptr[dpos++] = (u & 0x3f) | 0x80; } else { ptr[dpos++] = (u >> 12) | 0xe0; ptr[dpos++] = (u >> 6) | 0x80; ptr[dpos++] = (u & 0x3f) | 0x80; } } result = YES; } break; case NSNonLossyASCIIStringEncoding: case NSASCIIStringEncoding: base = 128; goto bases; case NSISOLatin1StringEncoding: case NSUnicodeStringEncoding: base = 256; goto bases; bases: if (strict == NO) { while (spos < slen) { unichar u = src[spos++]; if (swapped == YES) { u = ((u & 0xff00 >> 8) + ((u & 0x00ff) << 8)); } if (dpos >= bsize) { GROW(); } if (u < base) { ptr[dpos++] = (char)u; } else { ptr[dpos++] = '*'; } } } else { while (spos < slen) { unichar u = src[spos++]; if (swapped == YES) { u = ((u & 0xff00 >> 8) + ((u & 0x00ff) << 8)); } if (dpos >= bsize) { GROW(); } if (u < base) { ptr[dpos++] = (char)u; } else { result = NO; break; } } } break; case NSNEXTSTEPStringEncoding: base = (unichar)Next_conv_base; table = Next_uni_to_char_table; tsize = Next_uni_to_char_table_size; goto tables; case NSISOCyrillicStringEncoding: base = (unichar)Cyrillic_conv_base; table = Cyrillic_uni_to_char_table; tsize = Cyrillic_uni_to_char_table_size; goto tables; case NSISOLatin2StringEncoding: base = (unichar)Latin2_conv_base; table = Latin2_uni_to_char_table; tsize = Latin2_uni_to_char_table_size; goto tables; case NSISOLatin9StringEncoding: base = (unichar)Latin9_conv_base; table = Latin9_uni_to_char_table; tsize = Latin9_uni_to_char_table_size; goto tables; case NSISOThaiStringEncoding: base = (unichar)Thai_conv_base; table = Thai_uni_to_char_table; tsize = Thai_uni_to_char_table_size; goto tables; #if 0 case NSSymbolStringEncoding: base = (unichar)Symbol_conv_base; table = Symbol_uni_to_char_table; tsize = Symbol_uni_to_char_table_size; goto tables; #endif case NSGSM0338StringEncoding: base = 0; table = GSM0338_uni_to_char_table; tsize = GSM0338_tsize; escape = 0x1b; etable = GSM0338_escapes; etsize = GSM0338_esize; if (strict == NO) { ltable = GSM0338_lossy; ltsize = GSM0338_lsize; } goto tables; tables: while (spos < slen) { unichar u = src[spos++]; int i; /* Swap byte order if necessary */ if (swapped == YES) { u = ((u & 0xff00 >> 8) + ((u & 0x00ff) << 8)); } /* Grow output buffer to make room if necessary */ if (dpos >= bsize) { GROW(); } if (u < base) { /* * The character set has a lower section whose contents * are identical to unicode, so no mapping is needed. */ ptr[dpos++] = (char)u; } else if (table != 0 && (i = chop(u, table, tsize)) >= 0) { /* * The character mapping is found in a basic table. */ ptr[dpos++] = table[i].to; } else if (etable != 0 && (i = chop(u, etable, etsize)) >= 0) { /* * The character mapping is found in a table of simple * escape sequences consisting of an escape byte followed * by another single byte. */ ptr[dpos++] = escape; if (dpos >= bsize) { GROW(); } ptr[dpos++] = etable[i].to; } else if (ltable != 0 && (i = chop(u, ltable, ltsize)) >= 0) { /* * The character is found in a lossy mapping table. */ ptr[dpos++] = ltable[i].to; } else if (strict == NO) { /* * The default lossy mapping generates an asterisk. */ ptr[dpos++] = '*'; } else { /* * No mapping has been found. */ result = NO; spos = slen; break; } } break; default: #ifdef HAVE_ICONV { iconv_t cd; char *inbuf; char *outbuf; size_t inbytesleft; size_t outbytesleft; size_t rval; const char *estr = iconv_stringforencoding(enc); /* explicitly check for empty encoding name since some systems * have buggy iconv_open() code which succeeds on an empty name. */ if (estr == 0) { NSLog(@"No iconv for encoding x%02x", enc); result = NO; break; } cd = iconv_open(estr, UNICODE_ENC); if (cd == (iconv_t)-1) { NSLog(@"No iconv for encoding %@ tried to use %s", GetEncodingName(enc), estr); result = NO; break; } inbuf = (char*)src; inbytesleft = slen * sizeof(unichar); outbuf = (char*)ptr; outbytesleft = bsize; while (inbytesleft > 0) { if (dpos >= bsize) { unsigned old = bsize; GROW(); outbuf = (char*)&ptr[dpos]; outbytesleft += (bsize - old); } rval = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); dpos = bsize - outbytesleft; if (rval != 0) { if (rval == (size_t)-1) { if (errno == E2BIG) { unsigned old = bsize; GROW(); outbuf = (char*)&ptr[dpos]; outbytesleft += (bsize - old); } else if (errno == EILSEQ) { if (strict == YES) { result = NO; break; } /* * If we are allowing lossy conversion, we replace any * unconvertable character with an asterisk. */ if (outbytesleft > 0) { *outbuf++ = '*'; outbytesleft--; inbuf += sizeof(unichar); inbytesleft -= sizeof(unichar); } } else { result = NO; break; } } else if (strict == YES) { /* * A positive return from iconv indicates some * irreversible (ie lossy) conversions took place, * so if we are doing strict conversions we must fail. */ result = NO; break; } } } // close the converter iconv_close(cd); } #else result = NO; break; #endif } /* * Post conversion ... set output values. */ if (extra != 0) { ptr[dpos] = (unsigned char)0; } *size = dpos; if (dst != 0 && (result == YES || (options & GSUniShortOk))) { if (options & GSUniTemporary) { unsigned bytes = dpos + extra; void *r; /* * Temporary string was requested ... make one. */ r = _fastMallocBuffer(bytes); memcpy(r, ptr, bytes); if (ptr != buf && (dst == 0 || ptr != *dst)) { NSZoneFree(zone, ptr); } ptr = r; } else if (zone != 0 && (ptr == buf || bsize > dpos)) { unsigned bytes = dpos + extra; /* * Resizing is permitted - try ensure we return a buffer * which is just big enough to hold the converted string. */ if (ptr == buf || ptr == *dst) { unsigned char *tmp; tmp = NSZoneMalloc(zone, bytes); if (tmp != 0) { memcpy(tmp, ptr, bytes); } ptr = tmp; } else { ptr = NSZoneRealloc(zone, ptr, bytes); } } *dst = ptr; } else if (ptr != buf && dst != 0 && ptr != *dst) { NSZoneFree(zone, ptr); } return result; } #undef GROW