/** Support functions for Unicode implementation Function to determine default c string encoding for GNUstep based on GNUSTEP_STRING_ENCODING environment variable. Copyright (C) 1997 Free Software Foundation, Inc. Written by: Stevo Crvenkovski < stevo@btinternet.com > Date: March 1997 Merged with GetDefEncoding.m and iconv by: Fred Kiefer Date: September 2000 Rewrite by: Richard Frith-Macdonald This file is part of the GNUstep Base Library. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 USA. 
*/

#import "common.h"
#if defined(NeXT_Foundation_LIBRARY)
#import <Foundation/Foundation.h>
#else
#import "Foundation/NSArray.h"
#import "Foundation/NSBundle.h"
#import "Foundation/NSDictionary.h"
#import "Foundation/NSError.h"
#import "Foundation/NSException.h"
#import "Foundation/NSLock.h"
#import "Foundation/NSPathUtilities.h"
#endif

#import "GNUstepBase/GSLock.h"
#import "GNUstepBase/GSMime.h"
#import "GNUstepBase/NSLock+GNUstepBase.h"
#import "GNUstepBase/Unicode.h"

#import "../GSPrivate.h"

#include <stdio.h>      /* fprintf(), stderr */
#include <string.h>     /* strlen(), strcpy(), strcat(), strcmp(), mem*() */

#if HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif

/* One entry of a unicode -> 8-bit lookup table (see chop()). */
typedef struct {unichar from; unsigned char to;} _ucc_;

#include "unicode/cyrillic.h"
#include "unicode/latin2.h"
#include "unicode/latin9.h"
#include "unicode/nextstep.h"
#include "unicode/caseconv.h"
#include "unicode/cop.h"
#include "unicode/decomp.h"
#include "unicode/gsm0338.h"
#include "unicode/thai.h"

#ifdef HAVE_ICONV
#ifdef HAVE_GICONV_H
#include <giconv.h>
#else
#include <iconv.h>
#endif
#include <errno.h>

/*
 * The whole of the GNUstep code stores UNICODE in internal byte order,
 * so we do the same. We have switched to using UTF16 so the defines here
 * recognise this. We use the endian specific versions of UTF16 so that
 * iconv does not introduce a BOM where we do not want it.
 * If UTF16 does not work, we revert to UCS-2-INTERNAL.
 */
#ifdef WORDS_BIGENDIAN
#define UNICODE_UTF16 "UTF-16BE"
#define UNICODE_UTF32 "UTF-32BE"
#define UNICODE_INT "UNICODEBIG"
#else
#define UNICODE_UTF16 "UTF-16LE"
#define UNICODE_UTF32 "UTF-32LE"
#define UNICODE_INT "UNICODELITTLE"
#endif

/* Name of the iconv encoding used for unichar data, determined lazily. */
#define UNICODE_ENC ((unicode_enc) ? unicode_enc : internal_unicode_enc())

static const char *unicode_enc = NULL;

/* Returns YES if iconv can open a converter from ASCII to the named
 * encoding, NO otherwise.  The converter is closed again immediately.
 */
static BOOL
gs_iconv_accepts(const char *name)
{
  iconv_t       conv = iconv_open(name, "ASCII");

  if (conv == (iconv_t)-1)
    {
      return NO;
    }
  iconv_close(conv);
  return YES;
}

/* Check to see what type of internal unicode format the library supports.
 * Candidates are tried in order of preference; unicode_enc is updated as
 * a side effect and the chosen name is returned.
 */
static const char *
internal_unicode_enc(void)
{
  unicode_enc = UNICODE_UTF16;
  if (gs_iconv_accepts(unicode_enc))
    {
      return unicode_enc;
    }
  fprintf(stderr, "Could not initialise iconv() for UTF16, using UCS-2\n");
  fprintf(stderr, "Using characters outside 16 bits may give bad results.\n");

  unicode_enc = UNICODE_INT;
  if (gs_iconv_accepts(unicode_enc))
    {
      return unicode_enc;
    }

  unicode_enc = "UCS-2-INTERNAL";
  if (gs_iconv_accepts(unicode_enc))
    {
      return unicode_enc;
    }

  unicode_enc = "UCS-2";
  /* This had better work */
  return unicode_enc;
}
#else
#define UNICODE_UTF32 ""
#endif

static GSLazyLock *local_lock = nil;

typedef unsigned char unc;
static NSStringEncoding defEnc = GSUndefinedEncoding;
static NSStringEncoding natEnc = GSUndefinedEncoding;
static NSStringEncoding *_availableEncodings = 0;

struct _strenc_ {
  NSStringEncoding      enc;    // Constant representing the encoding.
  const char            *ename; // ASCII string representation of name.
  const char            *iconv; /* Iconv name of encoding.  If this
                                 * is the empty string, we cannot use
                                 * iconv to perform conversions to/from
                                 * this encoding.
                                 * NB. do not put a null pointer in this
                                 * field in the table, use "" instead.
                                 */
  BOOL                  eightBit; /* Flag to say whether this encoding
                                 * can be stored in a byte array ...
                                 * ie whether the encoding consists
                                 * entirely of single byte characters
                                 * and the first 128 are identical to
                                 * the ASCII character set.
                                 */
  char                  supported; /* Is this supported?  Some encodings
                                 * have builtin conversion to/from
                                 * unicode, but for others we must
                                 * check with iconv to see if it
                                 * supports them on this platform.
                                 * A one means supported.
                                 * A negative means unsupported.
                                 * A zero means not yet checked.
                                 */
  const char            *lossy; /* Iconv name for lossy encoding */
};

/*
 * The str_encoding_table is a compact representation of all the string
 * encoding information we might need.  It gets modified at runtime.
 */
static struct _strenc_ str_encoding_table[] = {
  {NSASCIIStringEncoding,
    "NSASCIIStringEncoding","ASCII",1,1,0},
  {NSNEXTSTEPStringEncoding,
    "NSNEXTSTEPStringEncoding","NEXTSTEP",1,1,0},
  {NSJapaneseEUCStringEncoding,
    "NSJapaneseEUCStringEncoding","EUC-JP",0,0,0},
  {NSUTF8StringEncoding,
    "NSUTF8StringEncoding","UTF-8",0,1,0},
  {NSISOLatin1StringEncoding,
    "NSISOLatin1StringEncoding","ISO-8859-1",1,1,0},
  {NSSymbolStringEncoding,
    "NSSymbolStringEncoding","",0,0,0},
  {NSNonLossyASCIIStringEncoding,
    "NSNonLossyASCIIStringEncoding","",1,1,0},
  {NSShiftJISStringEncoding,
    "NSShiftJISStringEncoding","SHIFT-JIS",0,0,0},
  {NSISOLatin2StringEncoding,
    "NSISOLatin2StringEncoding","ISO-8859-2",1,1,0},
  {NSUnicodeStringEncoding,
    "NSUnicodeStringEncoding","",0,1,0},
  {NSWindowsCP1251StringEncoding,
    "NSWindowsCP1251StringEncoding","CP1251",0,0,0},
  {NSWindowsCP1252StringEncoding,
    "NSWindowsCP1252StringEncoding","CP1252",0,0,0},
  {NSWindowsCP1253StringEncoding,
    "NSWindowsCP1253StringEncoding","CP1253",0,0,0},
  {NSWindowsCP1254StringEncoding,
    "NSWindowsCP1254StringEncoding","CP1254",0,0,0},
  {NSWindowsCP1250StringEncoding,
    "NSWindowsCP1250StringEncoding","CP1250",0,0,0},
  {NSISO2022JPStringEncoding,
    "NSISO2022JPStringEncoding","ISO-2022-JP",0,0,0},
  {NSMacOSRomanStringEncoding,
    "NSMacOSRomanStringEncoding","MACINTOSH",0,0,0},
#if defined(GNUSTEP)
  {NSProprietaryStringEncoding,
    "NSProprietaryStringEncoding","",0,0,0},
#endif

// GNUstep additions
  {NSISOCyrillicStringEncoding,
    "NSISOCyrillicStringEncoding","ISO-8859-5",0,1,0},
  {NSKOI8RStringEncoding,
    "NSKOI8RStringEncoding","KOI8-R",0,0,0},
  {NSISOLatin3StringEncoding,
    "NSISOLatin3StringEncoding","ISO-8859-3",0,0,0},
  {NSISOLatin4StringEncoding,
    "NSISOLatin4StringEncoding","ISO-8859-4",0,0,0},
  {NSISOArabicStringEncoding,
    "NSISOArabicStringEncoding","ISO-8859-6",0,0,0},
  {NSISOGreekStringEncoding,
    "NSISOGreekStringEncoding","ISO-8859-7",0,0,0},
  {NSISOHebrewStringEncoding,
    "NSISOHebrewStringEncoding","ISO-8859-8",0,0,0},
  {NSISOLatin5StringEncoding,
    "NSISOLatin5StringEncoding","ISO-8859-9",0,0,0},
  {NSISOLatin6StringEncoding,
    "NSISOLatin6StringEncoding","ISO-8859-10",0,0,0},
  {NSISOThaiStringEncoding,
    "NSISOThaiStringEncoding","ISO-8859-11",1,1,0},
  {NSISOLatin7StringEncoding,
    "NSISOLatin7StringEncoding","ISO-8859-13",0,0,0},
  {NSISOLatin8StringEncoding,
    "NSISOLatin8StringEncoding","ISO-8859-14",0,0,0},
  {NSISOLatin9StringEncoding,
    "NSISOLatin9StringEncoding","ISO-8859-15",1,1,0},
  {NSUTF7StringEncoding,
    "NSUTF7StringEncoding","UTF-7",0,0,0},
  {NSGB2312StringEncoding,
    "NSGB2312StringEncoding","EUC-CN",0,0,0},
  {NSGSM0338StringEncoding,
    "NSGSM0338StringEncoding","",0,1,0},
  {NSBIG5StringEncoding,
    "NSBIG5StringEncoding","BIG5",0,0,0},
  {NSKoreanEUCStringEncoding,
    "NSKoreanEUCStringEncoding","EUC-KR",0,0,0},

/* Now Apple encodings which have high numeric values.
 */
  {NSUTF16BigEndianStringEncoding,
    "NSUTF16BigEndianStringEncoding","UTF-16BE",0,0,0},
  {NSUTF16LittleEndianStringEncoding,
    "NSUTF16LittleEndianStringEncoding","UTF-16LE",0,0,0},
  {NSUTF32StringEncoding,
    "NSUTF32StringEncoding",UNICODE_UTF32,0,0,0},
  {NSUTF32BigEndianStringEncoding,
    "NSUTF32BigEndianStringEncoding","UTF-32BE",0,0,0},
  {NSUTF32LittleEndianStringEncoding,
    "NSUTF32LittleEndianStringEncoding","UTF-32LE",0,0,0},

  {0,"Unknown encoding","",0,0,0}
};

/* Pointer table indexed by (small) encoding value, and its highest index. */
static struct _strenc_ **encodingTable = 0;
static unsigned encTableSize = 0;

/* Builds encodingTable (once) so that encodings with values below
 * MAX_ENCODING can be looked up by direct indexing, and records for each
 * iconv-backed entry the name to use for lossy (transliterating)
 * conversion when iconv can open such a converter.
 */
static void
GSSetupEncodingTable(void)
{
  if (encodingTable == 0)
    {
      [GS_INITIALIZED_LOCK(local_lock, GSLazyLock) lock];
      if (encodingTable == 0)
        {
          static struct _strenc_        **encTable = 0;
          unsigned                      count;
          unsigned                      i;

          /*
           * We want to store pointers to our string encoding info in a
           * large table so we can do efficient lookup by encoding value.
           */
#define MAX_ENCODING 128
          count = sizeof(str_encoding_table) / sizeof(struct _strenc_);

          /*
           * First determine the largest encoding value and create a
           * large enough table of pointers.
           */
          encTableSize = 0;
          for (i = 0; i < count; i++)
            {
              unsigned  tmp = str_encoding_table[i].enc;

              if (tmp > encTableSize && tmp < MAX_ENCODING)
                {
                  encTableSize = tmp;
                }
            }
          encTable = objc_malloc((encTableSize+1)*sizeof(struct _strenc_ *));
          memset(encTable, 0, (encTableSize+1)*sizeof(struct _strenc_ *));

          /*
           * Now set up the pointers at the correct location in the table.
           */
          for (i = 0; i < count; i++)
            {
              struct _strenc_   *entry = &str_encoding_table[i];
              unsigned          tmp = entry->enc;

              if (tmp < MAX_ENCODING)
                {
                  encTable[tmp] = entry;
                }
#ifdef HAVE_ICONV
              if (entry->iconv != 0 && *(entry->iconv) != 0)
                {
                  iconv_t       c;
                  char          *lossy;

                  /*
                   * See if we can do a lossy conversion.  The name with
                   * the "//TRANSLIT" suffix is the *target* of a
                   * conversion from unicode, so that is the converter
                   * we must probe (previously the plain name was probed
                   * in the opposite direction, which says nothing about
                   * transliteration support).
                   */
                  lossy = objc_malloc(strlen(entry->iconv) + 12);
                  if (lossy != 0)
                    {
                      strcpy(lossy, entry->iconv);
                      strcat(lossy, "//TRANSLIT");
                      c = iconv_open(lossy, UNICODE_ENC);
                      if (c == (iconv_t)-1)
                        {
                          objc_free(lossy);
                        }
                      else
                        {
                          entry->lossy = lossy;
                          iconv_close(c);
                        }
                    }
                }
#endif
            }
          encodingTable = encTable;
        }
      [local_lock unlock];
    }
}

/* Returns the table entry describing enc, or 0 if enc is zero or unknown.
 * Small values are found by direct indexing, larger ones by a linear
 * scan of str_encoding_table.
 */
static struct _strenc_ *
EntryForEncoding(NSStringEncoding enc)
{
  struct _strenc_       *entry = 0;

  if (enc > 0)
    {
      GSSetupEncodingTable();
      if (enc <= encTableSize)
        {
          entry = encodingTable[enc];
        }
      else
        {
          unsigned      i;
          unsigned      count;

          count = sizeof(str_encoding_table) / sizeof(struct _strenc_);
          for (i = 0; i < count; i++)
            {
              if (str_encoding_table[i].enc == enc)
                {
                  entry = &str_encoding_table[i];
                  break;
                }
            }
        }
    }
  return entry;
}

/* Returns the table entry for enc if the encoding is usable on this
 * platform, or 0 otherwise.  For entries whose support has not yet been
 * determined (supported == 0) iconv is probed in both directions and the
 * result is cached in the entry.
 */
static struct _strenc_ *
EntrySupported(NSStringEncoding enc)
{
  struct _strenc_       *entry = EntryForEncoding(enc);

  if (entry == 0)
    {
      return 0;         /* Pointer result: 'no entry', not the BOOL NO. */
    }
#ifdef HAVE_ICONV
  if (entry->iconv != 0 && entry->supported == 0)
    {
      if (enc == NSUnicodeStringEncoding)
        {
          entry->iconv = UNICODE_ENC;
          entry->supported = 1;
        }
      else if (entry->iconv[0] == 0)
        {
          /* explicitly check for empty encoding name since some systems
           * have buggy iconv_open() code which succeeds on an empty name.
           */
          entry->supported = -1;
        }
      else
        {
          iconv_t       c;

          c = iconv_open(UNICODE_ENC, entry->iconv);
          if (c == (iconv_t)-1)
            {
              entry->supported = -1;
            }
          else
            {
              iconv_close(c);
              c = iconv_open(entry->iconv, UNICODE_ENC);
              if (c == (iconv_t)-1)
                {
                  entry->supported = -1;
                }
              else
                {
                  iconv_close(c);
                  entry->supported = 1;
                }
            }
        }
    }
#endif
  if (entry->supported == 1)
    {
      return entry;
    }
  return 0;
}

/* Returns YES if conversion to/from enc is available, NO otherwise. */
BOOL
GSPrivateIsEncodingSupported(NSStringEncoding enc)
{
  if (EntrySupported(enc) == 0)
    {
      return NO;
    }
  return YES;
}

/** Returns the NSStringEncoding that matches the specified
 *  character set registry and encoding information. For instance,
 *  for the iso8859-5 character set, the registry is iso8859 and
 *  the encoding is 5, and the returned NSStringEncoding is
 *  NSISOCyrillicStringEncoding. If there is no specific encoding,
 *  use @"0". Returns GSUndefinedEncoding if there is no match.
 */
NSStringEncoding
GSEncodingForRegistry (NSString *registry, NSString *encoding)
{
  NSString      *charset = registry;

  if ([encoding length] > 0 && [encoding isEqualToString: @"0"] == NO)
    {
      charset = [NSString stringWithFormat: @"%@-%@", registry, encoding];
    }
  return [GSMimeDocument encodingFromCharset: charset];
}

/** Try to deduce the string encoding from the locale string
 *  clocale. This function looks in the Locale.encodings file
 *  installed as part of GNUstep Base if the encoding cannot be
 *  deduced from the clocale string itself. If clocale isn't set or
 *  no match can be found, returns GSUndefinedEncoding.
 */
/* It would be really nice if this could be used in +defaultCStringEncoding, but
 * there are too many dependencies on other parts of the library to
 * make this practical (even if everything possible was written in C,
 * we'd still need some way to find the Locale.encodings file).
*/
NSStringEncoding
GSEncodingFromLocale(const char *clocale)
{
  const char    *dot;

  /* No locale, or one of the portable locales: make no assumptions and
   * let the caller decide what to do.
   */
  if (clocale == NULL)
    {
      return GSUndefinedEncoding;
    }
  if (strcmp(clocale, "C") == 0 || strcmp(clocale, "POSIX") == 0)
    {
      return GSUndefinedEncoding;
    }

  dot = strchr(clocale, '.');
  if (dot != NULL)
    {
      /* The locale carries a 'codeset' section after the dot.  Build a
       * charset name from its first and (if there are several) last
       * dash separated components and ask GSMimeDocument about it.
       */
      NSString  *codeset;
      NSArray   *parts;
      NSString  *registry;
      NSString  *charset;

      codeset = [[NSString stringWithUTF8String: dot + 1] lowercaseString];
      parts = [codeset componentsSeparatedByString: @"-"];
      registry = [parts objectAtIndex: 0];
      charset = registry;
      if ([parts count] > 1)
        {
          charset = [NSString stringWithFormat: @"%@-%@",
            registry, [parts lastObject]];
        }
      return [GSMimeDocument encodingFromCharset: charset];
    }
  else
    {
      /* No codeset: look the locale up in the Locale.encodings resource. */
      NSStringEncoding  found = GSUndefinedEncoding;
      NSBundle          *gbundle;
      NSString          *table;
      NSDictionary      *dict;
      NSString          *encodstr;
      const char        *wanted;
      unsigned          idx;

#ifdef GNUSTEP
      gbundle = [NSBundle bundleForLibrary: @"gnustep-base"];
#else
      gbundle = [NSBundle bundleForClass: NSClassFromString(@"GSXMLNode")];
#endif

      table = [gbundle pathForResource: @"Locale"
                                ofType: @"encodings"
                           inDirectory: @"Languages"];
      if (table == nil)
        {
          return GSUndefinedEncoding;
        }

      dict = [NSDictionary dictionaryWithContentsOfFile: table];
      encodstr = [dict objectForKey:
        [NSString stringWithUTF8String: clocale]];
      if (encodstr == nil)
        {
          return GSUndefinedEncoding;
        }

      /* Find the table entry whose name matches; the table is terminated
       * by an entry with a zero encoding value.
       */
      wanted = [encodstr lossyCString];
      for (idx = 0; str_encoding_table[idx].enc != 0; idx++)
        {
          if (strcmp(str_encoding_table[idx].ename, wanted) == 0)
            {
              found = str_encoding_table[idx].enc;
              break;
            }
        }
      if (found == GSUndefinedEncoding)
        {
          NSLog(@"No known GNUstep encoding for %s = %@",
            clocale, encodstr);
        }
      return found;
    }
}

/**
 * Uses direct access into a two-level table to map cases.
* The two-level table method is less space efficient (but still not bad) than
 * a single table and a linear search, but it reduces the number of
 * conditional statements to just one.
 */
unichar
uni_tolower(unichar ch)
{
  /* High byte selects the page, low byte the slot within it; a zero
   * slot means there is no mapping and the character is returned as is.
   */
  unichar       mapped = gs_tolower_map[ch >> 8][ch & 0xff];

  if (mapped == 0)
    {
      return ch;
    }
  return mapped;
}

/**
 * Uses direct access into a two-level table to map cases.
* The two-level table method is less space efficient (but still not bad) than
 * a single table and a linear search, but it reduces the number of
 * conditional statements to just one.
 */
unichar
uni_toupper(unichar ch)
{
  unichar       result = gs_toupper_map[ch / 256][ch % 256];

  /* A zero slot means 'no mapping': return the character unchanged. */
  return result ? result : ch;
}

/* Looks u up in uni_cop_table (searched as a table sorted by code) and
 * returns the associated cop value, or 0 if u has no entry.
 *
 * The search uses a half-open range [lo, hi) so that the element at index
 * uni_cop_table_size is never read; the previous closed-range version
 * started with last == uni_cop_table_size and could read one element past
 * the end when u was greater than every code in the table.
 */
unsigned char
GSPrivateUniCop(unichar u)
{
  unsigned      lo = 0;
  unsigned      hi = uni_cop_table_size;

  if (hi == 0 || u < uni_cop_table[0].code)
    {
      return 0; // Special case for latin1
    }
  while (lo < hi)
    {
      unsigned  mid = lo + (hi - lo) / 2;
      unichar   code = uni_cop_table[mid].code;

      if (code < u)
        {
          lo = mid + 1;
        }
      else if (code > u)
        {
          hi = mid;
        }
      else
        {
          return uni_cop_table[mid].cop;
        }
    }
  return 0;
}

/* Public wrapper around GSPrivateUniCop(). */
unsigned char
uni_cop(unichar u)
{
  return GSPrivateUniCop(u);
}

/* Returns YES if u should be treated as a non-spacing character. */
BOOL
uni_isnonsp(unichar u)
{
  /*
   * Treating trailing (low) surrogates, 0xDC00-0xDFFF, as non-spacing is
   * a convenient solution to a number of issues with UTF-16
   */
  if ((u >= 0xdc00) && (u <= 0xdfff))
    {
      return YES;
    }

  // FIXME check is uni_cop good for this
  if (GSPrivateUniCop(u))
    {
      return YES;
    }
  return NO;
}

/* Looks u up in uni_dec_table (searched as a table sorted by code) and
 * returns its decomposition, or 0 if u has no entry.
 *
 * As with GSPrivateUniCop() the range is half-open so that index
 * uni_dec_table_size is never dereferenced.
 */
unichar*
uni_is_decomp(unichar u)
{
  unsigned      lo = 0;
  unsigned      hi = uni_dec_table_size;

  if (hi == 0 || u < uni_dec_table[0].code)
    {
      return 0; // Special case for latin1
    }
  while (lo < hi)
    {
      unsigned  mid = lo + (hi - lo) / 2;
      unichar   code = uni_dec_table[mid].code;

      if (code < u)
        {
          lo = mid + 1;
        }
      else if (code > u)
        {
          hi = mid;
        }
      else
        {
          return uni_dec_table[mid].decomp;
        }
    }
  return 0;
}

/**
 * Function to check a block of data for validity as a unicode string and
 * say whether it contains solely ASCII or solely Latin1 data.
* Any leading BOM must already have been removed and the data must already * be in native byte order.
* Returns the number of characters which were found valid.
 */
unsigned
GSUnicode(const unichar *chars, unsigned length,
  BOOL *isASCII, BOOL *isLatin1)
{
  unsigned      pos = 0;

  /* Both flags start out optimistic and are cleared as soon as a
   * character outside the relevant range is seen.
   */
  if (isASCII) *isASCII = YES;
  if (isLatin1) *isLatin1 = YES;

  while (pos < length)
    {
      unichar   c = chars[pos];

      if (c > 127 && isASCII)
        {
          *isASCII = NO;
        }
      if (c > 255)
        {
          if (isLatin1)
            {
              *isLatin1 = NO;
            }
          if (c == 0xfffe || c == 0xffff || (c >= 0xfdd0 && c <= 0xfdef))
            {
              return pos;       // Non-characters.
            }
          if (c >= 0xdc00 && c <= 0xdfff)
            {
              return pos;       // Second half of a surrogate pair.
            }
          if (c >= 0xd800 && c <= 0xdbff)
            {
              unichar   next;

              // First half of a surrogate pair.
              if (pos + 1 >= length)
                {
                  return pos;   // Second half missing
                }
              next = chars[pos + 1];
              if (next < 0xdc00 || next > 0xdfff)
                {
                  return pos;   // Second half missing
                }
              pos++;            // Step past second half
            }
        }
      pos++;
    }
  return pos;
}

/* GROW() enlarges the unichar output buffer used by GSToUnicode().  It
 * relies on that function's locals (dst, ptr, buf, bsize, dpos, extra,
 * zone, slen, result) and its 'done' label.
 */
#if GS_WITH_GC

#define GROW() \
  if (dst == 0) \
    { \
      /* \
       * Data is just being discarded anyway, so we can \
       * reset the offset into the local buffer on the \
       * stack and pretend the buffer has grown. \
       */ \
      ptr = buf - dpos; \
      bsize = dpos + BUFSIZ; \
      if (extra != 0) \
        { \
          bsize--; \
        } \
    } \
  else if (zone == 0) \
    { \
      result = NO; /* No buffer growth possible ... fail. */ \
      goto done; \
    } \
  else \
    { \
      unsigned grow = slen; \
      \
      if (grow < bsize + BUFSIZ) \
        { \
          grow = bsize + BUFSIZ; \
        } \
      grow *= sizeof(unichar); \
      \
      if (ptr == buf || ptr == *dst) \
        { \
          unichar *tmp; \
          \
          tmp = NSAllocateCollectable(grow + extra, 0); \
          if (tmp != 0) \
            { \
              memcpy(tmp, ptr, bsize * sizeof(unichar)); \
            } \
          ptr = tmp; \
        } \
      else \
        { \
          ptr = NSReallocateCollectable(ptr, grow + extra, 0); \
        } \
      if (ptr == 0) \
        { \
          return NO; /* Not enough memory */ \
        } \
      bsize = grow / sizeof(unichar); \
    }

#else /* GS_WITH_GC */

#define GROW() \
  if (dst == 0) \
    { \
      /* \
       * Data is just being discarded anyway, so we can \
       * reset the offset into the local buffer on the \
       * stack and pretend the buffer has grown. \
       */ \
      ptr = buf - dpos; \
      bsize = dpos + BUFSIZ; \
      if (extra != 0) \
        { \
          bsize--; \
        } \
    } \
  else if (zone == 0) \
    { \
      result = NO; /* No buffer growth possible ... fail. */ \
      goto done; \
    } \
  else \
    { \
      unsigned grow = slen; \
      \
      if (grow < bsize + BUFSIZ) \
        { \
          grow = bsize + BUFSIZ; \
        } \
      grow *= sizeof(unichar); \
      \
      if (ptr == buf || ptr == *dst) \
        { \
          unichar *tmp; \
          \
          tmp = NSZoneMalloc(zone, grow + extra); \
          if (tmp != 0) \
            { \
              memcpy(tmp, ptr, bsize * sizeof(unichar)); \
            } \
          ptr = tmp; \
        } \
      else \
        { \
          ptr = NSZoneRealloc(zone, ptr, grow + extra); \
        } \
      if (ptr == 0) \
        { \
          return NO; /* Not enough memory */ \
        } \
      bsize = grow / sizeof(unichar); \
    }

#endif /* GS_WITH_GC */

/**
 * Function to convert from 8-bit data to 16-bit unicode characters.
 *

The dst argument is a pointer to a pointer to a buffer in which the
 * converted string is to be stored. If it is a null pointer, this function
 * discards converted data, and is used only to determine the length of the
 * converted string. If the zone argument is non-null, the function is free
 * to allocate a larger buffer if necessary, and store this new buffer in
 * the dst argument. It will *NOT* deallocate the original buffer!
 *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be * altered to the new size. This is measured in 16-bit unicode characters, * not bytes. *

*

The src argument is a pointer to the byte sequence which is * to be converted to 16-bit unicode. *

*

The slen argument is the length of the byte sequence * which is to be converted to 16-bit unicode. * This is measured in bytes. *

*

The enc argument specifies the encoding type of the 8-bit byte sequence * which is to be converted to 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may
 * allocate a buffer to return data in.
 * If this is null, the function will fail if the originally supplied buffer
 * is not big enough (unless dst is a null pointer ... indicating that
 * converted data is to be discarded).
* If the library is built for garbage collecting, the zone argument is used * only as a marker to say whether the function may allocate memory (zone * is non-null) or not (zone is null). *

* The options argument controls some special behavior. * * If GSUniTerminate is set, the function is expected to null terminate * the output string, and will assume that it is safe to place the nul * just beyond the end of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra * termination character. * If GSUniTemporary is set, the function will return the results in * an autoreleased buffer rather than in a buffer that the caller must * release. * If GSUniBOM is set, the function will write the first unicode * character as a byte order marker. * If GSUniShortOk is set, the function will return a buffer containing * any decoded characters even if the whole conversion fails. * *

On return, the function result is a flag indicating success (YES) * or failure (NO), and on success, the value stored in size is the number * of characters in the converted string. The converted string itself is * stored in the location given by dst.
* NB. If the value stored in dst has been changed, it is a pointer to * allocated memory which the caller is responsible for freeing, and the * caller is still responsible for freeing the original buffer. *

*/ BOOL GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src, unsigned int slen, NSStringEncoding enc, NSZone *zone, unsigned int options) { unichar buf[BUFSIZ]; unichar *ptr; unsigned bsize; unsigned dpos = 0; // Offset into destination buffer. unsigned spos = 0; // Offset into source buffer. unsigned extra = (options & GSUniTerminate) ? sizeof(unichar) : 0; unichar base = 0; unichar *table = 0; BOOL result = YES; /* * Ensure we have an initial buffer set up to decode data into. */ if (dst == 0 || *size == 0) { ptr = buf; bsize = (extra != 0) ? BUFSIZ - 1 : BUFSIZ; } else { ptr = *dst; bsize = *size; } if (options & GSUniBOM) { while (dpos >= bsize) { GROW(); } ptr[dpos++] = (unichar)0xFEFF; // Insert byte order marker. } switch (enc) { case NSUTF8StringEncoding: { while (spos < slen) { unsigned char c = src[spos]; unsigned long u = c; if (c > 0x7f) { int i, sle = 0; /* calculated the expected sequence length */ while (c & 0x80) { c = c << 1; sle++; } /* legal ? */ if ((sle < 2) || (sle > 6)) { result = NO; goto done; } /* do we have enough bytes ? */ if ((spos + sle) > slen) { result = NO; goto done; } /* get the codepoint */ for (i = 1; i < sle; i++) { if (src[spos + i] < 0x80 || src[spos + i] >= 0xc0) break; u = (u << 6) | (src[spos + i] & 0x3f); } if (i < sle) { result = NO; goto done; } u = u & ~(0xffffffff << ((5 * sle) + 1)); spos += sle; /* * We discard invalid codepoints here. */ if (u > 0x10ffff || u == 0xfffe || u == 0xffff || (u >= 0xfdd0 && u <= 0xfdef)) { result = NO; // Invalid character. goto done; } if ((u >= 0xd800) && (u <= 0xdfff)) { result = NO; // Unmatched half of surrogate pair. goto done; } } else { spos++; } /* * Add codepoint as either a single unichar for BMP * or as a pair of surrogates for codepoints over 16 bits. 
*/ if (dpos >= bsize) { GROW(); } if (u < 0x10000) { ptr[dpos++] = u; } else { unichar ul, uh; u -= 0x10000; ul = u & 0x3ff; uh = (u >> 10) & 0x3ff; ptr[dpos++] = uh + 0xd800; if (dpos >= bsize) { GROW(); } ptr[dpos++] = ul + 0xdc00; } } } break; case NSNonLossyASCIIStringEncoding: case NSASCIIStringEncoding: if (dst == 0) { /* Just counting bytes, and we know there is exactly one * unicode codepoint needed for each ascii character. */ dpos += slen; } else { /* Because we know that each ascii chartacter is exactly * one unicode character, we can check the destination * buffer size and allocate more space in one go, before * entering the loop where we deal with each character. */ if (dpos + slen + (extra ? 1 : 0) > bsize) { if (zone == 0) { result = NO; /* No buffer growth possible ... fail. */ goto done; } else { unsigned grow = (dpos + slen) * sizeof(unichar); unichar *tmp; #if GS_WITH_GC tmp = NSAllocateCollectable(grow + extra, 0); #else tmp = NSZoneMalloc(zone, grow + extra); #endif if ((ptr == buf || ptr == *dst) && (tmp != 0)) { memcpy(tmp, ptr, bsize * sizeof(unichar)); } #if !GS_WITH_GC if (ptr != buf && ptr != *dst) { NSZoneFree(zone, ptr); } #endif ptr = tmp; if (ptr == 0) { return NO; /* Not enough memory */ } bsize = grow / sizeof(unichar); } } while (spos < slen) { unichar c = (unichar)((unc)src[spos++]); if (c > 127) { result = NO; // Non-ascii data found in input. goto done; } ptr[dpos++] = c; } } break; case NSISOLatin1StringEncoding: if (dst == 0) { /* Just counting bytes, and we know there is exactly one * unicode codepoint needed for each latin1 character. */ dpos += slen; } else { /* Because we know that each latin1 chartacter is exactly * one unicode character, we can check the destination * buffer size and allocate more space in one go, before * entering the loop where we deal with each character. */ if (dpos + slen + (extra ? 1 : 0) > bsize) { if (zone == 0) { result = NO; /* No buffer growth possible ... fail. 
*/ goto done; } else { unsigned grow = (dpos + slen) * sizeof(unichar); unichar *tmp; #if GS_WITH_GC tmp = NSAllocateCollectable(grow + extra, 0); #else tmp = NSZoneMalloc(zone, grow + extra); #endif if ((ptr == buf || ptr == *dst) && (tmp != 0)) { memcpy(tmp, ptr, bsize * sizeof(unichar)); } #if !GS_WITH_GC if (ptr != buf && ptr != *dst) { NSZoneFree(zone, ptr); } #endif ptr = tmp; if (ptr == 0) { return NO; /* Not enough memory */ } bsize = grow / sizeof(unichar); } } while (spos < slen) { ptr[dpos++] = (unichar)((unc)src[spos++]); } } break; case NSNEXTSTEPStringEncoding: base = Next_conv_base; table = Next_char_to_uni_table; goto tables; case NSISOCyrillicStringEncoding: base = Cyrillic_conv_base; table = Cyrillic_char_to_uni_table; goto tables; case NSISOLatin2StringEncoding: base = Latin2_conv_base; table = Latin2_char_to_uni_table; goto tables; case NSISOLatin9StringEncoding: base = Latin9_conv_base; table = Latin9_char_to_uni_table; goto tables; case NSISOThaiStringEncoding: base = Thai_conv_base; table = Thai_char_to_uni_table; goto tables; #if 0 case NSSymbolStringEncoding: base = Symbol_conv_base; table = Symbol_char_to_uni_table; goto tables; #endif tables: if (dst == 0) { /* Just counting bytes, and we know there is exactly one * unicode codepoint needed for each character. */ dpos += slen; } else { /* Because we know that each character in the table is exactly * one unicode character, we can check the destination * buffer size and allocate more space in one go, before * entering the loop where we deal with each character. */ if (dpos + slen + (extra ? 1 : 0) > bsize) { if (zone == 0) { result = NO; /* No buffer growth possible ... fail. 
*/ goto done; } else { unsigned grow = (dpos + slen) * sizeof(unichar); unichar *tmp; #if GS_WITH_GC tmp = NSAllocateCollectable(grow + extra, 0); #else tmp = NSZoneMalloc(zone, grow + extra); #endif if ((ptr == buf || ptr == *dst) && (tmp != 0)) { memcpy(tmp, ptr, bsize * sizeof(unichar)); } #if !GS_WITH_GC if (ptr != buf && ptr != *dst) { NSZoneFree(zone, ptr); } #endif ptr = tmp; if (ptr == 0) { return NO; /* Not enough memory */ } bsize = grow / sizeof(unichar); } } while (spos < slen) { unc c = (unc)src[spos]; if (c < base) { ptr[dpos++] = c; } else { ptr[dpos++] = table[c - base]; } spos++; } } break; case NSGSM0338StringEncoding: while (spos < slen) { unc c = (unc)src[spos]; if (dpos >= bsize) { GROW(); } ptr[dpos] = GSM0338_char_to_uni_table[c]; if (c == 0x1b && spos < slen) { unsigned i = 0; c = (unc)src[spos+1]; while (i < sizeof(GSM0338_escapes)/sizeof(GSM0338_escapes[0])) { if (GSM0338_escapes[i].to == c) { ptr[dpos] = GSM0338_escapes[i].from; spos++; break; } i++; } } dpos++; spos++; } break; default: #ifdef HAVE_ICONV { struct _strenc_ *encInfo; unsigned char *inbuf; unsigned char *outbuf; size_t inbytesleft; size_t outbytesleft; size_t rval; iconv_t cd; const char *estr = 0; BOOL done = NO; if ((encInfo = EntrySupported(enc)) != 0) { estr = encInfo->iconv; } /* explicitly check for empty encoding name since some systems * have buggy iconv_open() code which succeeds on an empty name. 
*/ if (estr == 0) { NSLog(@"No iconv for encoding x%02x", enc); result = NO; goto done; } if (slen == 0) { break; // Nothing to do } cd = iconv_open(UNICODE_ENC, estr); if (cd == (iconv_t)-1) { NSLog(@"No iconv for encoding %@ tried to use %s", GSPrivateEncodingName(enc), estr); result = NO; goto done; } inbuf = (unsigned char*)src; inbytesleft = slen; outbuf = (unsigned char*)ptr; outbytesleft = bsize * sizeof(unichar); do { if (inbytesleft == 0) { done = YES; // Flush iconv rval = iconv(cd, 0, 0, (void*)&outbuf, &outbytesleft); } else { rval = iconv(cd, (void*)&inbuf, &inbytesleft, (void*)&outbuf, &outbytesleft); } dpos = (bsize * sizeof(unichar) - outbytesleft) / sizeof(unichar); if (rval == (size_t)-1) { if (errno == E2BIG) { unsigned old = bsize; GROW(); outbuf = (unsigned char*)&ptr[dpos]; outbytesleft += (bsize - old) * sizeof(unichar); } else { result = NO; goto done; } } } while (!done || rval != 0); // close the converter iconv_close(cd); } #else result = NO; #endif } done: /* * Post conversion ... terminate if needed, and set output values. */ if (extra != 0 && dst != 0) { ptr[dpos] = (unichar)0; } *size = dpos; if (dst != 0 && (result == YES || (options & GSUniShortOk))) { if (options & GSUniTemporary) { unsigned bytes = dpos * sizeof(unichar) + extra; void *r; /* * Temporary string was requested ... make one. */ #if GS_WITH_GC r = NSAllocateCollectable(bytes, 0); memcpy(r, ptr, bytes); #else r = GSAutoreleasedBuffer(bytes); memcpy(r, ptr, bytes); if (ptr != buf && ptr != *dst) { NSZoneFree(zone, ptr); } #endif ptr = r; *dst = ptr; } else if (zone != 0 && (ptr == buf || bsize > dpos)) { unsigned bytes = dpos * sizeof(unichar) + extra; /* * Resizing is permitted, try ensure we return a buffer which * is just big enough to hold the converted string. 
*/ if (ptr == buf || ptr == *dst) { unichar *tmp; #if GS_WITH_GC tmp = NSAllocateCollectable(bytes, 0); #else tmp = NSZoneMalloc(zone, bytes); #endif if (tmp != 0) { memcpy(tmp, ptr, bytes); } ptr = tmp; } else { #if GS_WITH_GC ptr = NSReallocateCollectable(ptr, bytes, 0); #else ptr = NSZoneRealloc(zone, ptr, bytes); #endif } *dst = ptr; } else if (ptr == buf) { ptr = NULL; result = NO; } else { *dst = ptr; } } #if !GS_WITH_GC else if (ptr != buf && dst != 0 && ptr != *dst) { NSZoneFree(zone, ptr); } #endif if (dst) NSCAssert(*dst != buf, @"attempted to pass out pointer to internal buffer"); return result; } #undef GROW #if GS_WITH_GC #define GROW() \ if (dst == 0) \ { \ /* \ * Data is just being discarded anyway, so we can \ * reset the offset into the local buffer on the \ * stack and pretend the buffer has grown. \ */ \ ptr = buf - dpos; \ bsize = dpos + BUFSIZ; \ if (extra != 0) \ { \ bsize--; \ } \ } \ else if (zone == 0) \ { \ result = NO; /* No buffer growth possible ... fail. */ \ goto done; \ } \ else \ { \ unsigned grow = slen; \ \ if (grow < bsize + BUFSIZ) \ { \ grow = bsize + BUFSIZ; \ } \ \ if (ptr == buf || ptr == *dst) \ { \ unsigned char *tmp; \ \ tmp = NSAllocateCollectable(grow + extra, 0); \ if (tmp != 0) \ { \ memcpy(tmp, ptr, bsize); \ } \ ptr = tmp; \ } \ else \ { \ ptr = NSReallocateCollectable(ptr, grow + extra, 0); \ } \ if (ptr == 0) \ { \ return NO; /* Not enough memory */ \ } \ bsize = grow; \ } #else /* GS_WITH_GC */ #define GROW() \ if (dst == 0) \ { \ /* \ * Data is just being discarded anyway, so we can \ * reset the offset into the local buffer on the \ * stack and pretend the buffer has grown. \ */ \ ptr = buf - dpos; \ bsize = dpos + BUFSIZ; \ if (extra != 0) \ { \ bsize--; \ } \ } \ else if (zone == 0) \ { \ result = NO; /* No buffer growth possible ... fail. 
*/ \ goto done; \ } \ else \ { \ unsigned grow = slen; \ \ if (grow < bsize + BUFSIZ) \ { \ grow = bsize + BUFSIZ; \ } \ \ if (ptr == buf || ptr == *dst) \ { \ unsigned char *tmp; \ \ tmp = NSZoneMalloc(zone, grow + extra); \ if (tmp != 0) \ { \ memcpy(tmp, ptr, bsize); \ } \ ptr = tmp; \ } \ else \ { \ ptr = NSZoneRealloc(zone, ptr, grow + extra); \ } \ if (ptr == 0) \ { \ return NO; /* Not enough memory */ \ } \ bsize = grow; \ } #endif /* GS_WITH_GC */ /* chop() - binary search of a unicode-to-byte mapping table. Searches the half-open index range [0, hi) of table for an entry whose 'from' field equals c, halving the range on each step (lo moves above the probe when the probed 'from' is smaller than c, hi moves down to the probe when it is larger). Returns the index of the matching entry, or -1 if the range becomes empty without a match. The halving is only correct if the entries are sorted by ascending 'from' value; hi is the number of entries to search (exclusive upper bound). */ static inline int chop(unichar c, _ucc_ *table, int hi) { int lo = 0; while (hi > lo) { int i = (hi + lo) / 2; unichar from = table[i].from; if (from < c) { lo = i + 1; } else if (from > c) { hi = i; } else { return i; // Found } } return -1; // Not found } /** * Function to convert from 16-bit unicode to 8-bit data. *

The dst argument is a pointer to a pointer to a buffer in which the * converted data is to be stored. If it is a null pointer, this function * discards converted data, and is used only to determine the length of the * converted data. If the zone argument is non-null, the function is free * to allocate a larger buffer if necessary, and store this new buffer in * the dst argument. It will *NOT* deallocate the original buffer! *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be * altered to the new size. This is measured in bytes. *

*

The src argument is a pointer to the 16-bit unicode string which is * to be converted to 8-bit data. *

*

The slen argument is the length of the 16-bit unicode string * which is to be converted to 8-bit data. * This is measured in 16-bit characters, not bytes. *

*

The enc argument specifies the encoding type of the 8-bit byte sequence * which is to be produced from the 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may * allocate a buffer to return data in. * If this is null, the function will fail if the originally supplied buffer * is not big enough (unless dst is a null pointer ... indicating that * converted data is to be discarded).
* If the library is built for garbage collecting, the zone argument is used * only as a marker to say whether the function may allocate memory (zone * is non-null) or not (zone is null). *

* The options argument controls some special behavior. * * If GSUniStrict is set, the function will fail if a character is * encountered in the source which can't be converted. Otherwise, some * approximation or marker will be placed in the destination. * If GSUniTerminate is set, the function is expected to nul terminate * the output data, and will assume that it is safe to place the nul * just beyond the end of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra * termination byte. * If GSUniTemporary is set, the function will return the results in * an autoreleased buffer rather than in a buffer that the caller must * release. * If GSUniBOM is set, the function will read the first unicode * character as a byte order marker. * If GSUniShortOk is set, the function will return a buffer containing * any decoded characters even if the whole conversion fails. * *

On return, the function result is a flag indicating success (YES) * or failure (NO), and on success, the value stored in size is the number * of bytes in the converted data. The converted data itself is * stored in the location given by dst.
* NB. If the value stored in dst has been changed, it is a pointer to * allocated memory which the caller is responsible for freeing, and the * caller is still responsible for freeing the original buffer. *

*/
BOOL
GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
  unsigned int slen, NSStringEncoding enc, NSZone *zone,
  unsigned int options)
{
  unsigned char buf[BUFSIZ];
  unsigned char *ptr;
  unsigned      bsize;
  unsigned      dpos = 0;       // Offset into destination buffer.
  unsigned      spos = 0;       // Offset into source buffer.
  unsigned      extra = (options & GSUniTerminate) ? 1 : 0;
  BOOL          strict = (options & GSUniStrict) ? YES : NO;
  unichar       base = 0;
  _ucc_         *table = 0;
  unsigned      tsize = 0;
  unsigned char escape = 0;
  _ucc_         *etable = 0;
  unsigned      etsize = 0;
  _ucc_         *ltable = 0;
  unsigned      ltsize = 0;
  BOOL          swapped = NO;
  BOOL          result = YES;
#ifdef HAVE_ICONV
  /* Held at function scope so that every exit through the 'done' label
   * (including those taken inside GROW()) releases the converter.
   */
  iconv_t       cd = (iconv_t)-1;
#endif

  if (options & GSUniBOM)
    {
      if (slen == 0)
        {
          *size = 0;
          result = NO;  // Missing byte order marker.
        }
      else
        {
          unichar       c;

          c = *src++;
          slen--;
          if (c != 0xFEFF)
            {
              if (c == 0xFFFE)
                {
                  swapped = YES;
                }
              else
                {
                  *size = 0;
                  result = NO;  // Illegal byte order marker.
                }
            }
        }
    }

  /*
   * Ensure we have an initial buffer set up to decode data into.
   */
  if (dst == 0 || *size == 0)
    {
      ptr = buf;
      bsize = (extra != 0) ? BUFSIZ - 1 : BUFSIZ;
    }
  else
    {
      ptr = *dst;
      bsize = *size;
    }

  if (result == NO)
    {
      goto done;
    }

#ifdef HAVE_ICONV
  if (strict == NO
    && enc != NSUTF8StringEncoding
    && enc != NSGSM0338StringEncoding)
    {
      goto iconv_start; // For lossy conversion
    }
#endif

  switch (enc)
    {
      case NSUTF8StringEncoding:
        {
          while (spos < slen)
            {
              unichar           u1, u2;
              unsigned char     reversed[8];
              unsigned long     u;
              int               sl;
              int               i;

              /* get first unichar */
              u1 = src[spos++];
              if (swapped == YES)
                {
                  u1 = (((u1 & 0xff00) >> 8) + ((u1 & 0x00ff) << 8));
                }

              /* Fast track ... if this is actually an ascii character
               * it just converts straight to utf-8
               */
              if (u1 <= 0x7f)
                {
                  if (dpos >= bsize)
                    {
                      GROW();
                    }
                  ptr[dpos++] = (unsigned char)u1;
                  continue;
                }

              // 0xfeff is a zero-width-no-break-space inside text
              if (u1 == 0xfffe                          // unexpected BOM
                || u1 == 0xffff                         // not a character
                || (u1 >= 0xfdd0 && u1 <= 0xfdef)       // invalid character
                || (u1 >= 0xdc00 && u1 <= 0xdfff))      // bad pairing
                {
                  if (strict)
                    {
                      result = NO;
                      goto done;
                    }
                  continue;     // Skip invalid character.
                }

              /* possibly get second character and calculate 'u' */
              if ((u1 >= 0xd800) && (u1 < 0xdc00))
                {
                  if (spos >= slen)
                    {
                      if (strict)
                        {
                          result = NO;
                          goto done;
                        }
                      continue; // At end.
                    }

                  /* get second unichar */
                  u2 = src[spos++];
                  if (swapped == YES)
                    {
                      u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
                    }

                  /* The second half of a pair must lie in 0xdc00-0xdfff.
                   * Anything outside that range means the first half was
                   * unpaired; step back so the unit we just read is
                   * processed as the start of the next character.
                   */
                  if ((u2 < 0xdc00) || (u2 > 0xdfff))
                    {
                      spos--;
                      if (strict)
                        {
                          result = NO;
                          goto done;
                        }
                      continue; // Skip bad half of surrogate pair.
                    }

                  /* make the full value */
                  u = ((unsigned long)(u1 - 0xd800) * 0x400)
                    + (u2 - 0xdc00) + 0x10000;
                }
              else
                {
                  u = u1;
                }

              /* calculate the sequence length
               * a length of 1 was dealt with earlier
               */
              if (u <= 0x7ff)
                {
                  sl = 2;
                }
              else if (u <= 0xffff)
                {
                  sl = 3;
                }
              else if (u <= 0x1fffff)
                {
                  sl = 4;
                }
              else if (u <= 0x3ffffff)
                {
                  sl = 5;
                }
              else
                {
                  sl = 6;
                }

              /* make sure we have enough space for it */
              while (dpos + sl >= bsize)
                {
                  GROW();
                }

              /* split value into reversed array */
              for (i = 0; i < sl; i++)
                {
                  reversed[i] = (u & 0x3f);
                  u = u >> 6;
                }

              ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
              /* add bytes into the output sequence */
              for (i = sl - 2; i >= 0; i--)
                {
                  ptr[dpos++] = reversed[i] | 0x80;
                }
            }
        }
        break;

      case NSNonLossyASCIIStringEncoding:
      case NSASCIIStringEncoding:
        base = 128;
        goto bases;

      case NSISOLatin1StringEncoding:
      case NSUnicodeStringEncoding:
        base = 256;
        goto bases;

bases:
        if (dst == 0)
          {
            /* Just counting bytes, and we know there is exactly one
             * byte produced for each source character, so nothing is
             * written to the local buffer here.
             */
            if (strict == NO)
              {
                dpos = slen;
              }
            else
              {
                /* A strict conversion must still fail on the first
                 * character which can't be represented.
                 */
                while (spos < slen)
                  {
                    unichar     u = src[spos++];

                    if (swapped == YES)
                      {
                        u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
                      }
                    if (u >= base)
                      {
                        result = NO;
                        goto done;
                      }
                    dpos++;
                  }
              }
            break;
          }

        /* Because we know that each source character produces exactly
         * one byte, we can check the destination buffer size and
         * allocate more space in one go, before entering the loop
         * where we deal with each character.
         */
        if (slen > bsize)
          {
            if (zone == 0)
              {
                result = NO; /* No buffer growth possible ... fail. */
                goto done;
              }
            else
              {
                uint8_t *tmp;

                /* Allow room for the terminator written at 'done'. */
#if     GS_WITH_GC
                tmp = NSAllocateCollectable(slen + extra, 0);
#else
                tmp = NSZoneMalloc(zone, slen + extra);
                if (ptr != buf && ptr != *dst)
                  {
                    NSZoneFree(zone, ptr);
                  }
#endif
                ptr = tmp;
                if (ptr == 0)
                  {
                    return NO;  /* Not enough memory */
                  }
                bsize = slen;
              }
          }

        while (spos < slen)
          {
            unichar     u = src[spos++];

            if (swapped == YES)
              {
                u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
              }
            if (u < base)
              {
                ptr[dpos++] = (unsigned char)u;
              }
            else if (strict == NO)
              {
                ptr[dpos++] = '?';
              }
            else
              {
                result = NO;
                goto done;
              }
          }
        break;

      case NSNEXTSTEPStringEncoding:
        base = Next_conv_base;
        table = Next_uni_to_char_table;
        tsize = Next_uni_to_char_table_size;
        goto tables;

      case NSISOCyrillicStringEncoding:
        base = Cyrillic_conv_base;
        table = Cyrillic_uni_to_char_table;
        tsize = Cyrillic_uni_to_char_table_size;
        goto tables;

      case NSISOLatin2StringEncoding:
        base = Latin2_conv_base;
        table = Latin2_uni_to_char_table;
        tsize = Latin2_uni_to_char_table_size;
        goto tables;

      case NSISOLatin9StringEncoding:
        base = Latin9_conv_base;
        table = Latin9_uni_to_char_table;
        tsize = Latin9_uni_to_char_table_size;
        goto tables;

      case NSISOThaiStringEncoding:
        base = Thai_conv_base;
        table = Thai_uni_to_char_table;
        tsize = Thai_uni_to_char_table_size;
        goto tables;

#if 0
      case NSSymbolStringEncoding:
        base = Symbol_conv_base;
        table = Symbol_uni_to_char_table;
        tsize = Symbol_uni_to_char_table_size;
        goto tables;
#endif

      case NSGSM0338StringEncoding:
        base = 0;
        table = GSM0338_uni_to_char_table;
        tsize = GSM0338_tsize;
        escape = 0x1b;
        etable = GSM0338_escapes;
        etsize = GSM0338_esize;
        if (strict == NO)
          {
            ltable = GSM0338_lossy;
            ltsize = GSM0338_lsize;
          }
        goto tables;

tables:
        while (spos < slen)
          {
            unichar     u = src[spos++];
            int         i;

            /* Swap byte order if necessary */
            if (swapped == YES)
              {
                u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
              }

            /* Grow output buffer to make room if necessary */
            if (dpos >= bsize)
              {
                GROW();
              }

            if (u < base)
              {
                /*
                 * The character set has a lower section whose contents
                 * are identical to unicode, so no mapping is needed.
                 */
                ptr[dpos++] = (unsigned char)u;
              }
            else if (table != 0 && (i = chop(u, table, tsize)) >= 0)
              {
                /*
                 * The character mapping is found in a basic table.
                 */
                ptr[dpos++] = table[i].to;
              }
            else if (etable != 0 && (i = chop(u, etable, etsize)) >= 0)
              {
                /*
                 * The character mapping is found in a table of simple
                 * escape sequences consisting of an escape byte followed
                 * by another single byte.
                 */
                ptr[dpos++] = escape;
                if (dpos >= bsize)
                  {
                    GROW();
                  }
                ptr[dpos++] = etable[i].to;
              }
            else if (ltable != 0 && (i = chop(u, ltable, ltsize)) >= 0)
              {
                /*
                 * The character is found in a lossy mapping table.
                 */
                ptr[dpos++] = ltable[i].to;
              }
            else if (strict == NO)
              {
                /*
                 * The default lossy mapping generates a question mark.
                 */
                ptr[dpos++] = '?';
              }
            else
              {
                /*
                 * No mapping has been found.
                 */
                result = NO;
                spos = slen;
                goto done;
              }
          }
        break;

      default:
#ifdef HAVE_ICONV
iconv_start:
        {
          struct _strenc_       *encInfo;
          unsigned char         *inbuf;
          unsigned char         *outbuf;
          size_t                inbytesleft;
          size_t                outbytesleft;
          size_t                rval;
          const char            *estr = 0;
          BOOL                  flushed = NO;

          if ((encInfo = EntrySupported(enc)) != 0)
            {
              if (strict == NO)
                {
                  /*
                   * Try to transliterate where no direct conversion
                   * is available.
                   */
                  estr = encInfo->lossy;
                }
              if (estr == 0)
                {
                  estr = encInfo->iconv;
                }
            }

          /* explicitly check for empty encoding name since some systems
           * have buggy iconv_open() code which succeeds on an empty name.
           */
          if (estr == 0)
            {
              NSLog(@"No iconv for encoding x%02x", enc);
              result = NO;
              goto done;
            }
          if (slen == 0)
            {
              break;    // Nothing to convert.
            }
          cd = iconv_open(estr, UNICODE_ENC);
          if (cd == (iconv_t)-1)
            {
              NSLog(@"No iconv for encoding %@ tried to use %s",
                GSPrivateEncodingName(enc), estr);
              result = NO;
              goto done;
            }

          inbuf = (unsigned char*)src;
          inbytesleft = slen * sizeof(unichar);
          outbuf = (unsigned char*)ptr;
          outbytesleft = bsize;
          do
            {
              if (inbytesleft == 0)
                {
                  flushed = YES; // Flush buffer
                  rval = iconv(cd, 0, 0, (void*)&outbuf, &outbytesleft);
                }
              else
                {
                  rval = iconv(cd,
                    (void*)&inbuf, &inbytesleft, (void*)&outbuf, &outbytesleft);
                }
              dpos = bsize - outbytesleft;
              if (rval != 0)
                {
                  if (rval == (size_t)-1)
                    {
                      if (errno == E2BIG)
                        {
                          unsigned      old = bsize;

                          GROW();
                          outbuf = (unsigned char*)&ptr[dpos];
                          outbytesleft += (bsize - old);
                        }
                      else if (errno == EILSEQ)
                        {
                          if (strict == YES)
                            {
                              result = NO;
                              goto done;
                            }
                          /*
                           * If we are allowing lossy conversion, we replace
                           * any unconvertable character with a question mark.
                           */
                          if (outbytesleft > 0)
                            {
                              *outbuf++ = '?';
                              outbytesleft--;
                              inbuf += sizeof(unichar);
                              inbytesleft -= sizeof(unichar);
                            }
                        }
                      else
                        {
                          result = NO;
                          goto done;
                        }
                    }
                  else if (strict == YES)
                    {
                      /*
                       * A positive return from iconv indicates some
                       * irreversible (ie lossy) conversions took place,
                       * so if we are doing strict conversions we must fail.
                       */
                      result = NO;
                      goto done;
                    }
                }
            } while (!flushed || rval != 0);
          /* The converter is closed at the 'done' label. */
        }
#else
        result = NO;
        goto done;
#endif
    }

done:

#ifdef HAVE_ICONV
  /* Close the converter on every exit path which reaches this label. */
  if (cd != (iconv_t)-1)
    {
      iconv_close(cd);
    }
#endif

  /*
   * Post conversion ... terminate if needed, and set output values.
   * When dst is null the data is being discarded, and ptr/dpos may not
   * describe a real writable location, so no terminator is stored.
   */
  if (extra != 0 && dst != 0)
    {
      ptr[dpos] = (unsigned char)0;
    }
  *size = dpos;
  if (dst != 0 && (result == YES || (options & GSUniShortOk)))
    {
      if (options & GSUniTemporary)
        {
          unsigned      bytes = dpos + extra;
          void          *r;

          /*
           * Temporary string was requested ... make one.
           */
#if     GS_WITH_GC
          r = NSAllocateCollectable(bytes, 0);
          memcpy(r, ptr, bytes);
#else
          r = GSAutoreleasedBuffer(bytes);
          memcpy(r, ptr, bytes);
          if (ptr != buf && ptr != *dst)
            {
              NSZoneFree(zone, ptr);
            }
#endif
          ptr = r;
          *dst = ptr;
        }
      else if (zone != 0 && (ptr == buf || bsize > dpos))
        {
          unsigned      bytes = dpos + extra;

          /*
           * Resizing is permitted - try ensure we return a buffer
           * which is just big enough to hold the converted string.
           */
          if (ptr == buf || ptr == *dst)
            {
              unsigned char     *tmp;

#if     GS_WITH_GC
              tmp = NSAllocateCollectable(bytes, 0);
#else
              tmp = NSZoneMalloc(zone, bytes);
#endif
              if (tmp != 0)
                {
                  memcpy(tmp, ptr, bytes);
                }
              ptr = tmp;
            }
          else
            {
#if     GS_WITH_GC
              ptr = NSReallocateCollectable(ptr, bytes, 0);
#else
              ptr = NSZoneRealloc(zone, ptr, bytes);
#endif
            }
          *dst = ptr;
        }
      else if (ptr == buf)
        {
          ptr = NULL;
          result = NO;
        }
      else
        {
          *dst = ptr;
        }
    }
#if     !GS_WITH_GC
  else if (ptr != buf && dst != 0 && ptr != *dst)
    {
      /* dst must be checked before it is dereferenced: when only
       * counting, GROW() moves ptr away from buf without allocating.
       */
      NSZoneFree(zone, ptr);
    }
#endif

  if (dst)
    {
      NSCAssert(*dst != buf,
        @"attempted to pass out pointer to internal buffer");
    }

  return result;
}

#undef  GROW


/**
 * Returns a zero terminated array containing every encoding number in
 * the range 0 to encTableSize (inclusive) for which
 * GSPrivateIsEncodingSupported() reports YES.<br />
 * The array is built once, while holding local_lock, and cached in
 * _availableEncodings for all later calls.
 */
NSStringEncoding*
GSPrivateAvailableEncodings()
{
  if (_availableEncodings == 0)
    {
      GSSetupEncodingTable();
      [GS_INITIALIZED_LOCK(local_lock, GSLazyLock) lock];
      if (_availableEncodings == 0)
        {
          NSStringEncoding      *encodings;
          unsigned              pos;
          unsigned              i;

          /*
           * Now build up a list of supported encodings ... in the
           * format needed to support [NSString+availableStringEncodings]
           * Check to see what iconv support we have as we go along.
           * This is also the place where we determine the name we use
           * for iconv to support unicode.
           *
           * The loop below examines encTableSize+1 candidates, so we
           * need that many slots plus one for the terminating zero.
           */
          encodings
            = objc_malloc(sizeof(NSStringEncoding) * (encTableSize+2));
          pos = 0;
          for (i = 0; i < encTableSize+1; i++)
            {
              if (GSPrivateIsEncodingSupported(i) == YES)
                {
                  encodings[pos++] = i;
                }
            }
          encodings[pos] = 0;
          _availableEncodings = encodings;
        }
      [local_lock unlock];
    }
  return _availableEncodings;
}

/**
 * Returns the default C string encoding, working it out (under
 * local_lock) the first time it is needed and caching it in defEnc.<br />
 * The GNUSTEP_STRING_ENCODING environment variable takes precedence if
 * it matches the name or iconv name of an entry in str_encoding_table.
 * Otherwise the native encoding is used; where nl_langinfo(CODESET) is
 * available that is derived from the locale codeset name.<br />
 * An unrecognised, unset or unsupported encoding falls back to
 * NSISOLatin1StringEncoding.  As a side effect natEnc is set if it was
 * not already known.
 */
NSStringEncoding
GSPrivateDefaultCStringEncoding()
{
  if (defEnc == GSUndefinedEncoding)
    {
      char              *encoding;
#if HAVE_LANGINFO_CODESET
      char              encbuf[BUFSIZ];
#endif
      unsigned int      count;

      GSSetupEncodingTable();

      [GS_INITIALIZED_LOCK(local_lock, GSLazyLock) lock];
      if (defEnc != GSUndefinedEncoding)
        {
          [local_lock unlock];
          return defEnc;
        }

      if (natEnc == GSUndefinedEncoding)
        {
          /* Encoding not set */
#if HAVE_LANGINFO_CODESET
          /* Take it from the system locale information.  */
          [gnustep_global_lock lock];
          strncpy(encbuf, nl_langinfo(CODESET), sizeof(encbuf)-1);
          [gnustep_global_lock unlock];
          encbuf[sizeof(encbuf)-1] = '\0';
          encoding = encbuf;

          /*
           * First handle the fallback response from nl_langinfo() ...
           * if we are getting the default value we can't assume that
           * the user has set anything up at all, so we must use the
           * OpenStep/GNUstep default encoding ... latin1, even though
           * the nl_langinfo() stuff would say default is ascii.
           */
          if (strcmp(encoding, "ANSI_X3.4-1968") == 0 /* glibc */
            || strcmp(encoding, "ISO_646.IRV:1983") == 0 /* glibc */
            || strcmp(encoding, "646") == 0 /* Solaris NetBSD */)
            natEnc = NSISOLatin1StringEncoding;
          else if (strcmp(encoding, "EUC-JP") == 0 /* glibc */
            /* HP-UX IRIX OSF/1 Solaris NetBSD */
            || strcmp(encoding, "eucJP") == 0
            || strcmp(encoding, "IBM-eucJP") == 0 /* AIX */)
            natEnc = NSJapaneseEUCStringEncoding;
          else if (strcmp(encoding, "UTF-8") == 0 /* glibc AIX OSF/1 Solaris */
            || strcmp(encoding, "utf8") == 0 /* HP-UX */)
            natEnc = NSUTF8StringEncoding;
          else if (strcmp(encoding, "ISO-8859-1") == 0 /* glibc */
            /* AIX IRIX OSF/1 Solaris NetBSD */
            || strcmp(encoding, "ISO8859-1") == 0
            || strcmp(encoding, "iso88591") == 0 /* HP-UX */)
            natEnc = NSISOLatin1StringEncoding;
          else if (strcmp(encoding, "IBM-932") == 0 /* AIX */
            || strcmp(encoding, "SJIS") == 0 /* HP-UX OSF/1 NetBSD */
            || strcmp(encoding, "PCK") == 0 /* Solaris */)
            natEnc = NSShiftJISStringEncoding;
          else if (strcmp(encoding, "ISO-8859-2") == 0 /* glibc */
            /* AIX IRIX OSF/1 Solaris NetBSD */
            || strcmp(encoding, "ISO8859-2") == 0
            || strcmp(encoding, "iso88592") == 0 /* HP-UX */)
            natEnc = NSISOLatin2StringEncoding;
          else if (strcmp(encoding, "CP1251") == 0 /* glibc */
            || strcmp(encoding, "ansi-1251") == 0 /* Solaris */)
            natEnc = NSWindowsCP1251StringEncoding;
          else if (strcmp(encoding, "CP1252") == 0 /*  */
            || strcmp(encoding, "IBM-1252") == 0 /* AIX */)
            natEnc = NSWindowsCP1252StringEncoding;
          else if (strcmp(encoding, "ISO-8859-5") == 0 /* glibc */
            /* AIX IRIX OSF/1 Solaris NetBSD */
            || strcmp(encoding, "ISO8859-5") == 0
            || strcmp(encoding, "iso88595") == 0 /* HP-UX */)
            natEnc = NSISOCyrillicStringEncoding;
          else if (strcmp(encoding, "KOI8-R") == 0 /* glibc */
            || strcmp(encoding, "koi8-r") == 0 /* Solaris */)
            natEnc = NSKOI8RStringEncoding;
          else if (strcmp(encoding, "ISO-8859-3") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-3") == 0 /* Solaris */)
            natEnc = NSISOLatin3StringEncoding;
          else if (strcmp(encoding, "ISO-8859-4") == 0 /*  */
            || strcmp(encoding, "ISO8859-4") == 0 /* OSF/1 Solaris NetBSD */)
            natEnc = NSISOLatin4StringEncoding;
          else if (strcmp(encoding, "ISO-8859-6") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-6") == 0 /* AIX Solaris */
            || strcmp(encoding, "iso88596") == 0 /* HP-UX */)
            natEnc = NSISOArabicStringEncoding;
          else if (strcmp(encoding, "ISO-8859-7") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-7") == 0 /* AIX IRIX OSF/1 Solaris */
            || strcmp(encoding, "iso88597") == 0 /* HP-UX */)
            natEnc = NSISOGreekStringEncoding;
          else if (strcmp(encoding, "ISO-8859-8") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-8") == 0 /* AIX OSF/1 Solaris */
            || strcmp(encoding, "iso88598") == 0 /* HP-UX */)
            natEnc = NSISOHebrewStringEncoding;
          else if (strcmp(encoding, "ISO-8859-9") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-9") == 0 /* AIX IRIX OSF/1 Solaris */
            || strcmp(encoding, "iso88599") == 0 /* HP-UX */)
            natEnc = NSISOLatin5StringEncoding;
          else if (strcmp(encoding, "ISO-8859-10") == 0 /*  */
            || strcmp(encoding, "ISO8859-10") == 0 /*  */)
            natEnc = NSISOLatin6StringEncoding;
          else if (strcmp(encoding, "TIS-620") == 0 /* glibc AIX */
            || strcmp(encoding, "tis620") == 0 /* HP-UX */
            || strcmp(encoding, "TIS620.2533") == 0 /* Solaris */
            || strcmp(encoding, "TACTIS") == 0 /* OSF/1 */)
            natEnc = NSISOThaiStringEncoding;
          else if (strcmp(encoding, "ISO-8859-13") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-13") == 0 /*  */
            || strcmp(encoding, "IBM-921") == 0 /* AIX */)
            natEnc = NSISOLatin7StringEncoding;
          else if (strcmp(encoding, "ISO-8859-14") == 0 /* glibc */
            || strcmp(encoding, "ISO8859-14") == 0 /*  */)
            natEnc = NSISOLatin8StringEncoding;
          else if (strcmp(encoding, "ISO-8859-15") == 0 /* glibc */
            /* AIX OSF/1 Solaris NetBSD */
            || strcmp(encoding, "ISO8859-15") == 0
            || strcmp(encoding, "iso885915") == 0 /* HP-UX */)
            natEnc = NSISOLatin9StringEncoding;
          else if (strcmp(encoding, "GB2312") == 0 /* glibc */
            || strcmp(encoding, "gb2312") == 0 /* Solaris */
            || strcmp(encoding, "eucCN") == 0 /* IRIX NetBSD */
            || strcmp(encoding, "IBM-eucCN") == 0 /* AIX */
            || strcmp(encoding, "hp15CN") == 0 /* HP-UX */)
            natEnc = NSGB2312StringEncoding;
          else if (strcmp(encoding, "BIG5") == 0 /* glibc Solaris NetBSD */
            || strcmp(encoding, "big5") == 0 /* AIX HP-UX OSF/1 */)
            natEnc = NSBIG5StringEncoding;
          else if (strcmp(encoding, "EUC-KR") == 0 /* glibc */
            || strcmp(encoding, "eucKR") == 0 /* HP-UX IRIX OSF/1 NetBSD */
            || strcmp(encoding, "IBM-eucKR") == 0 /* AIX */
            || strcmp(encoding, "5601") == 0 /* Solaris */)
            natEnc = NSKoreanEUCStringEncoding;
#endif
        }

      encoding = getenv("GNUSTEP_STRING_ENCODING");
      if (encoding != 0)
        {
          count = 0;
          while (str_encoding_table[count].enc
            && strcasecmp(str_encoding_table[count].ename, encoding)
            && strcasecmp(str_encoding_table[count].iconv, encoding))
            {
              count++;
            }
          if (str_encoding_table[count].enc)
            {
              defEnc = str_encoding_table[count].enc;
            }
          else
            {
              fprintf(stderr,
                "WARNING: %s - encoding not supported.\n", encoding);
              fprintf(stderr,
                " NSISOLatin1StringEncoding set as default.\n");
              defEnc = NSISOLatin1StringEncoding;
            }
        }
      if (defEnc == GSUndefinedEncoding)
        {
          defEnc = natEnc;
        }
      if (defEnc == GSUndefinedEncoding)
        {
          defEnc = NSISOLatin1StringEncoding;
        }
      else if (GSPrivateIsEncodingSupported(defEnc) == NO)
        {
          /* The environment variable may be unset here (the encoding
           * having come from the locale), so never hand a null pointer
           * to the %s conversion.
           */
          fprintf(stderr, "WARNING: %s - encoding not implemented as "
            "default c string encoding.\n",
            (encoding != 0) ? encoding : "locale default");
          fprintf(stderr,
            " NSISOLatin1StringEncoding set as default.\n");
          defEnc = NSISOLatin1StringEncoding;
        }
      if (natEnc == GSUndefinedEncoding)
        {
          natEnc = defEnc;
        }
      [local_lock unlock];
    }
  return defEnc;
}

/**
 * Returns the name recorded (as ename) in the encoding table entry for
 * the supplied encoding, or @"Unknown encoding" if EntrySupported()
 * has no entry for it.
 */
NSString*
GSPrivateEncodingName(NSStringEncoding encoding)
{
  struct _strenc_       *encInfo;

  if ((encInfo = EntrySupported(encoding)) == 0)
    {
      return @"Unknown encoding";
    }
  return [NSString stringWithUTF8String: encInfo->ename];
}

/**
 * Returns the eightBit flag from the encoding table entry for the
 * supplied encoding, or NO if EntrySupported() has no entry for it.
 */
BOOL
GSPrivateIsByteEncoding(NSStringEncoding encoding)
{
  struct _strenc_       *encInfo;

  if ((encInfo = EntrySupported(encoding)) == 0)
    {
      return NO;
    }
  return encInfo->eightBit;
}

/**
 * Returns the native C string encoding (natEnc), first calling
 * GSPrivateDefaultCStringEncoding() to establish it if it has not yet
 * been determined.
 */
NSStringEncoding
GSPrivateNativeCStringEncoding()
{
  if (natEnc == GSUndefinedEncoding)
    {
      /* GSPrivateDefaultCStringEncoding() will actually set the encoding.
       */
      GSPrivateDefaultCStringEncoding();
    }
  return natEnc;
}