diff --git a/ChangeLog b/ChangeLog index 474950179..0c2741bcd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2003-07-09 02:41 Alexander Malmberg + + * Source/GSString.m (getCString_u): Rewrite to correctly handle + all encodings, the range argument, and the leftoverRange attribute. + + * Source/Unicode.m (GSToUnicode): Clarify the documentation (and fix + a few typos in it). + (GSFromUnicode): Clarify the documentation. Signal failure correctly + from the utf8 encoder. Make sure dst isn't set to NULL if zone is + NULL but *size is 0. + 2003-07-08 Richard Frith-Macdonald * Source/NSString.m: Avoid using atof() in parsing plists ... we diff --git a/Source/Additions/Unicode.m b/Source/Additions/Unicode.m index 2ee2583b9..e0380beab 100644 --- a/Source/Additions/Unicode.m +++ b/Source/Additions/Unicode.m @@ -939,7 +939,7 @@ else \ } /** - * Function to convert from 8-bit character data to 16-bit unicode. + * Function to convert from 8-bit data to 16-bit unicode characters. *

The dst argument is a pointer to a pointer to a buffer in which the * converted string is to be stored. If it is a null pointer, this function * discards converted data, and is used only to determine the length of the @@ -949,17 +949,18 @@ else \ *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be - * altered to the new size. This is measured in characters, not bytes. + * altered to the new size. This is measured in 16-bit unicode characters, + * not bytes. *

- *

The src argument is a pointer to the 8-bit character string which is + *

The src argument is a pointer to the byte sequence which is * to be converted to 16-bit unicode. *

- *

The slen argument is the length (bytes) of the 8-bit character string + *

The slen argument is the length of the byte sequence * which is to be converted to 16-bit unicode. - * This is measured in characters, not bytes. + * This is measured in bytes. *

- *

The end argument specifies the encoding type of the 8-bit character - * string which is to be converted to 16-bit unicode. + *

The enc argument specifies the encoding type of the 8-bit byte sequence + * which is to be converted to 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may * allocate a buffer to return data in. @@ -971,7 +972,7 @@ else \ * * If GSUniTerminate is set, the function is expected to null terminate * the output string, and will assume that it is safe to place the nul - * just beyond the ned of the stated buffer size. + * just beyond the end of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra * termination character. * If GSUniTemporary is set, the function will return the results in @@ -1033,8 +1034,6 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src, { case NSUTF8StringEncoding: { - result = YES; - while (spos < slen) { unsigned char c = src[spos++]; @@ -1452,27 +1451,27 @@ static inline int chop(unichar c, _ucc_ *table, int hi) } /** - * Function to convert from 16-bit unicode to 8-bit character data. + * Function to convert from 16-bit unicode to 8-bit data. *

The dst argument is a pointer to a pointer to a buffer in which the - * converted string is to be stored. If it is a null pointer, this function + * converted data is to be stored. If it is a null pointer, this function * discards converted data, and is used only to determine the length of the - * converted string. If the zone argument is non-nul, the function is free + * converted data. If the zone argument is non-null, the function is free * to allocate a larger buffer if necessary, and store this new buffer in * the dst argument. It will *NOT* deallocate the original buffer! *

*

The size argument is a pointer to the initial size of the destination * buffer. If the function changes the buffer size, this value will be - * altered to the new size. This is measured in characters, not bytes. + * altered to the new size. This is measured in bytes. *

*

The src argument is a pointer to the 16-bit unicode string which is * to be converted to 8-bit data. *

- *

The slen argument is the length (bytes) of the 16-bit unicode string + *

The slen argument is the length of the 16-bit unicode string * which is to be converted to 8-bit data. - * This is measured in characters, not bytes. + * This is measured in 16-bit characters, not bytes. *

- *

The end argument specifies the encoding type of the 8-bit character - * string which is to be produced from the 16-bit unicode. + *

The enc argument specifies the encoding type of the 8-bit byte sequence + * which is to be produced from the 16-bit unicode. *

*

The zone argument specifies a memory zone in which the function may * allocate a buffer to return data in. @@ -1483,13 +1482,13 @@ static inline int chop(unichar c, _ucc_ *table, int hi) * The options argument controls some special behavior. * * If GSUniStrict is set, the function will fail if a character is - * encountered which can't be displayed in the source. Otherwise, some + * encountered in the source which can't be converted. Otherwise, some * approximation or marker will be placed in the destination. - * If GSUniTerminate is set, the function is expected to null terminate - * the output string, and will assume that it is safe to place the nul - * just beyond the ned of the stated buffer size. + * If GSUniTerminate is set, the function is expected to nul terminate + * the output data, and will assume that it is safe to place the nul + * just beyond the end of the stated buffer size. * Also, if the function grows the buffer, it will allow for an extra - * termination character. + * termination byte. * If GSUniTemporary is set, the function will return the results in * an autoreleased buffer rather than in a buffer that the caller must * release. @@ -1500,8 +1499,8 @@ static inline int chop(unichar c, _ucc_ *table, int hi) * *

On return, the function result is a flag indicating success (YES) * or failure (NO), and on success, the value stored in size is the number - * of characters in the converted string. The converted string itsself is - * stored in the location gioven by dst.
+ * of bytes in the converted data. The converted data itself is + * stored in the location given by dst.
* NB. If the value stored in dst has been changed, it is a pointer to * allocated memory which the caller is responsible for freeing, and the * caller is still responsible for freeing the original buffer. @@ -1620,7 +1619,6 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src, ptr[dpos++] = (u & 0x3f) | 0x80; } } - result = YES; } break; @@ -1946,6 +1944,7 @@ tables: NSZoneFree(zone, ptr); } ptr = r; + *dst = ptr; } else if (zone != 0 && (ptr == buf || bsize > dpos)) { @@ -1970,13 +1969,13 @@ tables: { ptr = NSZoneRealloc(zone, ptr, bytes); } + *dst = ptr; } else if (ptr == buf) { ptr = NULL; result = NO; } - *dst = ptr; } else if (ptr != buf && dst != 0 && ptr != *dst) { diff --git a/Source/GSString.m b/Source/GSString.m index ed5fe6b60..5b436c2ae 100644 --- a/Source/GSString.m +++ b/Source/GSString.m @@ -1123,38 +1123,82 @@ static inline void getCString_u(ivars self, char *buffer, unsigned int maxLength, NSRange aRange, NSRange *leftoverRange) { - unsigned int len; + /* The primitive we have for converting from unicode, GSFromUnicode, + can't deal with our leftoverRange case, so we need to use a bit of + complexity instead. */ + unsigned int len; - if (maxLength > self->_count) + /* TODO: this is an extremely ugly hack to work around buggy iconvs + that return -1/E2BIG for buffers larger than 0x40000acf */ + if (maxLength > 0x40000000) + maxLength = 0x40000000; + + /* First, try converting the whole thing. 
*/ + len = maxLength; + if (GSFromUnicode((unsigned char **)&buffer, &len, + self->_contents.u + aRange.location, aRange.length, + defEnc, 0, GSUniTerminate | GSUniStrict) == YES) { - maxLength = self->_count; - } - if (maxLength < aRange.length) - { - len = maxLength; - if (leftoverRange != 0) - { - leftoverRange->location = aRange.location + maxLength; - leftoverRange->length = aRange.length - maxLength; - } - } - else - { - len = aRange.length; - if (leftoverRange != 0) - { - leftoverRange->location = 0; - leftoverRange->length = 0; - } + if (leftoverRange) + leftoverRange->location = leftoverRange->length = 0; + return; } - if (GSFromUnicode((unsigned char **)&buffer, &len, self->_contents.u, len, - defEnc, 0, GSUniTerminate | GSUniStrict) == NO) + /* The conversion failed. Either the buffer is too small for the whole + range, or there are characters in it we can't convert. Check for + unconvertible characters first. */ + len = 0; + if (GSFromUnicode(NULL, &len, + self->_contents.u + aRange.location, aRange.length, + defEnc, 0, GSUniTerminate | GSUniStrict) == NO) { [NSException raise: NSCharacterConversionException format: @"Can't get cString from Unicode string."]; + return; } - buffer[len] = '\0'; + + /* The string can be converted, but not all of it. Do a binary search + to find the longest subrange that fits in the buffer. */ + { + unsigned int lo, hi, mid; + + lo = 0; + hi = aRange.length; + while (lo < hi) + { + mid = (lo + hi + 1) / 2; /* round up to get edge case right */ + len = maxLength; + if (GSFromUnicode((unsigned char **)&buffer, &len, + self->_contents.u + aRange.location, mid, + defEnc, 0, GSUniTerminate | GSUniStrict) == YES) + { + lo = mid; + } + else + { + hi = mid - 1; + } + } + + /* lo==hi characters fit. Do the real conversion. 
*/ + len = maxLength; + if (lo == 0) + { + buffer[0] = 0; + } + else if (GSFromUnicode((unsigned char **)&buffer, &len, + self->_contents.u + aRange.location, lo, + defEnc, 0, GSUniTerminate | GSUniStrict) == NO) + { + NSCAssert(NO, @"binary search gave inconsistent results"); + } + + if (leftoverRange) + { + leftoverRange->location = aRange.location + lo; + leftoverRange->length = NSMaxRange(aRange) - leftoverRange->location; + } + } } static inline int