From 3847c54f3f0d6faee0ea508f1632a23df6a8cf56 Mon Sep 17 00:00:00 2001 From: David Chisnall Date: Thu, 27 Dec 2018 13:42:12 +0000 Subject: [PATCH] Add NSConstantString implementation for v2 ABI. This is largely the version from the newabi branch, but with a few cleanups made possible by other bug fixes in the GSString implementations. --- Headers/Foundation/NSString.h | 41 +++++++++ Source/GSString.m | 155 ++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) diff --git a/Headers/Foundation/NSString.h b/Headers/Foundation/NSString.h index b13acd344..95f00c7a1 100644 --- a/Headers/Foundation/NSString.h +++ b/Headers/Foundation/NSString.h @@ -881,6 +881,12 @@ typedef NSUInteger NSStringEncodingConversionOptions; @end +#ifdef __OBJC_GNUSTEP_RUNTIME_ABI__ +# if __OBJC_GNUSTEP_RUNTIME_ABI__ >= 20 +# define GNUSTEP_NEW_STRING_ABI +# endif +#endif + /** *

The NXConstantString class is used to hold constant 8-bit character * string objects produced by the compiler where it sees @"..." in the @@ -908,8 +914,43 @@ typedef NSUInteger NSStringEncodingConversionOptions; @interface NXConstantString : NSString { @public +#ifdef GNUSTEP_NEW_STRING_ABI + /** + * Flags. The low 16 bits are reserved for the compiler, the top 16 for use + * by the Foundation Framework. The compiler currently uses only the low 2, + * which indicate the encoding of the string, with the following values: + * + * 0. ASCII (UTF-8 using only 7-bit characters) + * 1. UTF-8 + * 2. UTF-16 + * 3. UTF-32 + * GSString.m additionally tests and sets bit 16 (CONSTANT_STRING_HAS_HASH). + */ + uint32_t flags; + /** + * The number of characters (UTF-16 code units) in the string. + */ + uint32_t nxcslen; + /** + * The number of bytes in the string. For fixed-length encodings, this is a + * fixed multiple of nxcslen, but for UTF-8 it can be different. + */ + uint32_t size; + /** + * Hash value. NOTE(review): presumably only meaningful once bit 16 of flags is set; confirm. + */ + uint32_t hash; + /** + * Pointer to the byte data of the string. Note that `char*` is the correct + * type only if the low two bits of the flags indicate that this is an ASCII + * or UTF-8 string, otherwise it is a pointer to 16- or 32-bit characters in + * native byte order. 
+ */ + const char * const nxcsptr; +#else const char * const nxcsptr; const unsigned int nxcslen; +#endif } @end diff --git a/Source/GSString.m b/Source/GSString.m index f5cc6877b..e97202a03 100644 --- a/Source/GSString.m +++ b/Source/GSString.m @@ -298,6 +298,32 @@ nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n) static BOOL literalIsEqualInternal(NXConstantString *s, GSStr o) { +#ifdef GNUSTEP_NEW_STRING_ABI /* v2 ABI: compare both strings as unichars, one fixed-size chunk at a time */ + if (s->nxcslen != o->_count) /* counts differ, so the strings are reported as unequal */ + { + return NO; + } + size_t end = s->nxcslen; + static const int buffer_size = 64; /* unichars fetched and compared per iteration */ + unichar buffer1[buffer_size]; + unichar buffer2[buffer_size]; + NSRange r = { 0, buffer_size }; + do + { + if (r.location + r.length > end) + { + r.length = s->nxcslen - r.location; /* clamp the last chunk to what remains (0 for an empty string) */ + } + [s getCharacters: buffer1 range: r]; + [o getCharacters: buffer2 range: r]; + if (memcmp(buffer1, buffer2, r.length * sizeof(unichar)) != 0) + { + return NO; + } + r.location += buffer_size; /* step to the start of the next chunk */ + } while (r.location < end); + return YES; +#else unsigned len = o->_count; /* Since UTF-8 is a multibyte character set, it must have at least @@ -450,6 +476,7 @@ literalIsEqualInternal(NXConstantString *s, GSStr o) } return YES; } +#endif } @@ -5664,6 +5691,13 @@ literalIsEqual(NXConstantString *self, id anObject) return NO; } +#ifdef GNUSTEP_NEW_STRING_ABI +# define CONSTANT_STRING_ENCODING() (flags & 3) /* low two bits of the flags ivar: encoding selector */ +# define CONSTANT_STRING_HAS_HASH() ((flags & (1<<16)) == (1<<16)) /* tests bit 16 of flags */ +# define CONSTANT_STRING_SET_HAS_HASH() do { flags |= (1<<16); } while(0) /* sets bit 16 of flags */ +#endif + + /** *

The NXConstantString class is used by the compiler for constant * strings, as such its ivar layout is determined by the compiler @@ -5682,11 +5716,52 @@ literalIsEqual(NXConstantString *self, id anObject) - (const char*) UTF8String { +#ifdef GNUSTEP_NEW_STRING_ABI + switch (CONSTANT_STRING_ENCODING()) // flags & 3, so only 0..3 can occur + { + case 0: // ASCII + case 1: // UTF-8 + return nxcsptr; // already UTF-8 compatible bytes, returned without copying + case 2: // UTF-16 + { + unsigned int l = 0; + unsigned char *r = 0; + + if (GSFromUnicode(&r, &l, (const unichar*)(void*)nxcsptr, nxcslen, NSUTF8StringEncoding, + NSDefaultMallocZone(), GSUniTerminate|GSUniTemporary|GSUniStrict) == NO) + { + [NSException raise: NSCharacterConversionException + format: @"Can't get UTF8 from Unicode string."]; + } + return (const char*)r; + } + case 3: // UTF-32 (was 'case 4', which flags & 3 can never produce) + return [super UTF8String]; + } + GS_UNREACHABLE(); +#else return nxcsptr; +#endif } - (unichar) characterAtIndex: (NSUInteger)index { +#ifdef GNUSTEP_NEW_STRING_ABI + if (index >= nxcslen) + { + [NSException raise: NSInvalidArgumentException + format: @"-characterAtIndex: index out of range"]; + } + switch (CONSTANT_STRING_ENCODING()) + { + case 0: // ASCII + case 1: // UTF-8 NOTE(review): indexed by byte, so only right for single-byte sequences + return nxcsptr[index]; + case 2: // UTF-16 + return ((unichar*)(void*)nxcsptr)[index]; + } + GS_UNREACHABLE(); // NOTE(review): encoding 3 (UTF-32) has no case above and ends up here +#else NSUInteger l = 0; unichar u; unichar n = 0; @@ -5704,8 +5779,11 @@ literalIsEqual(NXConstantString *self, id anObject) [NSException raise: NSInvalidArgumentException format: @"-characterAtIndex: index out of range"]; return 0; +#endif } +#ifndef GNUSTEP_NEW_STRING_ABI + - (BOOL) canBeConvertedToEncoding: (NSStringEncoding)encoding { /* If the string contains bad (non-utf8) data, the lengthUTF8() function @@ -5809,6 +5887,8 @@ literalIsEqual(NXConstantString *self, id anObject) return [super dataUsingEncoding: encoding allowLossyConversion: flag]; } +#endif + - (void) dealloc { GSNOSUPERDEALLOC; @@ -5817,6 +5897,26 @@ literalIsEqual(NXConstantString *self, id anObject) - (void) getCharacters: (unichar*)buffer range: 
(NSRange)aRange { +#ifdef GNUSTEP_NEW_STRING_ABI + GS_RANGE_CHECK(aRange, nxcslen); + switch (CONSTANT_STRING_ENCODING()) + { + case 0: // ASCII + for (int i=0 ; i 0) { uint32_t s0 = 0; @@ -5960,6 +6072,7 @@ literalIsEqual(NXConstantString *self, id anObject) { return 0x0ffffffe; /* Hash for an empty string. */ } +#endif } - (id) initWithBytes: (const void*)bytes @@ -5981,6 +6094,16 @@ literalIsEqual(NXConstantString *self, id anObject) return nil; } +#ifdef GNUSTEP_NEW_STRING_ABI +- (NSUInteger) length +{ + // In the new encoding, nxcslen is always the length of the string in UTF-16 + // code units, so it can be returned directly whatever the storage encoding. + return nxcslen; +} + +#else + - (BOOL) isEqual: (id)anObject { return literalIsEqual(self, anObject); @@ -6129,6 +6252,7 @@ literalIsEqual(NXConstantString *self, id anObject) format: @"-rangeOfComposedCharacterSequenceAtIndex: index out of range"]; return NSMakeRange(NSNotFound, 0); } +#endif // GNUSTEP_NEW_STRING_ABI - (id) retain { @@ -6157,12 +6281,43 @@ literalIsEqual(NXConstantString *self, id anObject) - (NSStringEncoding) fastestEncoding { +#ifdef GNUSTEP_NEW_STRING_ABI + switch (CONSTANT_STRING_ENCODING()) // report the encoding the literal is stored in + { + case 0: // ASCII + return NSASCIIStringEncoding; + case 1: // UTF-8 + return NSUTF8StringEncoding; + case 2: // UTF-16 + return NSUTF16StringEncoding; + case 3: // UTF-32 + return NSUTF32StringEncoding; + } + GS_UNREACHABLE(); +#else return NSUTF8StringEncoding; +#endif } - (NSStringEncoding) smallestEncoding { +#ifdef GNUSTEP_NEW_STRING_ABI + // UTF-16 might not be the smallest encoding for UTF-16 strings, but for now + // we'll pretend that it is. + switch (CONSTANT_STRING_ENCODING()) + { + case 0: // ASCII + return NSASCIIStringEncoding; + case 1: // UTF-8 + return NSUTF8StringEncoding; + case 2: // UTF-16 + case 3: // UTF-32 (shares the UTF-16 answer below) + return NSUTF16StringEncoding; + } + GS_UNREACHABLE(); +#else return NSUTF8StringEncoding; +#endif } @end