From 3847c54f3f0d6faee0ea508f1632a23df6a8cf56 Mon Sep 17 00:00:00 2001
From: David Chisnall <David.Chisnall@microsoft.com>
Date: Thu, 27 Dec 2018 13:42:12 +0000
Subject: [PATCH] Add NSConstantString implementation for v2 ABI.

This is largely the version from the newabi branch, but with a few
cleanups made possible by other bug fixes in the GSString
implementations.
---
 Headers/Foundation/NSString.h |  41 +++++++++
 Source/GSString.m             | 155 ++++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+)
diff --git a/Headers/Foundation/NSString.h b/Headers/Foundation/NSString.h
index b13acd344..95f00c7a1 100644
--- a/Headers/Foundation/NSString.h
+++ b/Headers/Foundation/NSString.h
@@ -881,6 +881,12 @@ typedef NSUInteger NSStringEncodingConversionOptions;
 
 @end
 
+#ifdef __OBJC_GNUSTEP_RUNTIME_ABI__
+#  if __OBJC_GNUSTEP_RUNTIME_ABI__ >= 20
+#    define GNUSTEP_NEW_STRING_ABI /* selects the new NXConstantString ivars */
+#  endif /* __OBJC_GNUSTEP_RUNTIME_ABI__ >= 20 */
+#endif /* __OBJC_GNUSTEP_RUNTIME_ABI__ */
+
 /**
  * <p>The NXConstantString class is used to hold constant 8-bit character
  * string objects produced by the compiler where it sees @"..." in the
@@ -908,8 +914,43 @@ typedef NSUInteger NSStringEncodingConversionOptions;
 @interface NXConstantString : NSString
 {
 @public
+#ifdef GNUSTEP_NEW_STRING_ABI
+  /**
+   * Flags.  The low 16 bits are reserved for the compiler, the top 16 for use
+   * by the Foundation Framework (bit 16 records that hash has been cached).
+   * The compiler currently uses only the low 2 bits, to indicate the encoding
+   * of the string, with the following values:
+   *
+   * 0. ASCII (UTF-8 using only 7-bit characters)
+   * 1. UTF-8
+   * 2. UTF-16
+   * 3. UTF-32
+   */
+  uint32_t flags;
+  /**
+   * The number of characters (UTF-16 code units) in the string.
+   */
+  uint32_t nxcslen;
+  /**
+   * The number of bytes in the string.  For fixed-length encodings, this is a
+   * fixed multiple of nxcslen, but for UTF-8 it can be different.
+   */
+  uint32_t size;
+  /**
+   * Hash value, cached by -hash on first use (flag bit 16 marks it valid).
+   */
+  uint32_t hash;
+  /**
+   * Pointer to the byte data of the string.  Note that `char*` is the correct
+   * type only if the low two bits of the flags indicate that this is an ASCII
+   * or UTF-8 string, otherwise it is a pointer to 16- or 32-bit characters in
+   * native byte order.
+   */
+  const char * const nxcsptr;
+#else
   const char * const nxcsptr;
   const unsigned int nxcslen;
+#endif
 }
 @end
 
diff --git a/Source/GSString.m b/Source/GSString.m
index f5cc6877b..e97202a03 100644
--- a/Source/GSString.m
+++ b/Source/GSString.m
@@ -298,6 +298,32 @@ nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n)
 static BOOL
 literalIsEqualInternal(NXConstantString *s, GSStr o)
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  if (s->nxcslen != o->_count)
+    {
+      return NO;
+    }
+  size_t end = s->nxcslen;
+  static const int buffer_size = 64;
+  unichar buffer1[buffer_size];
+  unichar buffer2[buffer_size];
+  NSRange r = { 0, buffer_size };
+  do
+    {
+      if (r.location + r.length > end)
+	{
+	  r.length = s->nxcslen - r.location;
+	}
+      [s getCharacters: buffer1 range: r];
+      [o getCharacters: buffer2 range: r];
+      if (memcmp(buffer1, buffer2, r.length * sizeof(unichar)) != 0)
+	{
+	  return NO;
+	}
+      r.location += buffer_size;
+    } while (r.location < end);
+  return YES;
+#else
   unsigned	len = o->_count;
 
   /* Since UTF-8 is a multibyte character set, it must have at least
@@ -450,6 +476,7 @@ literalIsEqualInternal(NXConstantString *s, GSStr o)
 	}
       return YES;
     }
+#endif
 }
 
 
@@ -5664,6 +5691,13 @@ literalIsEqual(NXConstantString *self, id anObject)
   return NO;
 }
 
+/* Accessors for the flags ivar: bits 0-1 = encoding, bit 16 = hash cached. */
+#ifdef GNUSTEP_NEW_STRING_ABI
+#  define CONSTANT_STRING_ENCODING() (flags & 3)
+#  define CONSTANT_STRING_HAS_HASH() ((flags & (1<<16)) == (1<<16))
+#  define CONSTANT_STRING_SET_HAS_HASH() do { flags |= (1<<16); } while(0)
+#endif
+
 /**
  * <p>The NXConstantString class is used by the compiler for constant
  * strings, as such its ivar layout is determined by the compiler
@@ -5682,11 +5716,52 @@ literalIsEqual(NXConstantString *self, id anObject)
 
 - (const char*) UTF8String
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  switch (CONSTANT_STRING_ENCODING())	// low two bits of flags, so 0..3
+  {
+      case 0: // ASCII
+      case 1: // UTF-8
+	  return nxcsptr;	// data is already 8-bit: hand it back unconverted
+      case 2: // UTF-16
+	{
+	  unsigned int l = 0;
+	  unsigned char *r = 0;
+	  // Convert the nxcslen UTF-16 units to UTF-8; raise if that fails.
+	  if (GSFromUnicode(&r, &l, (const unichar*)(void*)nxcsptr, nxcslen, NSUTF8StringEncoding,
+	    NSDefaultMallocZone(), GSUniTerminate|GSUniTemporary|GSUniStrict) == NO)
+	    {
+	      [NSException raise: NSCharacterConversionException
+			  format: @"Can't get UTF8 from Unicode string."];
+	    }
+	  return (const char*)r;
+	}
+      case 3: // UTF-32 (the two-bit encoding field can never hold 4)
+	return [super UTF8String];	// no fast path: defer to the superclass
+  }
+  GS_UNREACHABLE();
+#else
   return nxcsptr;
+#endif
 }
 
 - (unichar) characterAtIndex: (NSUInteger)index
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  if (index >= nxcslen)
+    {
+      [NSException raise: NSInvalidArgumentException
+		  format: @"-characterAtIndex: index out of range"];
+    }
+  switch (CONSTANT_STRING_ENCODING())
+  {
+      case 0: // ASCII
+      case 1: // UTF-8
+	  return nxcsptr[index];
+      case 2: // UTF-16
+	  return ((unichar*)(void*)nxcsptr)[index];
+  }
+  GS_UNREACHABLE();
+#else
   NSUInteger	l = 0;
   unichar	u;
   unichar	n = 0;
@@ -5704,8 +5779,11 @@ literalIsEqual(NXConstantString *self, id anObject)
   [NSException raise: NSInvalidArgumentException
 	      format: @"-characterAtIndex: index out of range"];
   return 0;
+#endif
 }
 
+#ifndef GNUSTEP_NEW_STRING_ABI
+
 - (BOOL) canBeConvertedToEncoding: (NSStringEncoding)encoding
 {
   /* If the string contains bad (non-utf8) data, the lengthUTF8() function
@@ -5809,6 +5887,8 @@ literalIsEqual(NXConstantString *self, id anObject)
   return [super dataUsingEncoding: encoding allowLossyConversion: flag];
 }
 
+#endif
+
 - (void) dealloc
 {
   GSNOSUPERDEALLOC;
@@ -5817,6 +5897,26 @@ literalIsEqual(NXConstantString *self, id anObject)
 - (void) getCharacters: (unichar*)buffer
 		 range: (NSRange)aRange
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  GS_RANGE_CHECK(aRange, nxcslen);
+  switch (CONSTANT_STRING_ENCODING())
+  {
+      case 0: // ASCII
+	for (int i=0 ; i<aRange.length ; i++)
+	  {
+	    buffer[i] = (unichar)nxcsptr[aRange.location + i];
+	  }
+	return;
+      case 1: // UTF-8
+	NSAssert(0, @"UTF-8 constant strings not yet supported");
+      case 2: // UTF-16
+	memcpy(buffer, nxcsptr + (aRange.location * sizeof(unichar)), aRange.length * sizeof(unichar));
+	return;
+      case 3:
+	NSAssert(0, @"UTF-32 constant strings not yet supported");
+  }
+  GS_UNREACHABLE();
+#else
   unichar	n = 0;
   unsigned	i = 0;
   NSUInteger	max = NSMaxRange(aRange);
@@ -5846,8 +5946,12 @@ literalIsEqual(NXConstantString *self, id anObject)
 	@"in %s, range { %"PRIuPTR", %"PRIuPTR" } extends beyond string",
         GSNameFromSelector(_cmd), aRange.location, aRange.length];
     }
+#endif
 }
 
+// This method was deprecated in Mac OS X 10.5, so if we provide an improved
+// version for the new ABI we should implement its newer replacement instead.
+#ifndef GNUSTEP_NEW_STRING_ABI
 - (BOOL) getCString: (char*)buffer
 	  maxLength: (NSUInteger)maxLength
 	   encoding: (NSStringEncoding)encoding
@@ -5912,12 +6016,20 @@ literalIsEqual(NXConstantString *self, id anObject)
     }
   return [super getCString: buffer maxLength: maxLength encoding: encoding];
 }
+#endif
 
 /* Must match the implementation in NSString
  * To avoid allocating memory, we build the hash incrementally.
  */
 - (NSUInteger) hash
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  if (CONSTANT_STRING_HAS_HASH())
+    return hash;
+  hash = [super hash];
+  CONSTANT_STRING_SET_HAS_HASH();
+  return hash;
+#else
   if (nxcslen > 0)
     {
       uint32_t  s0 = 0;
@@ -5960,6 +6072,7 @@ literalIsEqual(NXConstantString *self, id anObject)
     {
       return 0x0ffffffe;	/* Hash for an empty string.	*/
     }
+#endif
 }
 
 - (id) initWithBytes: (const void*)bytes
@@ -5981,6 +6094,16 @@ literalIsEqual(NXConstantString *self, id anObject)
   return nil;
 }
 
+#ifdef GNUSTEP_NEW_STRING_ABI
+- (NSUInteger) length
+{
+  // In the new ABI, nxcslen is always the length of the string in UTF-16
+  // code units, whatever encoding the character data is stored in.
+  return nxcslen;
+}
+
+#else
+
 - (BOOL) isEqual: (id)anObject
 {
   return literalIsEqual(self, anObject);
@@ -6129,6 +6252,7 @@ literalIsEqual(NXConstantString *self, id anObject)
     format: @"-rangeOfComposedCharacterSequenceAtIndex: index out of range"];
   return NSMakeRange(NSNotFound, 0);
 }
+#endif // GNUSTEP_NEW_STRING_ABI
 
 - (id) retain
 {
@@ -6157,12 +6281,43 @@ literalIsEqual(NXConstantString *self, id anObject)
 
 - (NSStringEncoding) fastestEncoding
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  /* The character data is already stored in the encoding named by the
+   * low two bits of flags, so that encoding is reported as the fastest. */
+  const uint32_t encodingTag = CONSTANT_STRING_ENCODING();
+  if (encodingTag == 0)  // ASCII
+    return NSASCIIStringEncoding;
+  if (encodingTag == 1)  // UTF-8
+    return NSUTF8StringEncoding;
+  if (encodingTag == 2)  // UTF-16
+    return NSUTF16StringEncoding;
+  if (encodingTag == 3)  // UTF-32
+    return NSUTF32StringEncoding;
+  GS_UNREACHABLE();
+#else
   return NSUTF8StringEncoding;
+#endif
 }
 
 - (NSStringEncoding) smallestEncoding
 {
+#ifdef GNUSTEP_NEW_STRING_ABI
+  // Wide (UTF-16 and UTF-32) strings are both reported as UTF-16.  That may
+  // not really be the smallest encoding for them, but for now we pretend it is.
+  const uint32_t encodingTag = CONSTANT_STRING_ENCODING();
+
+  if (encodingTag == 0)  // ASCII
+    return NSASCIIStringEncoding;
+  if (encodingTag == 1)  // UTF-8
+    return NSUTF8StringEncoding;
+  if (encodingTag == 2)  // UTF-16
+    return NSUTF16StringEncoding;
+  if (encodingTag == 3)  // UTF-32
+    return NSUTF16StringEncoding;
+  GS_UNREACHABLE();
+#else
   return NSUTF8StringEncoding;
+#endif
 }
 
 @end