Add some optimisation for converting to UTF-8

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@28334 72102866-910b-0410-8b05-ffd578937521
2025-05-31 16:50:58 +00:00 · 2009-06-08 15:18:49 +00:00 · 2009-06-08 15:18:49 +00:00 · f074015e89
commit f074015e89
parent 08c9289397
2 changed files with 226 additions and 99 deletions
--- a/5
+++ b/5
@ -1,3 +1,8 @@
 2009-06-08 Richard Frith-Macdonald <rfm@gnu.org>
 	* Source/Additions/Unicode.m: Optimise somewhat for converting
 	from unicode (UTF-16) to UTF-8
 2009-06-06 Richard Frith-Macdonald <rfm@gnu.org>
 	* Source/Additions/Unicode.m: Optimise case where we are converting
--- a/Source/Additions/Unicode.m
+++ b/Source/Additions/Unicode.m
@ -1786,127 +1786,249 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
    {
      case NSUTF8StringEncoding:
 	{
-	  while (spos < slen)
+	  if (swapped == YES)
 	    {
-	      unichar 		u1, u2;
+	      while (spos < slen)
 	      unsigned long	u;
 	      int		sl = 0;
 	      /* get first unichar */
 	      u1 = src[spos++];
 	      if (swapped == YES)
 		{
 		  unichar 	u1, u2;
 		  unsigned char	reversed[8];
 		  unsigned long	u;
 		  int		sl;
 		  int		i;
 		  /* get first unichar */
 		  u1 = src[spos++];
 		  u1 = (((u1 & 0xff00) >> 8) + ((u1 & 0x00ff) << 8));
 		}
 	      // 0xfeff is a zero-width-no-break-space inside text (not a BOM).
 	      if (u1 == 0xfffe				// unexpected BOM
 	        || u1 == 0xffff				// not a character
 		|| (u1 >= 0xfdd0 && u1 <= 0xfdef)	// invalid character
 		|| (u1 >= 0xdc00 && u1 <= 0xdfff))	// bad pairing
 	        {
 		  if (strict)
 		    {
 		      result = NO;
 		      goto done;
                    }
 		  continue;	// Skip invalid character.
 	        }
-	      /* possibly get second character and calculate 'u' */
+		  /* Fast track ... if this is actually an ascii character
-	      if ((u1 >= 0xd800) && (u1 < 0xdc00))
+		   * it just converts straight to utf-8
-                {
+		   */
-	  	  if (spos >= slen)
+		  if (u1 <= 0x7f)
-                    {
+		    {
-		      if (strict)
+		      if (dpos >= bsize)
 			{
-			  result = NO;
+			  GROW();
 			  goto done;
 			}
-		      continue;	// At end.
+		      ptr[dpos++] = (unsigned char)u1;
-                    }
+		      continue;
 	          /* get second unichar */
 	          u2 = src[spos++];
 	          if (swapped == YES)
 		    {
 		      u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
 		    }
-	          if ((u2 < 0xdc00) && (u2 > 0xdfff))
+		  // 0xfeff is a zero-width-no-break-space inside text
-                    {
+		  if (u1 == 0xfffe			// unexpected BOM
-		      spos--;
+		    || u1 == 0xffff			// not a character
 		    || (u1 >= 0xfdd0 && u1 <= 0xfdef)	// invalid character
 		    || (u1 >= 0xdc00 && u1 <= 0xdfff))	// bad pairing
 		    {
 		      if (strict)
 			{
 			  result = NO;
 			  goto done;
 			}
-		      continue;		// Skip bad half of surrogate pair.
+		      continue;	// Skip invalid character.
-                    }
+		    }
-                  /* make the full value */
+		  /* possibly get second character and calculate 'u' */
-		  u = ((unsigned long)(u1 - 0xd800) * 0x400)
+		  if ((u1 >= 0xd800) && (u1 < 0xdc00))
-		    + (u2 - 0xdc00) + 0x10000;
+		    {
-                }
+		      if (spos >= slen)
-              else
+			{
-		{
+			  if (strict)
-		  u = u1;
+			    {
-		}
+			      result = NO;
 			      goto done;
 			    }
 			  continue;	// At end.
 			}
-              /* calculate the sequence length */
+		      /* get second unichar */
-              if (u <= 0x7f)
+		      u2 = src[spos++];
-		{
+		      u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
 		  sl = 1;
 		}
              else if (u <= 0x7ff)
 		{
 		  sl = 2;
 		}
              else if (u <= 0xffff)
 		{
 		  sl = 3;
 		}
              else if (u <= 0x1fffff)
 		{
 		  sl = 4;
 		}
              else if (u <= 0x3ffffff)
 		{
 		  sl = 5;
 		}
              else
 		{
 		  sl = 6;
 		}
-              /* make sure we have enough space for it */
+		      if ((u2 < 0xdc00) && (u2 > 0xdfff))
-	      while (dpos + sl >= bsize)
+			{
-		{
+			  spos--;
-		  GROW();
+			  if (strict)
-		}
+			    {
 			      result = NO;
 			      goto done;
 			    }
 			  continue;	// Skip bad half of surrogate pair.
 			}
-	      if (sl == 1)
+		      /* make the full value */
-                {
+		      u = ((unsigned long)(u1 - 0xd800) * 0x400)
-	          ptr[dpos++] = u & 0x7f;
+			+ (u2 - 0xdc00) + 0x10000;
-                }
+		    }
-              else
+		  else
-                {
+		    {
-                  int		i;
+		      u = u1;
-                  unsigned char	reversed[8];
+		    }
-                  /* split value into reversed array */
+		  /* calculate the sequence length
-                  for (i = 0; i < sl; i++)
+		   * a length of 1 was dealt with earlier
-                    {
+		   */
-                      reversed[i] = (u & 0x3f);
+		  if (u <= 0x7ff)
-                      u = u >> 6;
+		    {
-                    }
+		      sl = 2;
 		    }
 		  else if (u <= 0xffff)
 		    {
 		      sl = 3;
 		    }
 		  else if (u <= 0x1fffff)
 		    {
 		      sl = 4;
 		    }
 		  else if (u <= 0x3ffffff)
 		    {
 		      sl = 5;
 		    }
 		  else
 		    {
 		      sl = 6;
 		    }
-	          ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
+		  /* make sure we have enough space for it */
-                  /* add bytes into the output sequence */
+		  while (dpos + sl >= bsize)
-                  for (i = sl - 2; i >= 0; i--)
+		    {
 		      GROW();
 		    }
 		  /* split value into reversed array */
 		  for (i = 0; i < sl; i++)
 		    {
 		      reversed[i] = (u & 0x3f);
 		      u = u >> 6;
 		    }
 		  ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
 		  /* add bytes into the output sequence */
 		  for (i = sl - 2; i >= 0; i--)
 		    {
 		      ptr[dpos++] = reversed[i] | 0x80;
 		    }
-                }
+		}
 	    }
 	  else
 	    {
 	      while (spos < slen)
 		{
 		  unichar 	u1, u2;
 		  unsigned char	reversed[8];
 		  unsigned long	u;
 		  int		sl;
 		  int		i;
 		  /* get first unichar */
 		  u1 = src[spos++];
 		  /* Fast track ... if this is actually an ascii character
 		   * it just converts straight to utf-8
 		   */
 		  if (u1 <= 0x7f)
 		    {
 		      if (dpos >= bsize)
 			{
 			  GROW();
 			}
 		      ptr[dpos++] = (unsigned char)u1;
 		      continue;
 		    }
 		  // 0xfeff is a zero-width-no-break-space inside text
 		  if (u1 == 0xfffe			// unexpected BOM
 		    || u1 == 0xffff			// not a character
 		    || (u1 >= 0xfdd0 && u1 <= 0xfdef)	// invalid character
 		    || (u1 >= 0xdc00 && u1 <= 0xdfff))	// bad pairing
 		    {
 		      if (strict)
 			{
 			  result = NO;
 			  goto done;
 			}
 		      continue;	// Skip invalid character.
 		    }
 		  /* possibly get second character and calculate 'u' */
 		  if ((u1 >= 0xd800) && (u1 < 0xdc00))
 		    {
 		      if (spos >= slen)
 			{
 			  if (strict)
 			    {
 			      result = NO;
 			      goto done;
 			    }
 			  continue;	// At end.
 			}
 		      /* get second unichar */
 		      u2 = src[spos++];
 		      if ((u2 < 0xdc00) && (u2 > 0xdfff))
 			{
 			  spos--;
 			  if (strict)
 			    {
 			      result = NO;
 			      goto done;
 			    }
 			  continue;	// Skip bad half of surrogate pair.
 			}
 		      /* make the full value */
 		      u = ((unsigned long)(u1 - 0xd800) * 0x400)
 			+ (u2 - 0xdc00) + 0x10000;
 		    }
 		  else
 		    {
 		      u = u1;
 		    }
 		  /* calculate the sequence length
 		   * a length of 1 was dealt with earlier
 		   */
 		  if (u <= 0x7ff)
 		    {
 		      sl = 2;
 		    }
 		  else if (u <= 0xffff)
 		    {
 		      sl = 3;
 		    }
 		  else if (u <= 0x1fffff)
 		    {
 		      sl = 4;
 		    }
 		  else if (u <= 0x3ffffff)
 		    {
 		      sl = 5;
 		    }
 		  else
 		    {
 		      sl = 6;
 		    }
 		  /* make sure we have enough space for it */
 		  while (dpos + sl >= bsize)
 		    {
 		      GROW();
 		    }
 		  /* split value into reversed array */
 		  for (i = 0; i < sl; i++)
 		    {
 		      reversed[i] = (u & 0x3f);
 		      u = u >> 6;
 		    }
 		  ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
 		  /* add bytes into the output sequence */
 		  for (i = sl - 2; i >= 0; i--)
 		    {
 		      ptr[dpos++] = reversed[i] | 0x80;
 		    }
 		}
 	    }
        }
        break;