Improved unicode support

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@17648 72102866-910b-0410-8b05-ffd578937521
2025-05-31 08:41:03 +00:00 · 2003-09-10 08:14:45 +00:00 · 2003-09-10 08:14:45 +00:00 · 68700f54ef
commit 68700f54ef
parent 2651bf6b19
2 changed files with 193 additions and 110 deletions
--- a/5
+++ b/5
@ -1,3 +1,8 @@
 2003-09-10  Pete French <pete@twisted.org.uk>
 	* Source/Additions/Unicode.m: patch to use UTF16 rather than
 	UCS2 internally.
 2003-09-10  Richard Frith-Macdonald <rfm@gnu.org>
 	* Source/NSCalendarDate.m:
--- a/Source/Additions/Unicode.m
+++ b/Source/Additions/Unicode.m
@ -67,11 +67,16 @@ typedef struct {unichar from; unsigned char to;} _ucc_;
 /*
 * The whole of the GNUstep code stores UNICODE in internal byte order,
- * so we do the same. This should be UCS-2-INTERNAL for libiconv
+ * so we do the same. We have switched to using UTF16 so the defines here
 * recognise this. We try the generic UTF16 first, followed by the endian
 * specifi versions. If not we try the original defines and then back to
 * UCS-2-INTERNAL.
 */
 #ifdef WORDS_BIGENDIAN
 #define UNICODE_UTF16 "UTF-16BE"
 #define UNICODE_INT "UNICODEBIG"
 #else
 #define UNICODE_UTF16 "UTF-16LE"
 #define UNICODE_INT "UNICODELITTLE"
 #endif
@ -84,6 +89,23 @@ static const char *
 internal_unicode_enc(void)
 {
  iconv_t conv;
  unicode_enc = "UTF-16";
  conv = iconv_open(unicode_enc, "ASCII");
  if (conv != (iconv_t)-1)
    {
      iconv_close(conv);
      return unicode_enc;
    }
  unicode_enc = UNICODE_UTF16;
  conv = iconv_open(unicode_enc, "ASCII");
  if (conv != (iconv_t)-1)
    {
      iconv_close(conv);
      return unicode_enc;
    }
  NSLog(@"Could not initialise iconv() for UTF16, using UCS-2");
  NSLog(@"Using characters outside 16 bits may give incorrect results");
  unicode_enc = UNICODE_INT;
  conv = iconv_open(unicode_enc, "ASCII");
  if (conv != (iconv_t)-1)
@ -805,6 +827,13 @@ uni_cop(unichar u)
 BOOL
 uni_isnonsp(unichar u)
 {
  /*
   * Treating upper surrogates as non-spacing is a convenient solution
   * to a number of issues with UTF-16
   */
  if ((u >= 0xdc00) && (u <= 0xdfff))
    return YES;
 // FIXME check is uni_cop good for this
  if (uni_cop(u))
    return YES;
@ -1053,96 +1082,89 @@ GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
 	{
 	  while (spos < slen)
 	    {
-	      unsigned char	c = src[spos++];
+	      unsigned char	c = src[spos];
-	      unichar		u = c;
+	      unsigned long	u = c;
 	      if (c > 0x7f)
                {
                  int i, sle = 0;
 		  /* calculated the expected sequence length */
                  while (c & 0x80)
                    {
                      c = c << 1;
                      sle++;
                    }
 		  /* legal ? */
 		  if ((sle < 2) || (sle > 6))
                    {
 	               result = NO;
 		       break;
 	            }
 		  /* do we have enough bytes ? */
 		  if ((spos + sle) > slen)
                    {
 	               result = NO;
 		       break;
 	            }
 		  /* get the codepoint */
 		  for (i = 1; i < sle; i++)
 		    {
 		      u = (u << 6) | (src[spos + i] & 0x3f);
 		    }
 	          u = u & ~(0xffffffff << ((5 * sle) + 1));
 		  spos += sle;
                }
              else
 		{
-		  unsigned char	c1;
+		  spos++;
 		  if (spos == slen)
 		    {
 		      result = NO;	// Second byte is missing.
 		      break;
 		    }
 		  if (c < 0xe0)
 		    {
 		      if (c < 0xc1)
 			{
 			  /*
 			   * Either we are inside a multibyte sequence or
 			   * we have a bad multibyte character count.
 			   */
 			  result = NO;
 			  break;
 			}
 		      c1 = src[spos++];
 		      if ((c1 ^ 0x80) >= 0x40)
 			{
 			  /*
 			   * Second byte in sequence is not a legal
 			   * continuation.
 			   */
 			  result = NO;
 			  break;
 			}
 		      u = ((c & 0x1f) << 6) | (c1 & 0x3f);
 		    }
 		  else if (c < 0xf0)
 		    {
 		      unsigned char	c1;
 		      unsigned char	c2;
 		      c1 = src[spos++];
 		      if (spos == slen)
 			{
 			  result = NO;	// Third byte is missing.
 			  break;
 			}
 		      c2 = src[spos++];
 		      if (((c1 ^ 0x80) >= 0x40) || ((c2 ^ 0x80) >= 0x40)
 			|| (c == 0xe0 && c1 == 0x80))
 			{
 			  result = NO;	// Invalid sequence.
 			  break;
 			}
 		      u = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6)
 			| (c2 & 0x3f);
 		      if (u >= 0xd800 && u <= 0xdfff)
 			{
 			  /*
 			   * Sequence not legal ... in utf-16 surrogates.
 			   */
 			  result = NO;
 			  break;
 			}
 		      if (u >= 0xfffe)
 			{
 			  /*
 			   * Sequence not legal ... in utf-16 surrogates.
 			   */
 			  result = NO;
 			  break;
 			}
 		    }
 		  else
 		    {
 		      /*
 		       * Sequence not legal or too long for conversion to
 		       * two byte unicode.
 		       */
 		      result = NO;
 		      break;
 		    }
 		}
 	      /*
 	       * Add codepoint as either a single unichar for BMP
 	       * or as a pair of surrogates for codepoints over 16 bits.
 	       * We also discard invalid codepoints here.
 	       */
 	      if ((u >= 0xd800) && (u <= 0xdfff))
                {
 	          result = NO;
 		  break;
 	        }
 	      if (u > 0x10ffff)
                {
 	          result = NO;
 		  break;
 	        }
 	      if (dpos >= bsize)
 		{
 		  GROW();
 		}
-	      ptr[dpos++] = u;
+	      if (u < 0x10000)
 	        {
 	          ptr[dpos++] = u;
 	        }
 	      else
 	        {
                  unichar ul, uh;
                  u -= 0x10000;
                  ul = u & 0x3ff;
                  uh = (u >> 10) & 0x3ff;
 	          ptr[dpos++] = uh + 0xd800;
 	          if (dpos >= bsize)
 		    {
 		      GROW();
 		    }
 	          ptr[dpos++] = ul + 0xdc00;
 	        }
 	    }
 	}
 	break;
@ -1607,47 +1629,103 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
 	{
 	  while (spos < slen)
 	    {
-	      unichar	u = src[spos++];
+	      unichar 		u1, u2;
-	      unsigned	multi;
+	      unsigned long	u;
 	      int		sl = 0;
 	      /* get first unichar */
 	      u1 = src[spos++];
 	      if (swapped == YES)
 		{
-		  u = ((u & 0xff00 >> 8) + ((u & 0x00ff) << 8));
+		  u1 = ((u1 & 0xff00 >> 8) + ((u1 & 0x00ff) << 8));
 		}
-	      if (u < 0x0080)
+	      /* possibly get second character and caculate 'u' */
 	      if ((u1 >= 0xd800) && (u1 < 0xdc00))
                {
 	  	  if (spos >= slen)
                    {
 		      result = NO;
 		      break;
                    }
 	          /* get second unichar */
 	          u2 = src[spos++];
 	          if (swapped == YES)
 		    {
 		      u2 = ((u1 & 0xff00 >> 8) + ((u1 & 0x00ff) << 8));
 		    }
 	          if ((u2 < 0xdc00) && (u2 > 0xdfff))
                    {
 		      result = NO;
 		      break;
                    }
                  /* make the full value */
 		  u = ((unsigned long)(u1 - 0xd800) * 0x400)
 		    + (u2 - 0xdc00) + 0x10000;
                }
              else
 		{
-		  multi = 0;
+		  u = u1;
 		}
 	      else if (u < 0x0800)
 		{
 		  multi = 1;
 		}
 	      else
 		{
 		  multi = 2;
 		}
-	      if (dpos + multi >= bsize)
+              /* calculate the sequence length */
              if (u <= 0x7f)
 		{
 		  sl = 1;
 		}
              else if (u <= 0x7ff)
 		{
 		  sl = 2;
 		}
              else if (u <= 0xffff)
 		{
 		  sl = 3;
 		}
              else if (u <= 0x1fffff)
 		{
 		  sl = 4;
 		}
              else if (u <= 0x3ffffff)
 		{
 		  sl = 5;
 		}
              else
 		{
 		  sl = 6;
 		}
              /* make sure we have enough space for it */
 	      while (dpos + sl >= bsize)
 		{
 		  GROW();
 		}
-	      if (u < 0x80)
+	      if (sl == 1)
-		{
+                {
-		  ptr[dpos++] = u;
+	          ptr[dpos++] = u & 0x7f;
-		}
+                }
-	      else if (u < 0x800)
+              else
-	        {
+                {
-		  ptr[dpos++] = (u >> 6) | 0xc0;
+                  unsigned	i;
-		  ptr[dpos++] = (u & 0x3f) | 0x80;
+                  unsigned char	reversed[8];
-		}
+
-	      else
+                  /* split value into reversed array */
-		{
+                  for (i = 0; i < sl; i++)
-		  ptr[dpos++] = (u >> 12) | 0xe0;
+                    {
-		  ptr[dpos++] = ((u >> 6) & 0x3f) | 0x80;
+                      reversed[i] = (u & 0x3f);
-		  ptr[dpos++] = (u & 0x3f) | 0x80;
+                      u = u >> 6;
-	        }
+                    }
 	          ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
                  /* add bytes into the output sequence */
                  for (i = sl - 2; i >= 0; i--)
 		    {
 		      ptr[dpos++] = reversed[i] | 0x80;
 		    }
                }
 	    }
        }
        break;