Add some optimisation for converting to UTF-8

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@28334 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
rfm 2009-06-08 15:18:49 +00:00
parent 08c9289397
commit f074015e89
2 changed files with 226 additions and 99 deletions

View file

@ -1,3 +1,8 @@
2009-06-08 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/Unicode.m: Optimise somewhat for converting
from unicode (UTF-16) to UTF-8
2009-06-06 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/Unicode.m: Optimise case where we are converting

View file

@ -1786,127 +1786,249 @@ GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
{ {
case NSUTF8StringEncoding: case NSUTF8StringEncoding:
{ {
while (spos < slen) if (swapped == YES)
{ {
unichar u1, u2; while (spos < slen)
unsigned long u;
int sl = 0;
/* get first unichar */
u1 = src[spos++];
if (swapped == YES)
{ {
unichar u1, u2;
unsigned char reversed[8];
unsigned long u;
int sl;
int i;
/* get first unichar */
u1 = src[spos++];
u1 = (((u1 & 0xff00) >> 8) + ((u1 & 0x00ff) << 8)); u1 = (((u1 & 0xff00) >> 8) + ((u1 & 0x00ff) << 8));
}
// 0xfeff is a zero-width-no-break-space inside text (not a BOM).
if (u1 == 0xfffe // unexpected BOM
|| u1 == 0xffff // not a character
|| (u1 >= 0xfdd0 && u1 <= 0xfdef) // invalid character
|| (u1 >= 0xdc00 && u1 <= 0xdfff)) // bad pairing
{
if (strict)
{
result = NO;
goto done;
}
continue; // Skip invalid character.
}
/* possibly get second character and calculate 'u' */ /* Fast track ... if this is actually an ascii character
if ((u1 >= 0xd800) && (u1 < 0xdc00)) * it just converts straight to utf-8
{ */
if (spos >= slen) if (u1 <= 0x7f)
{ {
if (strict) if (dpos >= bsize)
{ {
result = NO; GROW();
goto done;
} }
continue; // At end. ptr[dpos++] = (unsigned char)u1;
} continue;
/* get second unichar */
u2 = src[spos++];
if (swapped == YES)
{
u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
} }
if ((u2 < 0xdc00) && (u2 > 0xdfff)) // 0xfeff is a zero-width-no-break-space inside text
{ if (u1 == 0xfffe // unexpected BOM
spos--; || u1 == 0xffff // not a character
|| (u1 >= 0xfdd0 && u1 <= 0xfdef) // invalid character
|| (u1 >= 0xdc00 && u1 <= 0xdfff)) // bad pairing
{
if (strict) if (strict)
{ {
result = NO; result = NO;
goto done; goto done;
} }
continue; // Skip bad half of surrogate pair. continue; // Skip invalid character.
} }
/* make the full value */ /* possibly get second character and calculate 'u' */
u = ((unsigned long)(u1 - 0xd800) * 0x400) if ((u1 >= 0xd800) && (u1 < 0xdc00))
+ (u2 - 0xdc00) + 0x10000; {
} if (spos >= slen)
else {
{ if (strict)
u = u1; {
} result = NO;
goto done;
}
continue; // At end.
}
/* calculate the sequence length */ /* get second unichar */
if (u <= 0x7f) u2 = src[spos++];
{ u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
sl = 1;
}
else if (u <= 0x7ff)
{
sl = 2;
}
else if (u <= 0xffff)
{
sl = 3;
}
else if (u <= 0x1fffff)
{
sl = 4;
}
else if (u <= 0x3ffffff)
{
sl = 5;
}
else
{
sl = 6;
}
/* make sure we have enough space for it */ if ((u2 < 0xdc00) && (u2 > 0xdfff))
while (dpos + sl >= bsize) {
{ spos--;
GROW(); if (strict)
} {
result = NO;
goto done;
}
continue; // Skip bad half of surrogate pair.
}
if (sl == 1) /* make the full value */
{ u = ((unsigned long)(u1 - 0xd800) * 0x400)
ptr[dpos++] = u & 0x7f; + (u2 - 0xdc00) + 0x10000;
} }
else else
{ {
int i; u = u1;
unsigned char reversed[8]; }
/* split value into reversed array */ /* calculate the sequence length
for (i = 0; i < sl; i++) * a length of 1 was dealt with earlier
{ */
reversed[i] = (u & 0x3f); if (u <= 0x7ff)
u = u >> 6; {
} sl = 2;
}
else if (u <= 0xffff)
{
sl = 3;
}
else if (u <= 0x1fffff)
{
sl = 4;
}
else if (u <= 0x3ffffff)
{
sl = 5;
}
else
{
sl = 6;
}
ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff); /* make sure we have enough space for it */
/* add bytes into the output sequence */ while (dpos + sl >= bsize)
for (i = sl - 2; i >= 0; i--) {
GROW();
}
/* split value into reversed array */
for (i = 0; i < sl; i++)
{
reversed[i] = (u & 0x3f);
u = u >> 6;
}
ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
/* add bytes into the output sequence */
for (i = sl - 2; i >= 0; i--)
{ {
ptr[dpos++] = reversed[i] | 0x80; ptr[dpos++] = reversed[i] | 0x80;
} }
} }
}
else
{
while (spos < slen)
{
unichar u1, u2;
unsigned char reversed[8];
unsigned long u;
int sl;
int i;
/* get first unichar */
u1 = src[spos++];
/* Fast track ... if this is actually an ascii character
* it just converts straight to utf-8
*/
if (u1 <= 0x7f)
{
if (dpos >= bsize)
{
GROW();
}
ptr[dpos++] = (unsigned char)u1;
continue;
}
// 0xfeff is a zero-width-no-break-space inside text
if (u1 == 0xfffe // unexpected BOM
|| u1 == 0xffff // not a character
|| (u1 >= 0xfdd0 && u1 <= 0xfdef) // invalid character
|| (u1 >= 0xdc00 && u1 <= 0xdfff)) // bad pairing
{
if (strict)
{
result = NO;
goto done;
}
continue; // Skip invalid character.
}
/* possibly get second character and calculate 'u' */
if ((u1 >= 0xd800) && (u1 < 0xdc00))
{
if (spos >= slen)
{
if (strict)
{
result = NO;
goto done;
}
continue; // At end.
}
/* get second unichar */
u2 = src[spos++];
if ((u2 < 0xdc00) && (u2 > 0xdfff))
{
spos--;
if (strict)
{
result = NO;
goto done;
}
continue; // Skip bad half of surrogate pair.
}
/* make the full value */
u = ((unsigned long)(u1 - 0xd800) * 0x400)
+ (u2 - 0xdc00) + 0x10000;
}
else
{
u = u1;
}
/* calculate the sequence length
* a length of 1 was dealt with earlier
*/
if (u <= 0x7ff)
{
sl = 2;
}
else if (u <= 0xffff)
{
sl = 3;
}
else if (u <= 0x1fffff)
{
sl = 4;
}
else if (u <= 0x3ffffff)
{
sl = 5;
}
else
{
sl = 6;
}
/* make sure we have enough space for it */
while (dpos + sl >= bsize)
{
GROW();
}
/* split value into reversed array */
for (i = 0; i < sl; i++)
{
reversed[i] = (u & 0x3f);
u = u >> 6;
}
ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
/* add bytes into the output sequence */
for (i = sl - 2; i >= 0; i--)
{
ptr[dpos++] = reversed[i] | 0x80;
}
}
} }
} }
break; break;