Implement optimisations suggested by Fred's vcachegrind profiling.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@34093 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
rfm 2011-10-31 08:12:26 +00:00
parent e3594c5773
commit 84a297282f
3 changed files with 444 additions and 200 deletions

View file

@ -1,3 +1,10 @@
2011-10-28 Richard Frith-Macdonald <rfm@gnu.org>
* Source/GSString.m: Implement more efficient equality tests for
utf-8 literal strings and strings in internal representation.
Implement rangeOfComposedCharacterSequenceAtIndex: for utf8 literal
strings.
2011-10-29 Fred Kiefer <FredKiefer@gmx.de>
* Source/GSeq.h: Replace unsigned with NSUInteger.

View file

@ -862,6 +862,7 @@ typedef NSUInteger NSStringEncodingConversionOptions;
*/
@interface NXConstantString : NSString
{
@public
const char * const nxcsptr;
const unsigned int nxcslen;
}

View file

@ -70,6 +70,227 @@ static BOOL isByteEncoding(NSStringEncoding enc)
struct objc_class _NSConstantStringClassReference;
#endif
/* Determine the length of the UTF-8 string as a unicode (UTF-16) string.
* sets the ascii flag according to the content found.
*/
static NSUInteger
lengthUTF8(const uint8_t *p, unsigned l, BOOL *ascii, BOOL *latin1)
{
const uint8_t *e = p + l;
BOOL a = YES;
BOOL l1 = YES;
l = 0;
while (p < e)
{
uint8_t c = *p;
uint32_t u = c;
if (c > 0x7f)
{
int i, sle = 0;
a = NO;
/* calculated the expected sequence length */
while (c & 0x80)
{
c = c << 1;
sle++;
}
/* legal ? */
if ((sle < 2) || (sle > 6))
{
[NSException raise: NSInternalInconsistencyException
format: @"Bad sequence length in constant string"];
}
if (p + sle > e)
{
[NSException raise: NSInternalInconsistencyException
format: @"Short data in constant string"];
}
/* get the codepoint */
for (i = 1; i < sle; i++)
{
if (p[i] < 0x80 || p[i] >= 0xc0)
break;
u = (u << 6) | (p[i] & 0x3f);
}
if (i < sle)
{
[NSException raise: NSInternalInconsistencyException
format: @"Codepoint out of range in constant string"];
}
u = u & ~(0xffffffff << ((5 * sle) + 1));
p += sle;
/*
* We check for invalid codepoints here.
*/
if (u > 0x10ffff || u == 0xfffe || u == 0xffff
|| (u >= 0xfdd0 && u <= 0xfdef))
{
[NSException raise: NSInternalInconsistencyException
format: @"Codepoint invalid in constant string"];
}
if ((u >= 0xd800) && (u <= 0xdfff))
{
[NSException raise: NSInternalInconsistencyException
format: @"Bad surrogate pair in constant string"];
}
}
else
{
p++;
}
/*
* Add codepoint as either a single unichar for BMP
* or as a pair of surrogates for codepoints over 16 bits.
*/
if (u < 0x10000)
{
l++;
if (u > 255)
{
l1 = NO;
}
}
else
{
l += 2;
}
}
if (0 != ascii)
{
*ascii = a;
}
if (0 != latin1)
{
*latin1 = l1;
}
return l;
}
/* Sequentially extracts characters from UTF-8 string
* p = pointer to the utf-8 data
* l = length (bytes) of the utf-8 data
* o = pointer to current offset within the data
* n = pointer to either zero or the next pre-read part of a surrogate pair.
* The condition for having read the entire string is that the offset (*o)
* is the number of bytes in the string, and the unichar pointed to by *n
* is zero (meaning there is no second part of a surrogate pair remaining).
*/
static inline unichar
nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n)
{
unsigned i;
/* If we still have the second part of a surrogate pair, return it.
*/
if (*n > 0)
{
unichar u = *n;
*n = 0;
return u;
}
if ((i = *o) < l)
{
uint8_t c = p[i];
uint32_t u = c;
if (c > 0x7f)
{
int j, sle = 0;
/* calculated the expected sequence length */
while (c & 0x80)
{
c = c << 1;
sle++;
}
/* legal ? */
if ((sle < 2) || (sle > 6))
{
[NSException raise: NSInvalidArgumentException
format: @"bad multibyte character length"];
}
if (sle + i > l)
{
[NSException raise: NSInvalidArgumentException
format: @"multibyte character extends beyond data"];
}
/* get the codepoint */
for (j = 1; j < sle; j++)
{
uint8_t b = p[i + j];
if (b < 0x80 || b >= 0xc0)
break;
u = (u << 6) | (b & 0x3f);
}
if (j < sle)
{
[NSException raise: NSInvalidArgumentException
format: @"bad data in multibyte character"];
}
u = u & ~(0xffffffff << ((5 * sle) + 1));
i += sle;
/*
* We discard invalid codepoints here.
*/
if (u > 0x10ffff || u == 0xfffe || u == 0xffff
|| (u >= 0xfdd0 && u <= 0xfdef))
{
[NSException raise: NSInvalidArgumentException
format: @"invalid unicode codepoint"];
}
if ((u >= 0xd800) && (u <= 0xdfff))
{
[NSException raise: NSInvalidArgumentException
format: @"unmatched half of surrogate pair"];
}
}
else
{
i++;
}
/*
* Add codepoint as either a single unichar for BMP
* or as a pair of surrogates for codepoints over 16 bits.
*/
if (u >= 0x10000)
{
unichar ul, uh;
u -= 0x10000;
ul = u & 0x3ff;
uh = (u >> 10) & 0x3ff;
*n = ul + 0xdc00; // record second part of pair
u = uh + 0xd800; // return first part.
}
*o = i; // Return new index
return (unichar)u;
}
[NSException raise: NSInvalidArgumentException
format: @"no more data in UTF-8 string"];
return 0;
}
/*
* GSPlaceholderString - placeholder class for objects awaiting intialisation.
@ -2127,6 +2348,43 @@ isEqual_c(GSStr self, id anObject)
return NO;
}
c = object_getClass(anObject);
if (c == NSConstantStringClass)
{
NXConstantString *other = (NXConstantString*)anObject;
if (internalEncoding == NSASCIIStringEncoding)
{
if (self->_count == other->nxcslen
&& 0 == memcmp(self->_contents.c, other->nxcsptr, other->nxcslen))
{
return YES;
}
return NO;
}
if (internalEncoding == NSISOLatin1StringEncoding)
{
NSUInteger pos = 0;
unichar n = 0;
unsigned i = 0;
unichar u;
while (i < other->nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)other->nxcsptr, other->nxcslen,
&i, &n);
if (pos >= self->_count || (unichar)self->_contents.c[pos] != u)
{
return NO;
}
pos++;
}
if (pos != self->_count)
{
return NO;
}
return YES;
}
}
if (c == GSMutableStringClass || GSObjCIsKindOf(c, GSStringClass) == YES)
{
GSStr other = (GSStr)anObject;
@ -2186,6 +2444,29 @@ isEqual_u(GSStr self, id anObject)
return NO;
}
c = object_getClass(anObject);
if (c == NSConstantStringClass)
{
NXConstantString *other = (NXConstantString*)anObject;
NSUInteger pos = 0;
unichar n = 0;
unsigned i = 0;
unichar u;
while (i < other->nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)other->nxcsptr, other->nxcslen, &i, &n);
if (pos >= self->_count || self->_contents.u[pos] != u)
{
return NO;
}
pos++;
}
if (pos != self->_count)
{
return NO;
}
return YES;
}
if (c == GSMutableStringClass || GSObjCIsKindOf(c, GSStringClass) == YES)
{
GSStr other = (GSStr)anObject;
@ -4727,225 +5008,126 @@ NSAssert(_flags.owned == 1 && _zone != 0, NSInternalInconsistencyException);
/* Determine the length of the UTF-8 string as a unicode (UTF-16) string.
* sets the ascii flag according to the content found.
*/
static NSUInteger
lengthUTF8(const uint8_t *p, unsigned l, BOOL *ascii, BOOL *latin1)
static BOOL
literalIsEqual(NXConstantString *self, id anObject)
{
const uint8_t *e = p + l;
BOOL a = YES;
BOOL l1 = YES;
Class c;
l = 0;
while (p < e)
if (anObject == (id)self)
{
uint8_t c = *p;
uint32_t u = c;
return YES;
}
if (anObject == nil)
{
return NO;
}
if (GSObjCIsInstance(anObject) == NO)
{
return NO;
}
c = object_getClass(anObject);
if (c == NSConstantStringClass)
{
NXConstantString *other = (NXConstantString*)anObject;
if (c > 0x7f)
if (other->nxcslen != self->nxcslen
|| strcmp(other->nxcsptr, self->nxcsptr) != 0)
{
int i, sle = 0;
return NO;
}
return YES;
}
else if (YES == [anObject isKindOfClass: NSStringClass]) // may be proxy
{
unichar (*imp)(id, SEL, NSUInteger);
NSUInteger len = [anObject length];
NSUInteger pos = 0;
unichar n = 0;
unsigned i = 0;
unichar u;
a = NO;
/* calculated the expected sequence length */
while (c & 0x80)
{
c = c << 1;
sle++;
}
/* legal ? */
if ((sle < 2) || (sle > 6))
{
[NSException raise: NSInternalInconsistencyException
format: @"Bad sequence length in constant string"];
}
if (p + sle > e)
{
[NSException raise: NSInternalInconsistencyException
format: @"Short data in constant string"];
}
/* get the codepoint */
for (i = 1; i < sle; i++)
{
if (p[i] < 0x80 || p[i] >= 0xc0)
break;
u = (u << 6) | (p[i] & 0x3f);
}
if (i < sle)
{
[NSException raise: NSInternalInconsistencyException
format: @"Codepoint out of range in constant string"];
}
u = u & ~(0xffffffff << ((5 * sle) + 1));
p += sle;
/*
* We check for invalid codepoints here.
if (len < self->nxcslen)
{
/* Since UTF-8 is a multibyte character set, it must have at least
* as many bytes as another string of the same length. So if the
* other is shorter, the two cannot be equal.
*/
if (u > 0x10ffff || u == 0xfffe || u == 0xffff
|| (u >= 0xfdd0 && u <= 0xfdef))
{
[NSException raise: NSInternalInconsistencyException
format: @"Codepoint invalid in constant string"];
}
if ((u >= 0xd800) && (u <= 0xdfff))
{
[NSException raise: NSInternalInconsistencyException
format: @"Bad surrogate pair in constant string"];
}
}
else
{
p++;
return NO;
}
/*
* Add codepoint as either a single unichar for BMP
* or as a pair of surrogates for codepoints over 16 bits.
/* Try to do a fast comparison for well known scring classes.
*/
if (u < 0x10000)
if (c == GSMutableStringClass || GSObjCIsKindOf(c, GSStringClass) == YES)
{
l++;
if (u > 255)
GSStr o = (GSStr)anObject;
if (o->_flags.wide == 1)
{
l1 = NO;
/* Other string is a unichar buffer
*/
while (i < self->nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)self->nxcsptr,
self->nxcslen, &i, &n);
if (pos >= len || o->_contents.u[pos] != u)
{
return NO;
}
pos++;
}
if (pos != len)
{
return NO;
}
return YES;
}
else if (internalEncoding == NSISOLatin1StringEncoding
|| internalEncoding == NSASCIIStringEncoding)
{
/* Other string is a buffer containing ascii or latin1 so
* we can compare buffer contents with unichar values directly.
*/
while (i < self->nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)self->nxcsptr,
self->nxcslen, &i, &n);
if (pos >= len || (unichar)o->_contents.c[pos] != u)
{
return NO;
}
pos++;
}
if (pos != len)
{
return NO;
}
return YES;
}
}
else
{
l += 2;
}
}
if (0 != ascii)
{
*ascii = a;
}
if (0 != latin1)
{
*latin1 = l1;
}
return l;
}
/* Sequentially extracts characters from UTF-8 string
* p = pointer to the utf-8 data
* l = length (bytes) of the utf-8 data
* o = pointer to current offset within the data
* n = pointer to either zero or the next pre-read part of a surrogate pair.
* The condition for having read the entire string is that the offset (*o)
* is the number of bytes in the string, and the unichar pointed to by *n
* is zero (meaning there is no second part of a surrogate pair remaining).
*/
static inline unichar
nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n)
{
unsigned i;
/* If we still have the second part of a surrogate pair, return it.
*/
if (*n > 0)
{
unichar u = *n;
*n = 0;
return u;
}
if ((i = *o) < l)
{
uint8_t c = p[i];
uint32_t u = c;
if (c > 0x7f)
{
int j, sle = 0;
/* calculated the expected sequence length */
while (c & 0x80)
{
c = c << 1;
sle++;
}
/* legal ? */
if ((sle < 2) || (sle > 6))
{
[NSException raise: NSInvalidArgumentException
format: @"bad multibyte character length"];
}
if (sle + i > l)
{
[NSException raise: NSInvalidArgumentException
format: @"multibyte character extends beyond data"];
}
/* get the codepoint */
for (j = 1; j < sle; j++)
{
uint8_t b = p[i + j];
if (b < 0x80 || b >= 0xc0)
break;
u = (u << 6) | (b & 0x3f);
}
if (j < sle)
{
[NSException raise: NSInvalidArgumentException
format: @"bad data in multibyte character"];
}
u = u & ~(0xffffffff << ((5 * sle) + 1));
i += sle;
/*
* We discard invalid codepoints here.
*/
if (u > 0x10ffff || u == 0xfffe || u == 0xffff
|| (u >= 0xfdd0 && u <= 0xfdef))
{
[NSException raise: NSInvalidArgumentException
format: @"invalid unicode codepoint"];
}
if ((u >= 0xd800) && (u <= 0xdfff))
{
[NSException raise: NSInvalidArgumentException
format: @"unmatched half of surrogate pair"];
}
}
else
{
i++;
}
/*
* Add codepoint as either a single unichar for BMP
* or as a pair of surrogates for codepoints over 16 bits.
/* Fall through to do a character by character comparison
* using characterAtIndex:
*/
if (u >= 0x10000)
imp = (unichar(*)(id,SEL,NSUInteger))[anObject methodForSelector:
@selector(characterAtIndex:)];
while (i < self->nxcslen || n > 0)
{
unichar ul, uh;
u -= 0x10000;
ul = u & 0x3ff;
uh = (u >> 10) & 0x3ff;
*n = ul + 0xdc00; // record second part of pair
u = uh + 0xd800; // return first part.
u = nextUTF8((const uint8_t *)self->nxcsptr,
self->nxcslen, &i, &n);
if (pos >= len
|| (*imp)(anObject, @selector(characterAtIndex:), pos) != u)
{
return NO;
}
pos++;
}
*o = i; // Return new index
return (unichar)u;
if (pos != len)
{
return NO;
}
return YES;
}
[NSException raise: NSInvalidArgumentException
format: @"no more data in UTF-8 string"];
return 0;
return NO;
}
/**
@ -5164,6 +5346,16 @@ nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n)
return nil;
}
- (BOOL) isEqual: (id)anObject
{
return literalIsEqual(self, anObject);
}
- (BOOL) isEqualToString: (NSString*)other
{
return literalIsEqual(self, other);
}
- (NSUInteger) length
{
return lengthUTF8((const uint8_t*)nxcsptr, nxcslen, 0, 0);
@ -5235,6 +5427,50 @@ nextUTF8(const uint8_t *p, unsigned l, unsigned *o, unichar *n)
return range;
}
- (NSRange) rangeOfComposedCharacterSequenceAtIndex: (NSUInteger)anIndex
{
NSUInteger start = 0;
NSUInteger pos = 0;
unichar n = 0;
unsigned i = 0;
unichar u;
/* A composed character sequence consists of a single base character
* followed by zero or more non-base characters.
*/
while (i < nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)nxcsptr, nxcslen, &i, &n);
if (!uni_isnonsp(u))
{
/* This may be the base character at the start of the sequence.
*/
start = pos;
}
if (pos++ == anIndex)
{
/* Look ahead to see if the character at the specified index is
* followed by one or more non-base characters. If it is, we
* make the range longer before returning it.
*/
while (i < nxcslen || n > 0)
{
u = nextUTF8((const uint8_t *)nxcsptr, nxcslen, &i, &n);
if (!uni_isnonsp(u))
{
break;
}
pos++;
}
return NSMakeRange(start, pos - start);
}
}
[NSException raise: NSInvalidArgumentException
format: @"-rangeOfComposedCharacterSequenceAtIndex: index out of range"];
return NSMakeRange(NSNotFound, 0);
}
- (id) retain
{
return self;