Minor unicode range handling improvements

This commit is contained in:
Richard Frith-Macdonald 2018-04-09 11:55:46 +01:00
parent 7bf1179f60
commit 7274cbaa55
4 changed files with 43 additions and 29 deletions

View file

@ -1,3 +1,13 @@
2018-04-09 Richard Frith-Macdonald <rfm@gnu.org>
* Source/Additions/Unicode.m:
* Source/NSString.m:
Move uni_isnonsp() to NSString.m and make it use nonBaseCharacterSet
so that it correctly copes with both surrogate pairs and traditional
composed character sequences. NB. David points out that this is not
full/correct unicode grapheme cluster handling (it's the main part
of the handling for 'legacy' grapheme clusters).
2018-04-04 Richard Frith-Macdonald <rfm@gnu.org>
* Headers/Foundation/NSLock.h:

View file

@ -652,22 +652,7 @@ uni_cop(unichar u)
return GSPrivateUniCop(u);
}
BOOL
uni_isnonsp(unichar u)
{
  /*
   * Code units in the range 0xdc00-0xdfff (surrogate halves in UTF-16)
   * are always reported as non-spacing; this is a convenient way of
   * side-stepping a number of UTF-16 issues.
   */
  if (u >= 0xdc00 && u <= 0xdfff)
    {
      return YES;
    }
  /*
   * Any other code unit counts as non-spacing exactly when the COP
   * lookup yields a non-zero value for it.
   */
  // FIXME check is uni_cop good for this
  return GSPrivateUniCop(u) ? YES : NO;
}
// uni_isnonsp(unichar u) now implemented in NSString.m
unichar*
uni_is_decomp(unichar u)

View file

@ -1,4 +1,7 @@
/* COP table */
/* COP table
* This records diacriticals and their combining class
* FIXME ... needs updating to latest unicode
*/
/*
Copyright (C) 2005 Free Software Foundation

View file

@ -151,7 +151,9 @@ static GSPlaceholderString *defaultPlaceholderString;
static NSMapTable *placeholderMap;
static NSLock *placeholderLock;
static SEL cMemberSel = 0;
static SEL cMemberSel = 0;
/* Character set consulted by uni_isnonsp(); the setup code later in this
 * file assigns it from [NSCharacterSet nonBaseCharacterSet].
 */
static NSCharacterSet *nonBase = nil;
/* Method implementation looked up on nonBase for the cMemberSel selector,
 * kept so that uni_isnonsp() can call it directly through the pointer.
 * NOTE(review): it is zero until that setup code has run - confirm that
 * nothing calls uni_isnonsp() earlier.
 */
static BOOL (*nonBaseImp)(id, SEL, unichar) = 0;
/* Macro to return the receiver if it is already immutable, but an
* autoreleased copy otherwise. Used where we have to return an
@ -196,6 +198,24 @@ static void setupWhitespace(void)
}
}
/* A non-spacing character is one which is part of a 'user-perceived character'
* where the user perceived character consists of a base character followed
* by a sequence of non-spacing characters. Non-spacing characters do not
* exist in isolation.
* eg. an accented 'a' might be represented as the 'a' followed by the accent.
*/
inline BOOL
uni_isnonsp(unichar u)
{
  /* Code units in the range 0xdc00-0xdfff (surrogate halves in UTF-16)
   * are always reported as non-spacing; this is a convenient way of
   * dealing with a number of UTF-16 issues.
   */
  BOOL	isSurrogateHalf = (u >= 0xdc00 && u <= 0xdfff) ? YES : NO;

  if (YES == isSurrogateHalf)
    {
      return YES;
    }
  /* For everything else the answer comes from the cached nonBase
   * character set, called through the cached implementation pointer.
   */
  return (*nonBaseImp)(nonBase, cMemberSel, u);
}
/*
* Include sequence handling code with instructions to generate search
* and compare functions for NSString objects.
@ -778,6 +798,11 @@ GSICUCollatorOpen(NSStringCompareOptions mask, NSLocale *locale)
gcrSel = @selector(getCharacters:range:);
ranSel = @selector(rangeOfComposedCharacterSequenceAtIndex:);
nonBase = [NSCharacterSet nonBaseCharacterSet];
nonBase = [NSObject leakAt: &nonBase];
nonBaseImp
= (BOOL(*)(id,SEL,unichar))[nonBase methodForSelector: cMemberSel];
_DefaultStringEncoding = GSPrivateDefaultCStringEncoding();
_ByteEncodingOk = GSPrivateIsByteEncoding(_DefaultStringEncoding);
@ -2764,9 +2789,6 @@ GSICUCollatorOpen(NSStringCompareOptions mask, NSLocale *locale)
*/
- (NSRange) rangeOfComposedCharacterSequenceAtIndex: (NSUInteger)anIndex
{
static NSCharacterSet *nonbase = nil;
static SEL nbSel;
static BOOL (*nbImp)(id, SEL, unichar) = 0;
unsigned start;
unsigned end;
unsigned length = [self length];
@ -2778,22 +2800,16 @@ static BOOL (*nbImp)(id, SEL, unichar) = 0;
caiImp = (unichar (*)(NSString*,SEL,NSUInteger))
[self methodForSelector: caiSel];
if (nil == nonbase)
{
nonbase = [[NSCharacterSet nonBaseCharacterSet] retain];
nbSel = @selector(characterIsMember:);
nbImp = (BOOL(*)(id,SEL,unichar))[nonbase methodForSelector: nbSel];
}
for (start = anIndex; start > 0; start--)
{
ch = (*caiImp)(self, caiSel, start);
if ((*nbImp)(nonbase, nbSel, ch) == NO)
if (uni_isnonsp(ch) == NO)
break;
}
for (end = start+1; end < length; end++)
{
ch = (*caiImp)(self, caiSel, end);
if ((*nbImp)(nonbase, nbSel, ch) == NO)
if (uni_isnonsp(ch) == NO)
break;
}