regular expression range search

This commit is contained in:
rfm 2024-05-12 10:03:15 +01:00
parent 1fdf6395bd
commit 35bb9f48ef
5 changed files with 112 additions and 57 deletions

View file

@ -1,3 +1,12 @@
2024-05-12 ethanc8R (github user)
* Headers/Foundation/NSRegularExpression.h:
* Headers/Foundation/NSString.h:
* Source/NSRegularExpression.m:
* Source/NSString.m:
* Tests/base/NSString/enumerateSubstringsInRange.m:
Added regular expression search methods.
2024-05-08 Hugo Melder <hugo@algoriddim.com>
* Source/NSIndexSet.m:

View file

@ -150,7 +150,7 @@ GS_EXPORT_CLASS
offset: (NSInteger)offset offset: (NSInteger)offset
template: (NSString*)templat; template: (NSString*)templat;
#if OS_API_VERSION(MAC_OS_X_VERSION_10_7, GS_API_LATEST) #if OS_API_VERSION(MAC_OS_X_VERSION_10_7, GS_API_LATEST)
+ (NSString *)escapedPatternForString:(NSString *)string; + (NSString *) escapedPatternForString: (NSString *)string;
#endif #endif
#if GS_HAS_DECLARED_PROPERTIES #if GS_HAS_DECLARED_PROPERTIES
@property (readonly) NSRegularExpressionOptions options; @property (readonly) NSRegularExpressionOptions options;

View file

@ -527,7 +527,7 @@ GS_EXPORT_CLASS
length: (NSUInteger)length; length: (NSUInteger)length;
+ (instancetype) stringWithCString: (const char*)byteString; + (instancetype) stringWithCString: (const char*)byteString;
+ (instancetype) stringWithFormat: (NSString*)format, ... NS_FORMAT_FUNCTION(1,2); + (instancetype) stringWithFormat: (NSString*)format, ... NS_FORMAT_FUNCTION(1,2);
+ (instancetype) stringWithContentsOfFile:(NSString *)path; + (instancetype) stringWithContentsOfFile: (NSString *)path;
// Initializing Newly Allocated Strings // Initializing Newly Allocated Strings
- (instancetype) init; - (instancetype) init;

View file

@ -640,11 +640,12 @@ prepareResult(NSRegularExpression *regex,
{ {
__block NSUInteger count = 0; __block NSUInteger count = 0;
GSRegexBlock block;
opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportProgress;
opts &= ~NSMatchingReportCompletion; opts &= ~NSMatchingReportCompletion;
GSRegexBlock block = block =
^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop)
{ {
count++; count++;
@ -660,12 +661,13 @@ prepareResult(NSRegularExpression *regex,
options: (NSMatchingOptions)opts options: (NSMatchingOptions)opts
range: (NSRange)range range: (NSRange)range
{ {
__block NSTextCheckingResult *r = nil; __block NSTextCheckingResult *r = nil;
GSRegexBlock block;
opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportProgress;
opts &= ~NSMatchingReportCompletion; opts &= ~NSMatchingReportCompletion;
GSRegexBlock block = block =
^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop)
{ {
r = result; r = result;
@ -683,11 +685,12 @@ prepareResult(NSRegularExpression *regex,
range:(NSRange)range range:(NSRange)range
{ {
NSMutableArray *array = [NSMutableArray array]; NSMutableArray *array = [NSMutableArray array];
GSRegexBlock block;
opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportProgress;
opts &= ~NSMatchingReportCompletion; opts &= ~NSMatchingReportCompletion;
GSRegexBlock block = block =
^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop)
{ {
[array addObject: result]; [array addObject: result];
@ -703,12 +706,13 @@ prepareResult(NSRegularExpression *regex,
options: (NSMatchingOptions)opts options: (NSMatchingOptions)opts
range: (NSRange)range range: (NSRange)range
{ {
__block NSRange r = {NSNotFound, 0}; __block NSRange r = {NSNotFound, 0};
GSRegexBlock block;
opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportProgress;
opts &= ~NSMatchingReportCompletion; opts &= ~NSMatchingReportCompletion;
GSRegexBlock block = block =
^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop)
{ {
r = [result range]; r = [result range];
@ -1064,9 +1068,11 @@ prepareResult(NSRegularExpression *regex,
} }
#endif #endif
+ (NSString *)escapedPatternForString:(NSString *)string { + (NSString*) escapedPatternForString: (NSString *)string
// https://unicode-org.github.io/icu/userguide/strings/regexp.html {
// Need to escape * ? + [ ( ) { } ^ $ | \ . /* https://unicode-org.github.io/icu/userguide/strings/regexp.html
* Need to escape * ? + [ ( ) { } ^ $ | \ .
*/
return [[NSRegularExpression return [[NSRegularExpression
regularExpressionWithPattern: @"([*?+\\[(){}^$|\\\\.])" regularExpressionWithPattern: @"([*?+\\[(){}^$|\\\\.])"
options: 0 options: 0

View file

@ -6294,29 +6294,47 @@ static NSFileManager *fm = nil;
currentLocation = range.location; currentLocation = range.location;
} }
if (substringType == NSStringEnumerationByLines || substringType == NSStringEnumerationByParagraphs) if (substringType == NSStringEnumerationByLines
|| substringType == NSStringEnumerationByParagraphs)
{ {
BOOL isLineSep = substringType == NSStringEnumerationByLines; BOOL isLineSep = substringType == NSStringEnumerationByLines;
while (YES) while (YES)
{ {
// contains the index of the first character of the line containing the beginning of aRange. /* Contains the index of the first character of the line
NSUInteger start; * containing the beginning of aRange.
// contains the index of the first character past the terminator of the line containing the end of aRange. */
NSUInteger end; NSUInteger start;
// contains the index of the first character of the terminator of the line containing the end of aRange.
NSUInteger contentsEnd; /* Contains the index of the first character past the
NSRange currentLocationRange = NSMakeRange(currentLocation, 0); * terminator of the line containing the end of aRange.
*/
NSUInteger end;
/* Contains the index of the first character of the terminator
* of the line containing the end of aRange.
*/
NSUInteger contentsEnd;
NSRange currentLocationRange = NSMakeRange(currentLocation, 0);
NSUInteger substringStart;
NSRange substringRange;
[self _getStart: &start [self _getStart: &start
end: &end end: &end
contentsEnd: &contentsEnd contentsEnd: &contentsEnd
forRange: currentLocationRange forRange: currentLocationRange
lineSep: isLineSep]; lineSep: isLineSep];
// If the enumerated range starts after the line/paragraph, we start at the beginning of the enumerated range
NSUInteger substringStart = start > range.location ? start : range.location; /* If the enumerated range starts after the line/paragraph,
NSRange substringRange = NSMakeRange(substringStart, contentsEnd - substringStart); * we start at the beginning of the enumerated range
*/
substringStart = start > range.location ? start : range.location;
substringRange
= NSMakeRange(substringStart, contentsEnd - substringStart);
CALL_BLOCK(block, CALL_BLOCK(block,
substringNotRequired ? nil : [self substringWithRange: substringRange], substringNotRequired
? nil
: [self substringWithRange: substringRange],
substringRange, substringRange,
NSMakeRange(start, end - start), NSMakeRange(start, end - start),
&stop); &stop);
@ -6327,21 +6345,31 @@ static NSFileManager *fm = nil;
} }
else if (substringType == NSStringEnumerationByComposedCharacterSequences) else if (substringType == NSStringEnumerationByComposedCharacterSequences)
{ {
// We could also use rangeOfComposedCharacterSequenceAtIndex:, but then we would need different logic. /* We could also use rangeOfComposedCharacterSequenceAtIndex:,
* but then we would need different logic.
*/
while (YES) while (YES)
{ {
// Since all characters are in a composed character sequence, enclosingRange == substringRange NSRange enclosingRange;
NSRange enclosingRange = [self rangeOfComposedCharacterSequenceAtIndex: currentLocation];
/* Since all characters are in a composed character sequence,
* enclosingRange == substringRange
*/
enclosingRange
= [self rangeOfComposedCharacterSequenceAtIndex: currentLocation];
CALL_BLOCK(block, CALL_BLOCK(block,
substringNotRequired ? nil : [self substringWithRange: enclosingRange], substringNotRequired
? nil
: [self substringWithRange: enclosingRange],
enclosingRange, enclosingRange,
enclosingRange, enclosingRange,
&stop); &stop);
if(stop) break; if (stop) break;
currentLocation = enclosingRange.location + enclosingRange.length; currentLocation = enclosingRange.location + enclosingRange.length;
} }
} }
else if (substringType == NSStringEnumerationByWords || substringType == NSStringEnumerationBySentences) else if (substringType == NSStringEnumerationByWords
|| substringType == NSStringEnumerationBySentences)
{ {
#if GS_USE_ICU #if GS_USE_ICU
// These macros may be useful elsewhere. // These macros may be useful elsewhere.
@ -6355,63 +6383,75 @@ static NSFileManager *fm = nil;
errorCode = U_ZERO_ERROR; \ errorCode = U_ZERO_ERROR; \
} while (NO) } while (NO)
BOOL byWords = substringType == NSStringEnumerationByWords; BOOL byWords = substringType == NSStringEnumerationByWords;
NSUInteger length = range.length; NSUInteger length = range.length;
UChar characters[length]; UChar characters[length];
UErrorCode errorCode = U_ZERO_ERROR;
const char *locale;
UBreakIterator *breakIterator;
[self getCharacters: characters range: range]; [self getCharacters: characters range: range];
UErrorCode errorCode = U_ZERO_ERROR; /* @ss=standard will use lists of common abbreviations,
const char* locale = localized * such as Mr., Mrs., etc.
? [[[[NSLocale currentLocale] */
localeIdentifier] locale = localized
// @ss=standard will use lists of common abbreviations, such as Mr., Mrs., etc. ? [[[[NSLocale currentLocale] localeIdentifier]
stringByAppendingString: @"@ss=standard"] stringByAppendingString: @"@ss=standard"] UTF8String]
UTF8String]
: "en_US_POSIX"; : "en_US_POSIX";
UBreakIterator* breakIterator = ubrk_open(byWords ? UBRK_WORD : UBRK_SENTENCE, // type breakIterator = ubrk_open(
locale, // locale byWords ? UBRK_WORD : UBRK_SENTENCE, // type
characters, // text locale, // locale
length, // textLength characters, // text
&errorCode); length, // textLength
&errorCode);
GS_U_HANDLE_ERROR(errorCode, @"opening ICU break iterator"); GS_U_HANDLE_ERROR(errorCode, @"opening ICU break iterator");
ubrk_first(breakIterator); ubrk_first(breakIterator);
while (YES) while (YES)
{ {
// Make sure it's a valid substring. // Make sure it's a valid substring.
BOOL isValidSubstring = YES; BOOL isValidSubstring = YES;
int32_t nextPosition;
NSUInteger nextLocation;
NSRange enclosingRange;
if (byWords) if (byWords)
{ {
int32_t ruleStatus = ubrk_getRuleStatus(breakIterator); int32_t ruleStatus = ubrk_getRuleStatus(breakIterator);
// From ICU User Guide: /* From ICU User Guide:
// A status value UBRK_WORD_NONE indicates that the boundary does * A status value UBRK_WORD_NONE indicates that the boundary
// not start a word or number. * does not start a word or number.
// However, valid words seem to be UBRK_WORD_NONE, and invalid words * However, valid words seem to be UBRK_WORD_NONE, and invalid
// seem to be UBRK_WORD_NONE_LIMIT. * words seem to be UBRK_WORD_NONE_LIMIT.
*/
isValidSubstring = ruleStatus != UBRK_WORD_NONE_LIMIT; isValidSubstring = ruleStatus != UBRK_WORD_NONE_LIMIT;
NSLog(@"Status for position %d (%d): %d", (int)currentLocation, (int)ubrk_current(breakIterator), (int) ruleStatus); // NSLog(@"Status for position %d (%d): %d", (int)currentLocation, (int)ubrk_current(breakIterator), (int) ruleStatus);
} }
int32_t nextPosition = ubrk_next(breakIterator); nextPosition = ubrk_next(breakIterator);
if (nextPosition == UBRK_DONE) break; if (nextPosition == UBRK_DONE) break;
NSUInteger nextLocation = range.location + nextPosition; nextLocation = range.location + nextPosition;
// Same as substringRange // Same as substringRange
NSRange enclosingRange = NSMakeRange(currentLocation, nextLocation - currentLocation); enclosingRange
= NSMakeRange(currentLocation, nextLocation - currentLocation);
if (isValidSubstring) if (isValidSubstring)
{ {
CALL_BLOCK(block, CALL_BLOCK(block,
substringNotRequired ? nil : [self substringWithRange: enclosingRange], substringNotRequired
? nil
: [self substringWithRange: enclosingRange],
enclosingRange, enclosingRange,
enclosingRange, enclosingRange,
&stop); &stop);
if(stop) break; if (stop) break;
} }
currentLocation = nextLocation; currentLocation = nextLocation;
} }
#else #else
NSWarnLog(@"NSStringEnumerationByWords and NSStringEnumerationBySentences are not supported when GNUstep-base is compiled without ICU."); NSWarnLog(@"NSStringEnumerationByWords and NSStringEnumerationBySentences"
@" are not supported when GNUstep-base is compiled without ICU.");
return; return;
#endif #endif
} }