From 35bb9f48ef11c8b742039213ab22341954e0805c Mon Sep 17 00:00:00 2001 From: rfm Date: Sun, 12 May 2024 10:03:15 +0100 Subject: [PATCH] regular expression range search --- ChangeLog | 9 ++ Headers/Foundation/NSRegularExpression.h | 2 +- Headers/Foundation/NSString.h | 2 +- Source/NSRegularExpression.m | 24 +++-- Source/NSString.m | 132 +++++++++++++++-------- 5 files changed, 112 insertions(+), 57 deletions(-) diff --git a/ChangeLog b/ChangeLog index e81bf7550..6df9d8fef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2024-05-12 ethanc8R (github user) + + * Headers/Foundation/NSRegularExpression.h: + * Headers/Foundation/NSString.h: + * Source/NSRegularExpression.m: + * Source/NSString.m: + * Tests/base/NSString/enumerateSubstringsInRange.m: + Added regular expression search methods. + 2024-05-08 Hugo Melder * Source/NSIndexSet.m: diff --git a/Headers/Foundation/NSRegularExpression.h b/Headers/Foundation/NSRegularExpression.h index f0e9d495a..7f9a6ca9d 100644 --- a/Headers/Foundation/NSRegularExpression.h +++ b/Headers/Foundation/NSRegularExpression.h @@ -150,7 +150,7 @@ GS_EXPORT_CLASS offset: (NSInteger)offset template: (NSString*)templat; #if OS_API_VERSION(MAC_OS_X_VERSION_10_7, GS_API_LATEST) -+ (NSString *)escapedPatternForString:(NSString *)string; ++ (NSString *) escapedPatternForString: (NSString *)string; #endif #if GS_HAS_DECLARED_PROPERTIES @property (readonly) NSRegularExpressionOptions options; diff --git a/Headers/Foundation/NSString.h b/Headers/Foundation/NSString.h index e6362970c..339e892d7 100644 --- a/Headers/Foundation/NSString.h +++ b/Headers/Foundation/NSString.h @@ -527,7 +527,7 @@ GS_EXPORT_CLASS length: (NSUInteger)length; + (instancetype) stringWithCString: (const char*)byteString; + (instancetype) stringWithFormat: (NSString*)format, ... 
NS_FORMAT_FUNCTION(1,2); -+ (instancetype) stringWithContentsOfFile:(NSString *)path; ++ (instancetype) stringWithContentsOfFile: (NSString *)path; // Initializing Newly Allocated Strings - (instancetype) init; diff --git a/Source/NSRegularExpression.m b/Source/NSRegularExpression.m index b4afa473d..548bec106 100644 --- a/Source/NSRegularExpression.m +++ b/Source/NSRegularExpression.m @@ -640,11 +640,12 @@ prepareResult(NSRegularExpression *regex, { __block NSUInteger count = 0; + GSRegexBlock block; opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportCompletion; - GSRegexBlock block = + block = ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) { count++; @@ -660,12 +661,13 @@ prepareResult(NSRegularExpression *regex, options: (NSMatchingOptions)opts range: (NSRange)range { - __block NSTextCheckingResult *r = nil; + __block NSTextCheckingResult *r = nil; + GSRegexBlock block; opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportCompletion; - GSRegexBlock block = + block = ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) { r = result; @@ -683,11 +685,12 @@ prepareResult(NSRegularExpression *regex, range:(NSRange)range { NSMutableArray *array = [NSMutableArray array]; + GSRegexBlock block; opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportCompletion; - GSRegexBlock block = + block = ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) { [array addObject: result]; @@ -703,12 +706,13 @@ prepareResult(NSRegularExpression *regex, options: (NSMatchingOptions)opts range: (NSRange)range { - __block NSRange r = {NSNotFound, 0}; + __block NSRange r = {NSNotFound, 0}; + GSRegexBlock block; opts &= ~NSMatchingReportProgress; opts &= ~NSMatchingReportCompletion; - GSRegexBlock block = + block = ^(NSTextCheckingResult *result, NSMatchingFlags flags, BOOL *stop) { r = [result range]; @@ -1064,9 +1068,11 @@ prepareResult(NSRegularExpression *regex, } #endif -+ (NSString *)escapedPatternForString:(NSString 
*)string { - // https://unicode-org.github.io/icu/userguide/strings/regexp.html - // Need to escape * ? + [ ( ) { } ^ $ | \ . ++ (NSString*) escapedPatternForString: (NSString *)string +{ + /* https://unicode-org.github.io/icu/userguide/strings/regexp.html + * Need to escape * ? + [ ( ) { } ^ $ | \ . + */ return [[NSRegularExpression regularExpressionWithPattern: @"([*?+\\[(){}^$|\\\\.])" options: 0 diff --git a/Source/NSString.m b/Source/NSString.m index 970856a85..5ca6aa1d2 100644 --- a/Source/NSString.m +++ b/Source/NSString.m @@ -6294,29 +6294,47 @@ static NSFileManager *fm = nil; currentLocation = range.location; } - if (substringType == NSStringEnumerationByLines || substringType == NSStringEnumerationByParagraphs) + if (substringType == NSStringEnumerationByLines + || substringType == NSStringEnumerationByParagraphs) { BOOL isLineSep = substringType == NSStringEnumerationByLines; while (YES) { - // contains the index of the first character of the line containing the beginning of aRange. - NSUInteger start; - // contains the index of the first character past the terminator of the line containing the end of aRange. - NSUInteger end; - // contains the index of the first character of the terminator of the line containing the end of aRange. - NSUInteger contentsEnd; - NSRange currentLocationRange = NSMakeRange(currentLocation, 0); + /* Contains the index of the first character of the line + * containing the beginning of aRange. + */ + NSUInteger start; + + /* Contains the index of the first character past the + * terminator of the line containing the end of aRange. + */ + NSUInteger end; + + /* Contains the index of the first character of the terminator + * of the line containing the end of aRange. 
+ */ + NSUInteger contentsEnd; + NSRange currentLocationRange = NSMakeRange(currentLocation, 0); + NSUInteger substringStart; + NSRange substringRange; + [self _getStart: &start end: &end contentsEnd: &contentsEnd forRange: currentLocationRange lineSep: isLineSep]; - // If the enumerated range starts after the line/paragraph, we start at the beginning of the enumerated range - NSUInteger substringStart = start > range.location ? start : range.location; - NSRange substringRange = NSMakeRange(substringStart, contentsEnd - substringStart); + + /* If the enumerated range starts after the start of the line/paragraph, + * the substring starts at the beginning of the enumerated range. + */ + substringStart = start > range.location ? start : range.location; + substringRange + = NSMakeRange(substringStart, contentsEnd - substringStart); CALL_BLOCK(block, - substringNotRequired ? nil : [self substringWithRange: substringRange], + substringNotRequired + ? nil + : [self substringWithRange: substringRange], substringRange, NSMakeRange(start, end - start), &stop); @@ -6327,21 +6345,31 @@ static NSFileManager *fm = nil; } else if (substringType == NSStringEnumerationByComposedCharacterSequences) { - // We could also use rangeOfComposedCharacterSequenceAtIndex:, but then we would need different logic. + /* We could also use rangeOfComposedCharacterSequenceAtIndex:, + * but then we would need different logic. + */ while (YES) { - // Since all characters are in a composed character sequence, enclosingRange == substringRange - NSRange enclosingRange = [self rangeOfComposedCharacterSequenceAtIndex: currentLocation]; + NSRange enclosingRange; + + /* Since all characters are in a composed character sequence, + * enclosingRange == substringRange + */ + enclosingRange + = [self rangeOfComposedCharacterSequenceAtIndex: currentLocation]; CALL_BLOCK(block, - substringNotRequired ? nil : [self substringWithRange: enclosingRange], + substringNotRequired + ?
nil + : [self substringWithRange: enclosingRange], enclosingRange, enclosingRange, &stop); - if(stop) break; + if (stop) break; currentLocation = enclosingRange.location + enclosingRange.length; } } - else if (substringType == NSStringEnumerationByWords || substringType == NSStringEnumerationBySentences) + else if (substringType == NSStringEnumerationByWords + || substringType == NSStringEnumerationBySentences) { #if GS_USE_ICU // These macros may be useful elsewhere. @@ -6355,63 +6383,75 @@ static NSFileManager *fm = nil; errorCode = U_ZERO_ERROR; \ } while (NO) - BOOL byWords = substringType == NSStringEnumerationByWords; - NSUInteger length = range.length; - UChar characters[length]; + BOOL byWords = substringType == NSStringEnumerationByWords; + NSUInteger length = range.length; + UChar characters[length]; + UErrorCode errorCode = U_ZERO_ERROR; + const char *locale; + UBreakIterator *breakIterator; + [self getCharacters: characters range: range]; - UErrorCode errorCode = U_ZERO_ERROR; - const char* locale = localized - ? [[[[NSLocale currentLocale] - localeIdentifier] - // @ss=standard will use lists of common abbreviations, such as Mr., Mrs., etc. - stringByAppendingString: @"@ss=standard"] - UTF8String] + /* @ss=standard will use lists of common abbreviations, + * such as Mr., Mrs., etc. + */ + locale = localized + ? [[[[NSLocale currentLocale] localeIdentifier] + stringByAppendingString: @"@ss=standard"] UTF8String] : "en_US_POSIX"; - UBreakIterator* breakIterator = ubrk_open(byWords ? UBRK_WORD : UBRK_SENTENCE, // type - locale, // locale - characters, // text - length, // textLength - &errorCode); + breakIterator = ubrk_open( + byWords ? UBRK_WORD : UBRK_SENTENCE, // type + locale, // locale + characters, // text + length, // textLength + &errorCode); GS_U_HANDLE_ERROR(errorCode, @"opening ICU break iterator"); ubrk_first(breakIterator); while (YES) { // Make sure it's a valid substring. 
- BOOL isValidSubstring = YES; + BOOL isValidSubstring = YES; + int32_t nextPosition; + NSUInteger nextLocation; + NSRange enclosingRange; if (byWords) { int32_t ruleStatus = ubrk_getRuleStatus(breakIterator); - // From ICU User Guide: - // A status value UBRK_WORD_NONE indicates that the boundary does - // not start a word or number. - // However, valid words seem to be UBRK_WORD_NONE, and invalid words - // seem to be UBRK_WORD_NONE_LIMIT. + /* From ICU User Guide: + * A status value UBRK_WORD_NONE indicates that the boundary + * does not start a word or number. + * However, valid words seem to be UBRK_WORD_NONE, and invalid + * words seem to be UBRK_WORD_NONE_LIMIT. + */ isValidSubstring = ruleStatus != UBRK_WORD_NONE_LIMIT; - NSLog(@"Status for position %d (%d): %d", (int)currentLocation, (int)ubrk_current(breakIterator), (int) ruleStatus); +// NSLog(@"Status for position %d (%d): %d", (int)currentLocation, (int)ubrk_current(breakIterator), (int) ruleStatus); } - int32_t nextPosition = ubrk_next(breakIterator); + nextPosition = ubrk_next(breakIterator); if (nextPosition == UBRK_DONE) break; - NSUInteger nextLocation = range.location + nextPosition; + nextLocation = range.location + nextPosition; // Same as substringRange - NSRange enclosingRange = NSMakeRange(currentLocation, nextLocation - currentLocation); + enclosingRange + = NSMakeRange(currentLocation, nextLocation - currentLocation); if (isValidSubstring) { CALL_BLOCK(block, - substringNotRequired ? nil : [self substringWithRange: enclosingRange], + substringNotRequired + ? 
nil + : [self substringWithRange: enclosingRange], enclosingRange, enclosingRange, &stop); - if(stop) break; + if (stop) break; } currentLocation = nextLocation; } #else - NSWarnLog(@"NSStringEnumerationByWords and NSStringEnumerationBySentences are not supported when GNUstep-base is compiled without ICU."); + NSWarnLog(@"NSStringEnumerationByWords and NSStringEnumerationBySentences" + @" are not supported when GNUstep-base is compiled without ICU."); return; #endif }