From 1b447ab2fb33f35d1393606e4e0f1197594c527c Mon Sep 17 00:00:00 2001 From: Richard Frith-Macdonald Date: Sun, 20 Jun 2021 15:56:58 +0100 Subject: [PATCH] Changes to support auto restart of hung processes --- ChangeLog | 15 +++ Control.plist | 17 ++- EcCommand.m | 341 +++++++++++++++++++++++++++++++++++++++++++------- EcProcess.m | 6 +- EcTest.m | 3 +- 5 files changed, 329 insertions(+), 53 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea4acfe..6cd7747 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2021-06-20 Richard Frith-Macdonald + + * EcCommand.m: + * EcProcess.m: + * EcTest.m: + * Control.plist: + Improve building when TLS stuff is not available. + Add code to allow a process launch configuration to contain a + HungTime setting for automatic restart of apparently hung processes. + Add HungTool configuration to specify a tool to be executed to + gather information about a hung process before it is restarted (if + not defined, the gcore tool is used). + Add Ping configuration to control how long a process needs to be + umresponsive to pings before it is considered hung. + 2021-06-18 Richard Frith-Macdonald * EcCommand.m: diff --git a/Control.plist b/Control.plist index 995e020..ce577c3 100644 --- a/Control.plist +++ b/Control.plist @@ -50,19 +50,32 @@ * AddE = (dictionary) key/value pairs added to the environment * SetE = (dictionary) key/value pairs to be used as environment * Deps = (array) Names of other processes needed by this one - * KeepStandardInput = (boolean) Don't close stdin * KeepStandardOutput = (boolean) Don't close stdout * KeepStandardError = (boolean) Don't close stderr * * ValgrindPath = (string) Run under valgrind (unless empty) * ValgrindArgs = (array) Run under valgrind using these args + * + * Ping = (integer) If greater than zero, this is the ping delay in + * seconds (otherwise the default ofd 240 is used). + * When a process has been unresponsive to the + * Command server for this number of seconds, that + * process is considered to be hung. + * HungTime = (integer) If greater than zero this the interval after + * which an apparently hung process has a restart + * scheduled. If/when the restart is scheduled + * either the HungTool or gcore is executed + * to save information about the hung process. + * HungTool = (string) The name of a tool or script to be executed + * after HungTime in order to gather information + * about the hung process. It's arguments will + * be the process name and the process ID. */ Foo = { Prog = "/usr/GNUstep/Local/Tools/Foo"; // Full path to binary Home = "Test"; // Directory to run in Args = ("-Debug", "YES"); // Args to launch with Auto = NO; // Auto-launch? - KeepStandardInput = YES; // Don't close stdin KeepStandardOutput = YES; // Don't close stdout KeepStandardError = YES; // Don't close stderr }; diff --git a/EcCommand.m b/EcCommand.m index 3dd99e3..3df2f09 100644 --- a/EcCommand.m +++ b/EcCommand.m @@ -236,7 +236,6 @@ desiredName(Desired state) /** The current task (if launched by us). */ NSTask *task; - NSFileHandle *taskInput; /** The client instance representing a registered distributed objects * connection from the process into the Command server. @@ -427,8 +426,8 @@ desiredName(Desired state) - (NSTimeInterval) delay; - (Desired) desired; - (BOOL) disabled; +- (NSTimeInterval) hungDate; - (BOOL) isActive; -- (BOOL) isHung; - (BOOL) isStarting; - (BOOL) isStopping; - (BOOL) launch; @@ -589,6 +588,7 @@ desiredName(Desired state) - (NSString*) host; - (void) housekeeping: (NSTimer*)t; - (void) _housekeeping: (NSTimer*)t; +- (void) hungRestart: (LaunchInfo*)l; - (void) information: (NSString*)inf from: (NSString*)s type: (EcLogType)t; @@ -836,8 +836,7 @@ desiredName(Desired state) */ [[NSNotificationCenter defaultCenter] removeObserver: l]; l->client = nil; - DESTROY(l->taskInput); - DESTROY(l->task); + [l taskCleanup: l->task]; [l->startingTimer invalidate]; l->startingTimer = nil; [l->stoppingTimer invalidate]; @@ -895,6 +894,15 @@ desiredName(Desired state) */ - (BOOL) checkActive { + if ([self hungDate] > 0.0 && identifier > 0) + { + /* A hung process may be considered active but there is no point + * trying to send a DO message to communicate with it and ask it + * to respond to us, because 'hung' means that we have a DO connection + * to the process but it's not talking to us over it. + */ + return YES; + } if (nil == client) { ENTER_POOL @@ -1030,8 +1038,10 @@ desiredName(Desired state) RELEASE(client); RELEASE(name); RELEASE(conf); - RELEASE(taskInput); - RELEASE(task); + if (task) + { + [self taskCleanup: task]; + } [super dealloc]; } @@ -1154,6 +1164,11 @@ desiredName(Desired state) return [[conf objectForKey: @"Disabled"] boolValue]; } +- (NSTimeInterval) hungDate +{ + return hungDate; +} + /* Returns YES if the client is in a state where it can be sent commands. */ - (BOOL) isActive @@ -1161,11 +1176,6 @@ desiredName(Desired state) return (client != nil && NO == [self isStopping]) ? YES : NO; } -- (BOOL) isHung -{ - return (hungDate > 0.0) ? YES : NO; -} - - (BOOL) isStarting { return (startingDate > 0.0) ? YES : NO; @@ -1454,7 +1464,8 @@ valgrindLog(NSString *name) if (nil == opt) { opt = [GSTLSObject dataForTLSFile: @"self-signed-key"]; - if (nil == opt) + if (nil == opt && [GSTLSCredentials + respondsToSelector: @selector(selfSigned:)]) { (void)[GSTLSCredentials selfSigned: YES]; opt = [GSTLSObject @@ -1514,13 +1525,16 @@ valgrindLog(NSString *name) hiddenArguments = m; p = [NSPipe pipe]; - ASSIGN(taskInput, [p fileHandleForWriting]); [task setStandardInput: p]; if (nil == vgPath) { - NSFileHandle *hdl = [NSFileHandle fileHandleWithNullDevice]; + static NSFileHandle *hdl = nil; + if (nil == hdl) + { + ASSIGN(hdl, [NSFileHandle fileHandleWithNullDevice]); + } s = [conf objectForKey: @"KeepStandardOutput"]; if (NO == [s respondsToSelector: @selector(boolValue)] || NO == [s boolValue]) @@ -1553,9 +1567,15 @@ valgrindLog(NSString *name) selector: @selector(taskTerminated:) name: NSTaskDidTerminateNotification object: task]; + [[NSNotificationCenter defaultCenter] + addObserver: self + selector: @selector(taskWritten:) + name: GSFileHandleWriteCompletionNotification + object: [p fileHandleForWriting]]; launchDate = [NSDate timeIntervalSinceReferenceDate]; [task launch]; - [taskInput writeInBackgroundAndNotify: hiddenArguments]; + [[p fileHandleForWriting] + writeInBackgroundAndNotify: hiddenArguments]; identifier = [task processIdentifier]; [[command logFile] printf: @"%@ launched %@ with %@ and hidden values for %@\n", @@ -1570,8 +1590,7 @@ valgrindLog(NSString *name) name: NSTaskDidTerminateNotification object: task]; launchDate = 0.0; - DESTROY(taskInput); - DESTROY(task); + [self taskCleanup: task]; failed = @"failed to launch"; m = [NSString stringWithFormat: cmdLogFormat(LT_CONSOLE, @"failed to launch (%@) %@"), localException, name]; @@ -1951,8 +1970,8 @@ valgrindLog(NSString *name) clientLostDate = 0.0; clientQuitDate = 0.0; [launchQueue removeObject: self]; + [self clearHung]; pingDate = 0.0; - hungDate = 0.0; queuedDate = 0.0; if ([self isStarting]) { @@ -2007,8 +2026,7 @@ valgrindLog(NSString *name) name: NSTaskDidTerminateNotification object: task]; launchDate = 0.0; - DESTROY(taskInput); - DESTROY(task); + [self taskCleanup: task]; } registrationDate = [NSDate timeIntervalSinceReferenceDate]; identifier = newPid; @@ -2261,8 +2279,7 @@ valgrindLog(NSString *name) name: NSTaskDidTerminateNotification object: task]; launchDate = 0.0; - DESTROY(taskInput); - DESTROY(task); + [self taskCleanup: task]; } kill(identifier, SIGKILL); identifier = 0; @@ -2468,7 +2485,7 @@ valgrindLog(NSString *name) status = [NSString stringWithFormat: @"Stopping (pid:%d) since %@", [self processIdentifier], date(stoppingDate)]; } - else if ([self isHung]) + else if ([self hungDate] > 0.0) { status = [NSString stringWithFormat: @"Hung (pid:%d) since %@", [self processIdentifier], date(hungDate)]; @@ -2524,7 +2541,15 @@ valgrindLog(NSString *name) [self resetDelay]; stoppingDate = [NSDate timeIntervalSinceReferenceDate]; abortDate = abortDateFromStoppingDate(stoppingDate); - if (nil == client) + if ([self hungDate] > 0.0 && identifier > 0) + { + /* The process is hung and we assume it can't shut down gracefully. + * We should have started a subtask to get process status information + * and will abort the process later. + */ + NSLog(@"-stop hung process %d for %@", identifier, self); + } + else if (nil == client) { /* No connection to client established ... try to shut it down * using a signal. @@ -2609,6 +2634,7 @@ valgrindLog(NSString *name) awakenedDate = 0.0; stableDate = 0.0; abortDate = 0.0; + [self clearHung]; if (clientLostDate > 0.0 || clientQuitDate > 0.0) { @@ -2862,12 +2888,7 @@ valgrindLog(NSString *name) */ if (nil != task) { - [[NSNotificationCenter defaultCenter] - removeObserver: self - name: NSTaskDidTerminateNotification - object: task]; - DESTROY(taskInput); - DESTROY(task); + [self taskCleanup: task]; } terminationSignal = -1; // We use -1 to indicate a forced quit terminationStatus = 0; @@ -2918,10 +2939,8 @@ valgrindLog(NSString *name) return AUTORELEASE(RETAIN(task)); } -- (void) taskTerminated: (NSNotification*)n +- (void) taskCleanup: (NSTask*)t { - NSTask *t = (NSTask*)[n object]; - if (nil == t) { /* For a fake termination, use existing task, if any. @@ -2931,12 +2950,47 @@ valgrindLog(NSString *name) if (nil != t) { - /* If we have a task, stop observing notifications for it. - */ + NSFileHandle *fh = [t standardInput]; + [[NSNotificationCenter defaultCenter] removeObserver: self name: NSTaskDidTerminateNotification object: t]; + + if ([fh isKindOfClass: [NSPipe class]]) + { + fh = [(NSPipe*)fh fileHandleForWriting]; + } + if (fh) + { + [[NSNotificationCenter defaultCenter] + removeObserver: self + name: GSFileHandleWriteCompletionNotification + object: fh]; + NS_DURING + { + [fh closeFile]; + } + NS_HANDLER + NS_ENDHANDLER + } + } + + if (t == task) + { + DESTROY(task); + } +} + +- (void) taskTerminated: (NSNotification*)n +{ + NSTask *t = (NSTask*)[n object]; + + if (nil == t) + { + /* For a fake termination, use existing task, if any. + */ + t = task; } if (t == task) @@ -2963,8 +3017,6 @@ valgrindLog(NSString *name) terminationStatusKnown = YES; terminationDate = [NSDate timeIntervalSinceReferenceDate]; launchDate = 0.0; - DESTROY(taskInput); - DESTROY(task); if (terminationSignal != 0) { NSLog(@"Termination signal %d for %@ (pid %d)", @@ -2976,8 +3028,29 @@ valgrindLog(NSString *name) terminationStatus, name, identifier); } identifier = 0; + [self taskCleanup: t]; [self stopping: nil]; } + else if (t) + { + [self taskCleanup: t]; + } +} + +- (void) taskWritten: (NSNotification*)n +{ + NSFileHandle *fh = (NSFileHandle*)[n object]; + + [[NSNotificationCenter defaultCenter] + removeObserver: self + name: GSFileHandleWriteCompletionNotification + object: fh]; + NS_DURING + { + [fh closeFile]; + } + NS_HANDLER + NS_ENDHANDLER } - (NSArray*) unfulfilled @@ -3843,7 +3916,7 @@ NSLog(@"Problem %@", localException); LaunchInfo *l = [LaunchInfo existing: n]; [l setPing]; // Record the fact that we have a ping response. - if ([l isHung]) + if ([l hungDate] > 0.0) { /* Had a ping response, so the process is no longer hung. */ @@ -4217,7 +4290,7 @@ NSLog(@"Problem %@", localException); { s = " starting"; } - else if ([l isHung]) + else if ([l hungDate] > 0.0) { s = " hung"; } @@ -4502,7 +4575,7 @@ NSLog(@"Problem %@", localException); { NSString *when = @"shortly"; - if ([l isHung]) + if ([l hungDate] > 0.0) { when = [NSString stringWithFormat: @"in about %d seconds", (int)quitTime]; @@ -4510,7 +4583,14 @@ NSLog(@"Problem %@", localException); m = [m stringByAppendingFormat: @" The process '%@' should restart %@.\n", [l name], when]; - [l restart: reason]; + if ([l hungDate] > 0.0 && NO == [l isStopping]) + { + [self hungRestart: l]; + } + else + { + [l restart: reason]; + } found = YES; } } @@ -5932,29 +6012,65 @@ NSLog(@"Problem %@", localException); count = [a count]; while (count-- > 0) { - EcClientI *r = [a objectAtIndex: count]; - LaunchInfo *l = [LaunchInfo existing: [r name]]; - NSDate *d = [r outstanding]; + EcClientI *r = [a objectAtIndex: count]; + LaunchInfo *l = [LaunchInfo existing: [r name]]; + NSDate *d = [r outstanding]; + NSString *s; + NSTimeInterval delay = 0.0; if ([clients indexOfObjectIdenticalTo: r] == NSNotFound) { continue; } - if (d != nil && [d timeIntervalSinceDate: now] < -pingDelay) + s = [[l configuration] objectForKey: @"Ping"]; + if ([s respondsToSelector: @selector(intValue)]) + { + delay = (NSTimeInterval)[s intValue]; + } + if (delay <= 0.0) + { + delay = pingDelay; // Default + } + if (d != nil && [d timeIntervalSinceDate: now] < -delay) { - if (nil == l || NO == [l isHung]) + if ([l hungDate] > 0.0) + { + id r; + + /* We have what looks like a hung process: + * See if it is not yet stopping and is configured to + * restart after an interval when hung. + */ + if (NO == [l isStopping] + && (r = [[l configuration] objectForKey: @"HungTime"]) + && [r respondsToSelector: @selector(intValue)]) + { + NSTimeInterval ti; + + /* See how long the process has been hung for, + * and if it exceeds the time allowed, restart. + */ + ti = [now timeIntervalSinceReferenceDate]; + ti -= [l hungDate]; + if (ti > (NSTimeInterval)[r intValue]) + { + [self hungRestart: l]; + } + } + } + else { NSString *m; [l setHung]; m = [NSString stringWithFormat: - @"failed to respond for over %d seconds", (int)pingDelay]; + @"failed to respond for over %d seconds", (int)delay]; [self alarmCode: ACProcessHung procName: [r name] addText: m]; m = [NSString stringWithFormat: cmdLogFormat(LT_CONSOLE, @"Client '%@' failed to respond for over %d seconds"), - [r name], (int)pingDelay]; + [r name], (int)delay]; [self information: m from: nil to: nil type: LT_CONSOLE]; } } @@ -6143,6 +6259,135 @@ NSLog(@"Problem %@", localException); } } +/** Initiate a restart of a hung process + */ +- (void) hungRestart: (LaunchInfo*)l +{ + NSString *prog; + NSTask *task; + NSArray *args; + NSString *base; + NSString *home; + NSString *pid; + + base = [[self ecUserDirectory] stringByAppendingPathComponent: @"DebugLogs"]; + + home = [[l configuration] objectForKey: @"Home"]; + if (home) + { + home = [home stringByStandardizingPath]; + if (![home isAbsolutePath]) + { + home = [base stringByAppendingPathComponent: home]; + } + } + else + { + home = base; + } + + pid = [NSString stringWithFormat: @"%d", [l processIdentifier]]; + + task = [NSTask new]; + + prog = [[l configuration] objectForKey: @"HungTool"]; + if ([prog isKindOfClass: [NSString class]]) + { + /* Allow an empty tool name to mean that we shold not execute a tool. + */ + prog = [prog stringByTrimmingSpaces]; + if ([prog length] == 0) + { + NSLog(@"Empty HungTool for %@; No info gathering.", [l name]); + } + else + { + [task setLaunchPath: prog]; + if ([task validatedLaunchPath] == nil) + { + NSLog(@"Failed to find HungTool (%@). Fallback to gcore", prog); + prog = nil; + } + } + } + if (prog) + { + /* Run the supplied program using the process name and pid as args + */ + args = [NSArray arrayWithObjects: [l name], pid, nil]; + } + else + { + /* Run gcode to get a coredump + */ + prog = @"gcore"; + args = [NSArray arrayWithObjects: pid, nil]; + } + + [task setArguments: args]; + [task setLaunchPath: prog]; + if (home != nil && [home length] > 0) + { + [task setCurrentDirectoryPath: home]; + } + + if ([task validatedLaunchPath] == nil) + { + if ([prog length] > 0) + { + NSLog(@"Failed to find gcore to get info for hung process %@", pid); + } + RELEASE(task); + } + else + { + NSString *msg; + + [task setStandardInput: [NSFileHandle fileHandleWithNullDevice]]; + + [[NSNotificationCenter defaultCenter] + addObserver: self + selector: @selector(hungToolTerminated:) + name: NSTaskDidTerminateNotification + object: task]; + msg = [NSString stringWithFormat: + @"Gathering hung process info using task %p for %@ (%d).", + task, [l name], [l processIdentifier]]; + [self information: msg + from: nil + to: nil + type: LT_CONSOLE]; + [task launch]; + } + + /* Here we need to run a subprocess to get a core dump of the hung process + */ + [l restart: @"Process appears to be hung"]; +} + +- (void) hungToolTerminated: (NSNotification*)n +{ + NSTask *task = (NSTask*)[n object]; + NSString *msg; + + if (NSTaskTerminationReasonUncaughtSignal == [task terminationReason]) + { + msg = [NSString stringWithFormat: + @"Gathering hung process info using task %p" + @" exited with signal %d.", task, [task terminationStatus]]; + } + else + { + msg = [NSString stringWithFormat: + @"Gathering hung process info using task %p" + @" returned status %d", task, [task terminationStatus]]; + } + [self information: msg + from: nil + to: nil + type: LT_CONSOLE]; + RELEASE(task); +} - (NSMutableArray*) unconfiguredClients { diff --git a/EcProcess.m b/EcProcess.m index 0d76409..ecd37ee 100644 --- a/EcProcess.m +++ b/EcProcess.m @@ -1652,7 +1652,8 @@ findMode(NSDictionary* d, NSString* s) [opts setObject: @"self-signed-crt" forKey: GSTLSCertificateFile]; } - [NSSocketPort setOptionsForTLS: opts]; + [NSSocketPort performSelector: @selector(setOptionsForTLS:) + withObject: opts]; } } #endif @@ -3137,7 +3138,8 @@ NSLog(@"Ignored attempt to set timer interval to %g ... using 10.0", interval); */ if ([NSSocketPort respondsToSelector: @selector(setOptionsForTLS:)]) { - [NSSocketPort setOptionsForTLS: [NSDictionary dictionary]]; + [NSSocketPort performSelector: @selector(setOptionsForTLS:) + withObject: [NSDictionary dictionary]]; } #endif ecLock = [NSRecursiveLock new]; diff --git a/EcTest.m b/EcTest.m index 8189989..aae0f14 100644 --- a/EcTest.m +++ b/EcTest.m @@ -53,7 +53,8 @@ setup() */ if ([NSSocketPort respondsToSelector: @selector(setOptionsForTLS:)]) { - [NSSocketPort setOptionsForTLS: [NSDictionary dictionary]]; + [NSSocketPort performSelector: @selector(setOptionsForTLS:) + withObject: [NSDictionary dictionary]]; } } }