diff --git a/tests/runner.pm b/tests/runner.pm index 65dbd6f9b9..813c669890 100644 --- a/tests/runner.pm +++ b/tests/runner.pm @@ -1171,7 +1171,7 @@ sub runner_test_run { # Async call runner_clearlocks # Called by controller sub runnerac_clearlocks { - controlleripccall(\&runner_clearlocks, @_); + return controlleripccall(\&runner_clearlocks, @_); } # Async call runner_shutdown @@ -1180,36 +1180,38 @@ sub runnerac_clearlocks { # Called by controller sub runnerac_shutdown { my ($runnerid)=$_[0]; - controlleripccall(\&runner_shutdown, @_); + my $err = controlleripccall(\&runner_shutdown, @_); # These have no more use close($controllerw{$runnerid}); undef $controllerw{$runnerid}; close($controllerr{$runnerid}); undef $controllerr{$runnerid}; + return $err; } # Async call of runner_stopservers # Called by controller sub runnerac_stopservers { - controlleripccall(\&runner_stopservers, @_); + return controlleripccall(\&runner_stopservers, @_); } # Async call of runner_test_preprocess # Called by controller sub runnerac_test_preprocess { - controlleripccall(\&runner_test_preprocess, @_); + return controlleripccall(\&runner_test_preprocess, @_); } # Async call of runner_test_run # Called by controller sub runnerac_test_run { - controlleripccall(\&runner_test_run, @_); + return controlleripccall(\&runner_test_run, @_); } ################################################################### # Call an arbitrary function via IPC # The first argument is the function reference, the second is the runner ID +# Returns 0 on success, -1 on error writing to runner # Called by controller (indirectly, via a more specific function) sub controlleripccall { my $funcref = shift @_; @@ -1223,30 +1225,44 @@ sub controlleripccall { my $margs = freeze \@_; # Send IPC call via pipe - syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs); + my $err; + while(! defined ($err = syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs)) || $err <= 0) { + if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) { + # Runner has likely died + return -1; + } + # system call was interrupted, probably by ^C; restart it so we stay in sync + } if(!$multiprocess) { # Call the remote function here in single process mode ipcrecv(); } + return 0; } ################################################################### # Receive async response of a previous call via IPC -# The first return value is the runner ID +# The first return value is the runner ID or undef on error # Called by controller sub runnerar { my ($runnerid) = @_; my $err; my $datalen; while(! defined ($err = sysread($controllerr{$runnerid}, $datalen, 4)) || $err <= 0) { - $!{EINTR} || die "error in runnerar: $!\n"; + if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) { + # Runner is likely dead and closed the pipe + return undef; + } # system call was interrupted, probably by ^C; restart it so we stay in sync } my $len=unpack("L", $datalen); my $buf; while(! defined ($err = sysread($controllerr{$runnerid}, $buf, $len)) || $err <= 0) { - $!{EINTR} || die "error in runnerar: $!\n"; + if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) { + # Runner is likely dead and closed the pipe + return undef; + } # system call was interrupted, probably by ^C; restart it so we stay in sync } @@ -1260,7 +1276,9 @@ sub runnerar { } ################################################################### -# Returns runner ID if a response from an async call is ready +# Returns runner ID if a response from an async call is ready or error +# First value is ready, second is error, however an error case shows up +# as ready in Linux, so you can't trust it. # argument is 0 for nonblocking, undef for blocking, anything else for timeout # Called by controller sub runnerar_ready { @@ -1279,19 +1297,26 @@ sub runnerar_ready { $maxfileno || die "Internal error: no runners are available to wait on\n"; # Wait for any pipe from any runner to be ready + # This may be interrupted and return EINTR, but this is ignored and the + # caller will need to later call this function again. # TODO: this is relatively slow with hundreds of fds - # TODO: handle errors - if(select(my $rout=$rin, undef, undef, $blocking)) { + my $ein = $rin; + if(select(my $rout=$rin, undef, my $eout=$ein, $blocking) >= 1) { for my $fd (0..$maxfileno) { + # Return an error condition first in case it's both + if(vec($eout, $fd, 1)) { + return (undef, $idbyfileno{$fd}); + } if(vec($rout, $fd, 1)) { - return $idbyfileno{$fd}; + return ($idbyfileno{$fd}, undef); } } die "Internal pipe readiness inconsistency\n"; } - return undef; + return (undef, undef); } + ################################################################### # Cleanly abort and exit the runner # This uses print since there is no longer any controller to write logs. diff --git a/tests/runtests.pl b/tests/runtests.pl index 2c10c12779..d7a6843c52 100755 --- a/tests/runtests.pl +++ b/tests/runtests.pl @@ -272,6 +272,7 @@ sub catch_usr1 { } $SIG{USR1} = \&catch_usr1; +$SIG{PIPE} = 'IGNORE'; # these errors are captured in the read/write calls ########################################################################## # Clear all possible '*_proxy' environment variables for various protocols @@ -1505,12 +1506,20 @@ sub singletest_check { if(!$filename) { logmsg "ERROR: section verify=>file$partsuffix ". "has no name attribute\n"; - runnerac_stopservers($runnerid); - # TODO: this is a blocking call that will stall the controller, - # but this error condition should never happen except during - # development. - my ($rid, $unexpected, $logs) = runnerar($runnerid); - logmsg $logs; + if (runnerac_stopservers($runnerid)) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + } else { + + # TODO: this is a blocking call that will stall the controller, + # but this error condition should never happen except during + # development. + my ($rid, $unexpected, $logs) = runnerar($runnerid); + if(!$rid) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + } else { + logmsg $logs; + } + } # timestamp test result verification end $timevrfyend{$testnum} = Time::HiRes::time(); return -1; @@ -1740,7 +1749,11 @@ sub singletest { && $clearlocks) { # On Windows, lock files can't be deleted when the process still # has them open, so kill those processes first - runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR"); + if(runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR")) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } $singletest_state{$runnerid} = ST_CLEARLOCKS; } else { $singletest_state{$runnerid} = ST_INITED; @@ -1751,6 +1764,11 @@ sub singletest { } elsif($singletest_state{$runnerid} == ST_CLEARLOCKS) { my ($rid, $logs) = runnerar($runnerid); + if(!$rid) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } logmsg $logs; my $logdir = getrunnerlogdir($runnerid); cleardir($logdir); @@ -1776,11 +1794,20 @@ sub singletest { # Register the test case with the CI environment citest_starttest($testnum); - runnerac_test_preprocess($runnerid, $testnum); + if(runnerac_test_preprocess($runnerid, $testnum)) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } $singletest_state{$runnerid} = ST_PREPROCESS; } elsif($singletest_state{$runnerid} == ST_PREPROCESS) { my ($rid, $why, $error, $logs, $testtimings) = runnerar($runnerid); + if(!$rid) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } logmsg $logs; updatetesttimings($testnum, %$testtimings); if($error == -2) { @@ -1813,11 +1840,20 @@ sub singletest { my $CURLOUT; my $tool; my $usedvalgrind; - runnerac_test_run($runnerid, $testnum); + if(runnerac_test_run($runnerid, $testnum)) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } $singletest_state{$runnerid} = ST_RUN; } elsif($singletest_state{$runnerid} == ST_RUN) { my ($rid, $error, $logs, $testtimings, $cmdres, $CURLOUT, $tool, $usedvalgrind) = runnerar($runnerid); + if(!$rid) { + logmsg "ERROR: runner $runnerid seems to have died\n"; + $singletest_state{$runnerid} = ST_INIT; + return (-1, 0); + } logmsg $logs; updatetesttimings($testnum, %$testtimings); if($error == -1) { @@ -2782,8 +2818,13 @@ while () { # this runner is busy running a test $runnersrunning{$runnerid} = $testnum; } else { - # We make this assumption to avoid having to handle $error here - die "Internal error: test must not complete on first call"; + runnerready($runnerid); + if($error >= 0) { + # We make this simplifying assumption to avoid having to handle + # $error properly here, but we must handle the case of runner + # death without abending here. + die "Internal error: test must not complete on first call"; + } } } @@ -2799,7 +2840,15 @@ while () { # one immediately. If all runners are busy, wait a fraction of a second # for one to finish so we can still loop around to check the abort flag. my $runnerwait = scalar(@runnersidle) && scalar(@runtests) ? 0 : 0.5; - my $ridready = runnerar_ready($runnerwait); + my ($ridready, $riderror) = runnerar_ready($runnerwait); + if($ridready && ! defined $runnersrunning{$ridready}) { + # On Linux, a closed pipe still shows up as ready instead of error. + # Detect this here by seeing if we are expecting it to be ready and + # treat it as an error if not. + logmsg "ERROR: Runner $ridready is unexpectedly ready; is probably actually dead\n"; + $riderror = $ridready; + undef $ridready; + } if($ridready) { # This runner is ready to be serviced my $testnum = $runnersrunning{$ridready}; @@ -2812,7 +2861,6 @@ while () { } else { # Test is complete runnerready($ridready); -print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning)) . "\n"; if($error < 0) { # not a test we can run @@ -2847,6 +2895,11 @@ print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning)) } } } + if($riderror) { + logmsg "ERROR: runner $riderror is dead! aborting test run\n"; + delete $runnersrunning{$riderror} if(defined $runnersrunning{$riderror}); + $globalabort = 1; + } } my $sofar = time() - $start;