From e34540f79be9c915ce1d6191575267066370f6ec Mon Sep 17 00:00:00 2001 From: Paul Smith Date: Sun, 1 Aug 1999 08:12:06 +0000 Subject: [PATCH] * Release 3.77.92. * Complete implementation of new jobserver algorithm. * A few minor fixups. --- ChangeLog | 39 +++++++++++++++++- arscan.c | 7 +--- configure.in | 2 +- function.c | 6 +-- job.c | 109 ++++++++++++++++++++++++++++----------------------- main.c | 11 ++---- make.h | 8 ++++ remake.c | 7 +--- 8 files changed, 117 insertions(+), 72 deletions(-) diff --git a/ChangeLog b/ChangeLog index 59d403d5..14f2c6d1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,43 @@ +1999-08-01 Paul D. Smith + + New jobserver algorithm to avoid a possible hole where we could + miss SIGCHLDs and get into a deadlock. The original algorithm was + suggested by Roland McGrath with a nice refinement by Paul Eggert. + Many thanks as well to Tim Magill and Howard Chu, who also + provided many viable ideas and critiques. We all had a fun week + dreaming up interesting ways to use and abuse UNIX syscalls :). + + Previously we could miss a SIGCHLD if it happened after we reaped + the children but before we re-entered the blocking read. If this + happened to all makes and/or all children, make would never wake + up. + + We avoid this by having the SIGCHLD handler reset the blocking bit + on the jobserver pipe read FD (normally read does block in this + algorithm). Now if the handler is called between the time we reap + and the time we read(), and there are no tokens available, the + read will merely return with EAGAIN instead of blocking. + + * main.c (main): Set the blocking bit explicitly here. + * job.c (child_handler): If we have a jobserver pipe, set the + non-blocking bit for it. + (start_waiting_job): Move the token stuff back to new_job; if we + do it here then we're not controlling the number of remote jobs + started! + (new_job): Move the check for job slots to _after_ we've created a + child structure. If the read returns without getting a token, set + the blocking bit then try to reap_children. + + * make.h (EINTR_SET): Define to test errno if EINTR is available, + or 0 otherwise. Just some code cleanup. + * arscan.c (ar_member_touch): Use it. + * function.c (func_shell): Use it. + * job.c (reap_children): Use it. + * remake.c (touch_file): Use it. + 1999-07-28 Paul D. Smith - * make.h: Define _() and N_() macros as a passthrough to initiate + * make.h: Define _() and N_() macros as passthrough to initiate NLS support. * : Add _()/N_() around translatable strings. diff --git a/arscan.c b/arscan.c index cc9b9fd2..25e739a7 100644 --- a/arscan.c +++ b/arscan.c @@ -775,11 +775,8 @@ ar_member_touch (arname, memname) if (AR_HDR_SIZE != write (fd, (char *) &ar_hdr, AR_HDR_SIZE)) goto lose; /* The file's mtime is the time we we want. */ -#ifdef EINTR - while (fstat (fd, &statbuf) < 0 && errno == EINTR); -#else - fstat (fd, &statbuf); -#endif + while (fstat (fd, &statbuf) < 0 && EINTR_SET) + ; #if defined(ARFMAG) || defined(ARFZMAG) || defined(AIAMAG) || defined(WINDOWS32) /* Advance member's time to that time */ for (i = 0; i < sizeof ar_hdr.ar_date; i++) diff --git a/configure.in b/configure.in index 096df541..a864e17c 100644 --- a/configure.in +++ b/configure.in @@ -3,7 +3,7 @@ AC_REVISION([$Id$]) AC_PREREQ(2.13)dnl dnl Minimum Autoconf version required. AC_INIT(vpath.c)dnl dnl A distinctive file to look for in srcdir. -AM_INIT_AUTOMAKE(make, 3.77.91) +AM_INIT_AUTOMAKE(make, 3.77.92) AM_CONFIG_HEADER(config.h) dnl Regular configure stuff diff --git a/function.c b/function.c index 07712744..4047339f 100644 --- a/function.c +++ b/function.c @@ -1381,11 +1381,7 @@ func_shell (o, argv, funcname) if (cc > 0) i += cc; } -#ifdef EINTR - while (cc > 0 || errno == EINTR); -#else - while (cc > 0); -#endif + while (cc > 0 || EINTR_SET); /* Close the read side of the pipe. */ #ifdef __MSDOS__ diff --git a/job.c b/job.c index 17743a22..575a0303 100644 --- a/job.c +++ b/job.c @@ -145,6 +145,17 @@ extern int wait (); #endif /* Don't have `union wait'. */ +/* How to set close-on-exec for a file descriptor. */ + +#if !defined F_SETFD +# define CLOSE_ON_EXEC(_d) +#else +# ifndef FD_CLOEXEC +# define FD_CLOEXEC 1 +# endif +# define CLOSE_ON_EXEC(_d) (void) fcntl ((_d), F_SETFD, FD_CLOEXEC) +#endif + #ifdef VMS static int vms_jobsefnmask = 0; #endif /* !VMS */ @@ -218,9 +229,6 @@ int w32_kill(int pid, int sig) #endif /* WINDOWS32 */ -#ifndef MAKE_JOBSERVER -# define free_job_token(c) -#else static void free_job_token (child) struct child *child; @@ -248,7 +256,6 @@ free_job_token (child) child->job_token = '-'; } -#endif /* Write an error message describing the exit status given in @@ -302,14 +309,13 @@ vmsWaitForChildren(int *status) /* Handle a dead child. This handler may or may not ever be installed. If we're using the jobserver blocking read, we need it. First, installing - it ensures the read will interrupt on SIGCHLD. Second, we close the dup'd - read side of the pipe to ensure we don't enter another blocking read - without reaping all the dead children. In this case we don't need the - dead_children count. + it ensures the read will interrupt on SIGCHLD. Second, we reset the + blocking bit on the read side of the pipe to ensure we don't enter another + blocking read without reaping all the dead children. In this case we + don't need the dead_children count. If we don't have either waitpid or wait3, then make is unreliable, but we - use the dead_children count to reap children as best we can. In this case - job_rfd will never be >= 0. */ + use the dead_children count to reap children as best we can. */ static unsigned int dead_children = 0; @@ -319,9 +325,15 @@ child_handler (sig) { ++dead_children; - if (job_rfd >= 0) - close (job_rfd); - job_rfd = -1; +#ifdef HAVE_JOBSERVER + if (job_fds[0] >= 0) + { + int fl = fcntl(job_fds[0], F_GETFL, 0); + + if (fl >= 0) + fcntl(job_fds[0], F_SETFL, fl | O_NONBLOCK); + } +#endif if (debug_flag) printf (_("Got a SIGCHLD; %u unreaped children.\n"), dead_children); @@ -330,11 +342,12 @@ child_handler (sig) extern int shell_function_pid, shell_function_completed; -/* Reap dead children, storing the returned status and the new command +/* Reap all dead children, storing the returned status and the new command state (`cs_finished') in the `file' member of the `struct child' for the - dead child, and removing the child from the chain. If BLOCK nonzero, - reap at least one child, waiting for it to die if necessary. If ERR is - nonzero, print an error message first. */ + dead child, and removing the child from the chain. In addition, if BLOCK + nonzero, we block in this function until we've reaped at least one + complete child, waiting for it to die if necessary. If ERR is nonzero, + print an error message first. */ void reap_children (block, err) @@ -415,19 +428,18 @@ reap_children (block, err) { /* A remote status command failed miserably. Punt. */ remote_status_lose: -#ifdef EINTR - if (errno == EINTR) + if (EINTR_SET) continue; -#endif + pfatal_with_name ("remote_status"); } else { /* No remote children. Check for local children. */ - #if !defined(__MSDOS__) && !defined(_AMIGA) && !defined(WINDOWS32) if (any_local) { + local_wait: #ifdef VMS vmsWaitForChildren (&status); pid = c->pid; @@ -446,15 +458,14 @@ reap_children (block, err) if (pid < 0) { /* The wait*() failed miserably. Punt. */ -#ifdef EINTR - if (errno == EINTR) - continue; -#endif + if (EINTR_SET) + goto local_wait; + pfatal_with_name ("wait"); } else if (pid > 0) { - /* We got one; chop the status word up. */ + /* We got a child exit; chop the status word up. */ exit_code = WEXITSTATUS (status); exit_sig = WIFSIGNALED (status) ? WTERMSIG (status) : 0; coredump = WCOREDUMP (status); @@ -1171,20 +1182,16 @@ start_waiting_job (c) c->remote = start_remote_job_p (1); - /* If not, start it locally. */ - if (!c->remote) + /* If we are running at least one job already and the load average + is too high, make this one wait. */ + if (!c->remote && job_slots_used > 0 && load_too_high ()) { - /* If we are running at least one job already and the load average - is too high, make this one wait. */ - if (job_slots_used > 0 && load_too_high ()) - { - /* Put this child on the chain of children waiting - for the load average to go down. */ - set_command_state (f, cs_running); - c->next = waiting_jobs; - waiting_jobs = c; - return 0; - } + /* Put this child on the chain of children waiting for the load average + to go down. */ + set_command_state (f, cs_running); + c->next = waiting_jobs; + waiting_jobs = c; + return 0; } /* Start the first command; reap_children will run later command lines. */ @@ -1379,19 +1386,25 @@ new_job (file) } /* Read a token. As long as there's no token available we'll block. If we get a SIGCHLD we'll return with EINTR. If one happened - before we got here we'll return with EBADF. */ - else if (read (job_rfd, &c->job_token, 1) < 1) + before we got here we'll return immediately with EAGAIN because + the signal handler unsets the blocking bit. */ + else if (read (job_fds[0], &c->job_token, 1) < 1) { - if (errno == EINTR) - ; + int fl; - /* If EBADF, we got a SIGCHLD before. Otherwise, who knows? */ - else if (errno != EBADF) +#if !defined(EAGAIN) +# define EAGAIN EWOULDBLOCK +#endif + if (errno != EINTR && errno != EAGAIN) pfatal_with_name (_("read jobs pipe")); - /* Something's done; reap it. We don't force a block here; if - something strange happened and nothing's ready, just come back - and wait some more. */ + /* Set the blocking bit on the read FD again, just in case. */ + fl = fcntl(job_fds[0], F_GETFL, 0); + if (fl >= 0) + fcntl(job_fds[0], F_SETFL, fl & ~O_NONBLOCK); + + /* Something's done. We don't want to block for a whole child, + just reap whatever's there. */ reap_children (0, 0); } diff --git a/main.c b/main.c index d40ae3fc..c51fe7c7 100644 --- a/main.c +++ b/main.c @@ -205,7 +205,6 @@ static unsigned int inf_jobs = 0; /* File descriptors for the jobs pipe. */ int job_fds[2] = { -1, -1 }; -int job_rfd = -1; /* Maximum load average at which multiple jobs will be run. Negative values mean unlimited, while zero means limit to @@ -1353,15 +1352,13 @@ int main (int argc, char ** argv) job_slots_str = xstrdup(buf); } - /* If we have a jobserver pipe, dup(2) the read end. We'll use that in - the child handler to note a child has died. See job.c. */ - + /* Be sure the blocking bit on the read FD is set to start with. */ if (job_fds[0] >= 0) { - job_rfds = dup (job_fds[0]); - CLOSE_ON_EXEC (job_rfds); + int fl = fcntl(job_fds[0], F_GETFL, 0); + if (fl >= 0) + fcntl(job_fds[0], F_SETFL, fl & ~O_NONBLOCK); } - #endif /* Set up MAKEFLAGS and MFLAGS again, so they will be right. */ diff --git a/make.h b/make.h index 0496ce74..26eaec90 100644 --- a/make.h +++ b/make.h @@ -77,6 +77,14 @@ Boston, MA 02111-1307, USA. */ extern int errno; #endif +/* A shortcut for EINTR checking. Note you should never negate this! That + very likely doesn't mean what you want if EINTR is not available. */ +#ifdef EINTR +# define EINTR_SET (errno == EINTR) +#else +# define EINTR_SET (0) +#endif + #ifndef isblank # define isblank(c) ((c) == ' ' || (c) == '\t') #endif diff --git a/remake.c b/remake.c index 6464f345..74c4a9ac 100644 --- a/remake.c +++ b/remake.c @@ -918,13 +918,10 @@ touch_file (file) char buf; int status; -#ifdef EINTR do -#endif status = fstat (fd, &statbuf); -#ifdef EINTR - while (status < 0 && errno == EINTR); -#endif + while (status < 0 && EINTR_SET); + if (status < 0) TOUCH_ERROR ("touch: fstat: "); /* Rewrite character 0 same as it already is. */