From mboxrd@z Thu Jan 1 00:00:00 1970 X-Msuck: nntp://news.gmane.io/gmane.comp.sysutils.supervision.general/2024 Path: news.gmane.org!not-for-mail From: Charlie Brady Newsgroups: gmane.comp.sysutils.supervision.general Subject: runsv failing when starting up logger - missing pipe - failure of logpipe init? Date: Mon, 16 Aug 2010 16:32:09 -0400 (EDT) Message-ID: NNTP-Posting-Host: lo.gmane.org Mime-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII X-Trace: dough.gmane.org 1281990737 9799 80.91.229.12 (16 Aug 2010 20:32:17 GMT) X-Complaints-To: usenet@dough.gmane.org NNTP-Posting-Date: Mon, 16 Aug 2010 20:32:17 +0000 (UTC) To: supervision@list.skarnet.org Original-X-From: supervision-return-2259-gcsg-supervision=m.gmane.org@list.skarnet.org Mon Aug 16 22:32:14 2010 Return-path: Envelope-to: gcsg-supervision@lo.gmane.org Original-Received: from antah.skarnet.org ([212.85.147.14]) by lo.gmane.org with smtp (Exim 4.69) (envelope-from ) id 1Ol6Ls-0001I9-HQ for gcsg-supervision@lo.gmane.org; Mon, 16 Aug 2010 22:32:12 +0200 Original-Received: (qmail 16945 invoked by uid 76); 16 Aug 2010 20:34:15 -0000 Mailing-List: contact supervision-help@list.skarnet.org; run by ezmlm List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Archive: Original-Received: (qmail 16934 invoked from network); 16 Aug 2010 20:34:14 -0000 X-X-Sender: charlieb@e-smith.charlieb.ott.istop.com Xref: news.gmane.org gmane.comp.sysutils.supervision.general:2024 Archived-At: I'm running runit 1.7.2, and have a sytem showing this anomolous behaviour (one service is failing to start). Here is the bad service: [root@mbgvm1 ~]# sv status /service/tug-metrics/ down: /service/tug-metrics/: 1316s, normally up; down: log: 0s, normally up, want up [root@mbgvm1 ~]# strace of the runsv process shows that the child is dying immediately because fd 5 is not valid: 2005 fork() = 8121 2005 gettimeofday({1281988874, 32865}, NULL) = 0 2005 open("supervise/pid.new", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, 0644) = 5 2005 write(5, "8121\n", 5) = 5 2005 close(5) = 0 2005 rename("supervise/pid.new", "log/supervise/pid") = 0 2005 open("supervise/stat.new", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, 0644) = 5 2005 write(5, "run\n", 4) = 4 2005 close(5) = 0 2005 rename("supervise/stat.new", "log/supervise/stat") = 0 2005 open("supervise/status.new", O_WRONLY|O_CREAT|O_TRUNC|O_NONBLOCK, 0644) = 5 2005 write(5, "@\0\0\0Li\231\24\1\365|\334\271\37\0\0\0u\0\1", 20) = 20 2005 close(5) = 0 2005 rename("supervise/status.new", "log/supervise/status") = 0 2005 gettimeofday({1281988874, 33467}, NULL) = 0 2005 rt_sigprocmask(SIG_UNBLOCK, [TERM], NULL, 8) = 0 2005 rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0 2005 poll([{fd=3, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}], 3, 1000020 8121 fcntl(5, F_GETFL) = -1 EBADF (Bad file descriptor) 8121 write(2, "runsv tug-metrics: fatal: unable to setup filedescriptor for ./log/run: file descriptor not open\n", 97) = 97 8121 _exit(111) = ? 2005 <... poll resumed> ) = -1 EINTR (Interrupted system call) We can see there is no fd 5: [root@mbgvm1 ~]# ls -l /proc/2005/fd/* lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/0 -> /dev/null lrwx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/1 -> /dev/console l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/10 -> /var/service/tug-metrics/supervise/control lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/11 -> /var/service/tug-metrics/log/supervise/control l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/12 -> /var/service/tug-metrics/log/supervise/control lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/13 -> /var/service/tug-metrics/supervise/ok lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/14 -> /var/service/tug-metrics/log/supervise/ok l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/2 -> pipe:[6121] lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/3 -> pipe:[6363] l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/4 -> pipe:[6363] l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/7 -> /var/service/tug-metrics/supervise/lock l-wx------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/8 -> /var/service/tug-metrics/log/supervise/lock lr-x------ 1 root root 64 Aug 16 11:37 /proc/2005/fd/9 -> /var/service/tug-metrics/supervise/control [root@mbgvm1 ~]# If we look at a well behaved runsv, we see that fd 5 and fd 6 is a second pipe: [root@mbgvm1 ~]# ls -l /proc/2023/fd/* lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/0 -> /dev/null lrwx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/1 -> /dev/console l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/10 -> /var/service/tinydns/supervise/control lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/11 -> /var/service/tinydns/log/supervise/control l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/12 -> /var/service/tinydns/log/supervise/control lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/13 -> /var/service/tinydns/supervise/ok lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/14 -> /var/service/tinydns/log/supervise/ok l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/2 -> pipe:[6121] lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/3 -> pipe:[6421] l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/4 -> pipe:[6421] lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/5 -> pipe:[6425] l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/6 -> pipe:[6425] l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/7 -> /var/service/tinydns/supervise/lock l-wx------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/8 -> /var/service/tinydns/log/supervise/lock lr-x------ 1 root root 64 Aug 16 11:37 /proc/2023/fd/9 -> /var/service/tinydns/supervise/control [root@mbgvm1 ~]# Here's the code in runsv where the child is dying: ... if (p == 0) { /* child */ if (haslog) { if (s->islog) { if (fd_copy(0, logpipe[0]) == -1) fatal("unable to setup filedescriptor for ./log/run"); ... and here is where logpipe[0] should have been initialised: ... if (pipe(logpipe) == -1) fatal("unable to create log pipe"); ... Has anyone else seen this error condition or can posit a situation where it might be seen?