From 6909bf9759e37d21e4016ffebef1e453a3abfc07 Mon Sep 17 00:00:00 2001 From: Michael Stapelberg Date: Tue, 17 Jan 2017 22:27:16 +0100 Subject: [PATCH] Implement -u (UNIX socket batch processing) The option argument identifies the file descriptor number of a UNIX listening socket which was inherited from the parent process when said parent exec'd mandoc. mandoc will accept one connection at a time on that socket. As long as the connection lives, mandoc reads 1-byte messages with ancillary data (control information) from the connection. The ancillary data is used to pass three file descriptors from the parent process to mandoc. These file descriptors are then dup2'ed to fd 0, fd 1 and fd 2, respectively. After the file descriptors have been set up, a single manpage is read from stdin and processed to stdout/stderr. This change effectively adds a batch processing mode to mandoc, which eliminates the fork+exec overhead that otherwise accompanies the conversion of a large number of manpages. The resulting speed-up has been observed to exceed a factor of 6, reducing the total conversion wall-clock time of about a 470k manpages from more than 2 hours to a mere 22 minutes. --- main.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- read.c | 1 + 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index b64b3be..1225ace 100644 --- a/main.c +++ b/main.c @@ -41,6 +41,10 @@ #include #include +#include +#include +#include + #include "mandoc_aux.h" #include "mandoc.h" #include "roff.h" @@ -109,6 +113,53 @@ static char *help_argv[] = {help_arg, NULL}; static enum mandoclevel rc; +#define NUM_FDS 3 +static int read_fds(int clientfd, int *fds) { + struct msghdr msg; + unsigned char dummy[1]; + struct iovec iov[1]; + ssize_t n; + struct cmsghdr *cmsg; + int *walk; + int cnt; + /* Union used for alignment. */ + union { + uint8_t controlbuf[CMSG_SPACE(NUM_FDS * sizeof(int))]; + struct cmsghdr align; + } u; + + memset(&msg, '\0', sizeof(msg)); + msg.msg_control = u.controlbuf; + msg.msg_controllen = sizeof(u.controlbuf); + /* Read a dummy byte — sendmsg cannot send an empty message, + * even if we are only interested in the OOB data. */ + iov[0].iov_base = dummy; + iov[0].iov_len = sizeof(dummy); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + + if ((n = recvmsg(clientfd, &msg, 0)) < 0) + return n; + + if ((cmsg = CMSG_FIRSTHDR(&msg)) == NULL) + return -1; + + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return -1; + + if (cmsg->cmsg_len != CMSG_LEN(NUM_FDS * sizeof(int))) + return -1; + + walk = (int*)CMSG_DATA(cmsg); + for (cnt = 0; cnt < NUM_FDS; cnt++) { + fds[cnt] = *walk; + walk++; + } + return NUM_FDS; +} + + int main(int argc, char *argv[]) { @@ -133,6 +184,16 @@ main(int argc, char *argv[]) int status, signum; int c; pid_t pager_pid, tc_pgid, man_pgid, pid; + int sockfd; + const char *errstr; + struct sockaddr_un peer; + socklen_t len; + int clientfd; + int old_stdin; + int old_stdout; + int old_stderr; + int fds[3]; + int n; #if HAVE_PROGNAME progname = getprogname(); @@ -194,8 +255,11 @@ main(int argc, char *argv[]) show_usage = 0; outmode = OUTMODE_DEF; + sockfd = -1; + errstr = NULL; + while (-1 != (c = getopt(argc, argv, - "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) { + "aC:cfhI:iK:klM:m:O:S:s:T:u:VW:w"))) { switch (c) { case 'a': outmode = OUTMODE_ALL; @@ -261,6 +325,13 @@ main(int argc, char *argv[]) if ( ! toptions(&curp, optarg)) return (int)MANDOCLEVEL_BADARG; break; + case 'u': + sockfd = strtonum(optarg, 0, INT_MAX, &errstr); + if (errstr) { + warnx("-u %s: %s", optarg, errstr); + return (int)MANDOCLEVEL_BADARG; + } + break; case 'W': if ( ! woptions(&curp, optarg)) return (int)MANDOCLEVEL_BADARG; @@ -436,6 +507,47 @@ main(int argc, char *argv[]) mchars_alloc(); curp.mp = mparse_alloc(options, curp.wlevel, mmsg, defos); + /* If -u was specified, accept UNIX socket connections until killed. */ + while (sockfd > 0) { + len = sizeof(struct sockaddr_un); + clientfd = accept(sockfd, (struct sockaddr *)&peer, &len); + if (clientfd < 0) + err((int)MANDOCLEVEL_SYSERR, "accept"); + + /* We always swap file descriptors so that we can always + * unconditionally use their file descriptor numbers. */ + fflush(stdout); + fflush(stderr); + old_stdin = dup(STDIN_FILENO); + old_stdout = dup(STDOUT_FILENO); + old_stderr = dup(STDERR_FILENO); + + while (1) { + if ((n = read_fds(clientfd, fds)) < 0) + break; + + if (dup2(fds[0], STDIN_FILENO) == -1 || + dup2(fds[1], STDOUT_FILENO) == -1 || + dup2(fds[2], STDERR_FILENO) == -1) + err((int)MANDOCLEVEL_SYSERR, "dup2"); + + close(fds[0]); + close(fds[1]); + close(fds[2]); + + parse(&curp, STDIN_FILENO, ""); + mparse_reset(curp.mp); + fflush(stdout); + fflush(stderr); + /* Close file descriptors by restoring the old ones. */ + dup2(old_stdin, STDIN_FILENO); + dup2(old_stdout, STDOUT_FILENO); + dup2(old_stderr, STDERR_FILENO); + } + + close(clientfd); + } + /* * Conditionally start up the lookaside buffer before parsing. */ diff --git a/read.c b/read.c index d20a609..d96eb5c 100644 --- a/read.c +++ b/read.c @@ -836,6 +836,7 @@ mparse_reset(struct mparse *curp) free(curp->sodest); curp->sodest = NULL; + curp->gzip = 0; } void -- 2.11.0