From: Askar Safin <safinaskar@zohomail.com>
To: dalias@libc.org
Cc: musl@lists.openwall.com, fw@deneb.enyo.de
Subject: Re: [musl] [bug] Ctrl-Z when process is doing posix_spawn makes the process hard to kill
Date: Thu, 23 Jan 2025 00:45:34 +0300
Message-ID: <20250122214534.2826650-1-safinaskar@zohomail.com>
In-Reply-To: <20250118111702.GM10433@brightrain.aerifal.cx>

 ---- On Sat, 18 Jan 2025 15:17:02 +0400  Rich Felker  wrote --- 
 > I don't understand what you think the kernel bug is.

Recently, on io-uring@vger.kernel.org, I got a suggestion to use CLONE_VM
( https://lore.kernel.org/io-uring/9ee30fc7-0329-4a69-b686-3131ce323c97@gmail.com/ ).

So I tried CLONE_VM, and it worked! I.e., this Ctrl-Z bug was not reproduced.

I also compared various spawning methods. My testing shows that all methods
based on vfork or CLONE_VFORK are buggy, as is posix_spawn (which, as far as
I understand, is based on vfork, too), and all the others are not.

For each method I noted in a comment whether the bug is reproducible on glibc
and on musl. In summary (compiled from those comments):
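
    method                    glibc      musl
    ------                    -----      ----
    posix_spawn               repro      repro
    fork                      no repro   no repro
    vfork                     repro      repro
    clone with CLONE_VM       no repro   no repro
    clone with CLONE_VFORK    repro      repro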

At the end of this letter you will find the full source.

So it may be a good idea to replace vfork with CLONE_VM in musl and glibc.

Also, my CLONE_VM-based implementation is essentially a reimplementation of
vfork in userspace, and it works. In other words, the actual kernel
implementation of vfork doesn't work, while its userspace emulation does.
This is a strong argument for the view that vfork is buggy in the kernel.

--
Askar Safin
https://types.pl/@safinaskar

Source:

#define _GNU_SOURCE

#include <spawn.h>
#include <err.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sched.h>
#include <sys/wait.h>

char *args[] = {"/bin/true", NULL};
char *env[] = {"HOME=/", NULL};

// repro glibc
// repro musl
pid_t
spawn_via_posix_spawn (void)
{
    pid_t pid;
    if (posix_spawn (&pid, "/bin/true", NULL, NULL, args, env) != 0)
        {
            errx (1, "posix_spawn");
        }
    return pid;
}

// not repro glibc
// not repro musl
pid_t
spawn_via_fork (void)
{
    pid_t pid = fork ();
    if (pid == -1)
        {
            err (1, "fork");
        }
    if (pid == 0)
        {
            execve ("/bin/true", args, env);
            err (1, "execve");
        }
    return pid;
}

// repro glibc
// repro musl
pid_t
spawn_via_vfork (void)
{
    pid_t pid = vfork ();
    if (pid == -1)
        {
            err (1, "vfork");
        }
    if (pid == 0)
        {
            execve ("/bin/true", args, env);
            err (1, "execve");
        }
    return pid;
}

/* Okay, so below we will emulate vfork using CLONE_VM. We will do so using an O_CLOEXEC pipe.
 * We rely heavily on one important property: during execve, Linux first destroys the old memory
 * and only then closes all O_CLOEXEC fds. This is actually true, as we can see in the Linux source:
 * https://elixir.bootlin.com/linux/v6.13-rc3/source/fs/exec.c#L1274
 * https://elixir.bootlin.com/linux/v6.13-rc3/source/fs/exec.c#L1312
 * As you can see, do_close_on_exec is called after exec_mmap.
 */

int pipe_fd[2];

int
helper (void *a)
{
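    // We run here as the CLONE_VM child: memory is shared with the parent,
    // so we must not touch libc state (stdio, malloc, errno); raw syscalls only.
    // Close our copy of the pipe's read end; our O_CLOEXEC write end will be
    // closed by the kernel on successful execve, which is what the parent waits for.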
    if (syscall (SYS_close, pipe_fd[0]) != 0)
        {
            syscall (SYS_write, 2, "clo", 3);
            syscall (SYS_exit_group, 1);
        }

    syscall (SYS_execve, "/bin/true", args, env);
    syscall (SYS_write, 2, "exe", 3);
    syscall (SYS_exit_group, 1);
    return 0; // not reached: exit_group does not return
}

// not repro glibc
// not repro musl
pid_t
spawn_via_clone_vm (void)
{
    if (pipe2 (pipe_fd, O_CLOEXEC) == -1)
        {
            err (1, "pipe2");
        }

    // Beginning of code copied from "man 2 clone"

#define STACK_SIZE (1024 * 1024)    /* Stack size for cloned child */
    char            *stack;         /* Start of stack buffer */
    char            *stackTop;      /* End of stack buffer */

    /* Allocate memory to be used for the stack of the child. */
    stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED)
        {
            err (1, "mmap");
        }

    stackTop = stack + STACK_SIZE;  /* Assume stack grows downward */

    pid_t pid = clone (helper, stackTop, CLONE_VM | SIGCHLD, NULL);

    if (pid == -1)
        {
            err (1, "clone");
        }

    // End of code copied from "man 2 clone"

    // Okay, so now we should wait for the execve. We do this using the pipe.
    // We use raw "syscall" to avoid messing with libc's state.
    // We cannot even rely on errno, because it is probably shared with the child now.

    if (syscall (SYS_close, pipe_fd[1]) != 0)
        {
            syscall (SYS_write, 2, "clo", 3);
            syscall (SYS_exit_group, 1);
        }

    char buf[1];
    if (syscall (SYS_read, pipe_fd[0], buf, 1) != 0)
        {
            syscall (SYS_write, 2, "rea", 3);
            syscall (SYS_exit_group, 1);
        }

    // Okay, so the child has done its execve; now we can continue running normally.

    if (close (pipe_fd[0]) != 0)
        {
            err (1, "close");
        }

    return pid;
}

int
helper_clone_vfork (void *a)
{
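    // Child entry point for the CLONE_VFORK variant: the parent stays
    // suspended in clone until we execve or exit.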
    execve ("/bin/true", args, env);
    err (1, "execve");
}

// repro glibc
// repro musl
pid_t
spawn_via_clone_vfork (void)
{
    // Beginning of code copied from "man 2 clone"

#define STACK_SIZE (1024 * 1024)    /* Stack size for cloned child */
    char            *stack;         /* Start of stack buffer */
    char            *stackTop;      /* End of stack buffer */

    /* Allocate memory to be used for the stack of the child. */
    stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    if (stack == MAP_FAILED)
        {
            err (1, "mmap");
        }

    stackTop = stack + STACK_SIZE;  /* Assume stack grows downward */

    pid_t pid = clone (helper_clone_vfork, stackTop, CLONE_VFORK | SIGCHLD, NULL);

    if (pid == -1)
        {
            err (1, "clone");
        }

    // End of code copied from "man 2 clone"

    return pid;
}

int
main (void)
{
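    // Spawn and reap children in a tight loop, so that a Ctrl-Z pressed in
    // the terminal is likely to arrive while a spawn is in progress.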
    for (;;)
        {
            pid_t pid = spawn_via_clone_vfork (); // You can replace this line with some other "spawn_via_..." function
            if (waitpid (pid, NULL, 0) != pid)
                {
                    err(1, "waitpid");
                }
        }
}
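
To reproduce: compile the program (for example, with "cc repro.c -o repro",
where repro.c is whatever name you save the source under), run it in a
terminal, press Ctrl-Z while it is looping, and then try to kill the process.
With the vfork-based variants the process becomes hard to kill; with the
others it does not.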
