[PATCH] split __libc_start_main.c into two files (Wasm)

mailing list of musl libc
 help / color / mirror / code / Atom feed

* [PATCH] split __libc_start_main.c into two files (Wasm)
@ 2017-12-07 14:51 Nicholas Wilson
  2017-12-07 17:03 ` Rich Felker
  0 siblings, 1 reply; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-07 14:51 UTC (permalink / raw)
  To: musl

Hi,

I've got some more patches to try and get in for Wasm support.

Here's one that should be fairly straightforward? In Musl, most (non-static) functions are in a file of their own, which is good from a linkage point of view.

I'd like to have __libc_start_main.c split into two files, because for Wasm I'd like to be able to call __libc_start_init (from within the CRT directory) but without having to link in exit(), since many Wasm applications will never call exit() and won't necessarily use main.

There's no pollution of the codebase, it's purely splitting a file.

Patch below.

Nick


========================
commit 3096e0e5e17ff5ee14288574503ad22d8e3d119c
Author: Nicholas Wilson <nicholas.wilson@realvnc.com>
Date:   Thu Dec 7 11:52:32 2017 +0000

    [Wasm] split __libc_start_main.c into two files, for archs that want __libc_start_init but might potentially not link in exit()

diff --git a/src/env/__libc_start_init.c b/src/env/__libc_start_init.c
new file mode 100644
index 00000000..ec05e1df
--- /dev/null
+++ b/src/env/__libc_start_init.c
@@ -0,0 +1,64 @@
+#include <elf.h>
+#include <poll.h>
+#include <fcntl.h>
+#include <signal.h>
+#include "syscall.h"
+#include "atomic.h"
+#include "libc.h"
+
+void __init_tls(size_t *);
+
+static void dummy(void) {}
+weak_alias(dummy, _init);
+
+__attribute__((__weak__, __visibility__("hidden")))
+extern void (*const __init_array_start)(void), (*const __init_array_end)(void);
+
+static void dummy1(void *p) {}
+weak_alias(dummy1, __init_ssp);
+
+#define AUX_CNT 38
+
+void __init_libc(char **envp, char *pn)
+{
+	size_t i, *auxv, aux[AUX_CNT] = { 0 };
+	__environ = envp;
+	for (i=0; envp[i]; i++);
+	libc.auxv = auxv = (void *)(envp+i+1);
+	for (i=0; auxv[i]; i+=2) if (auxv[i]<AUX_CNT) aux[auxv[i]] = auxv[i+1];
+	__hwcap = aux[AT_HWCAP];
+	__sysinfo = aux[AT_SYSINFO];
+	libc.page_size = aux[AT_PAGESZ];
+
+	if (!pn) pn = (void*)aux[AT_EXECFN];
+	if (!pn) pn = "";
+	__progname = __progname_full = pn;
+	for (i=0; pn[i]; i++) if (pn[i]=='/') __progname = pn+i+1;
+
+	__init_tls(aux);
+	__init_ssp((void *)aux[AT_RANDOM]);
+
+	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
+		&& !aux[AT_SECURE]) return;
+
+	struct pollfd pfd[3] = { {.fd=0}, {.fd=1}, {.fd=2} };
+#ifdef SYS_poll
+	__syscall(SYS_poll, pfd, 3, 0);
+#else
+	__syscall(SYS_ppoll, pfd, 3, &(struct timespec){0}, 0, _NSIG/8);
+#endif
+	for (i=0; i<3; i++) if (pfd[i].revents&POLLNVAL)
+		if (__sys_open("/dev/null", O_RDWR)<0)
+			a_crash();
+	libc.secure = 1;
+}
+
+static void libc_start_init(void)
+{
+	_init();
+	uintptr_t a = (uintptr_t)&__init_array_start;
+	for (; a<(uintptr_t)&__init_array_end; a+=sizeof(void(*)()))
+		(*(void (**)(void))a)();
+}
+
+weak_alias(libc_start_init, __libc_start_init);
diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index 2d758af7..8236749e 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -1,67 +1,7 @@
-#include <elf.h>
-#include <poll.h>
-#include <fcntl.h>
-#include <signal.h>
-#include "syscall.h"
-#include "atomic.h"
-#include "libc.h"
+#include <stdlib.h>
 
-void __init_tls(size_t *);
-
-static void dummy(void) {}
-weak_alias(dummy, _init);
-
-__attribute__((__weak__, __visibility__("hidden")))
-extern void (*const __init_array_start)(void), (*const __init_array_end)(void);
-
-static void dummy1(void *p) {}
-weak_alias(dummy1, __init_ssp);
-
-#define AUX_CNT 38
-
-void __init_libc(char **envp, char *pn)
-{
-	size_t i, *auxv, aux[AUX_CNT] = { 0 };
-	__environ = envp;
-	for (i=0; envp[i]; i++);
-	libc.auxv = auxv = (void *)(envp+i+1);
-	for (i=0; auxv[i]; i+=2) if (auxv[i]<AUX_CNT) aux[auxv[i]] = auxv[i+1];
-	__hwcap = aux[AT_HWCAP];
-	__sysinfo = aux[AT_SYSINFO];
-	libc.page_size = aux[AT_PAGESZ];
-
-	if (!pn) pn = (void*)aux[AT_EXECFN];
-	if (!pn) pn = "";
-	__progname = __progname_full = pn;
-	for (i=0; pn[i]; i++) if (pn[i]=='/') __progname = pn+i+1;
-
-	__init_tls(aux);
-	__init_ssp((void *)aux[AT_RANDOM]);
-
-	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
-		&& !aux[AT_SECURE]) return;
-
-	struct pollfd pfd[3] = { {.fd=0}, {.fd=1}, {.fd=2} };
-#ifdef SYS_poll
-	__syscall(SYS_poll, pfd, 3, 0);
-#else
-	__syscall(SYS_ppoll, pfd, 3, &(struct timespec){0}, 0, _NSIG/8);
-#endif
-	for (i=0; i<3; i++) if (pfd[i].revents&POLLNVAL)
-		if (__sys_open("/dev/null", O_RDWR)<0)
-			a_crash();
-	libc.secure = 1;
-}
-
-static void libc_start_init(void)
-{
-	_init();
-	uintptr_t a = (uintptr_t)&__init_array_start;
-	for (; a<(uintptr_t)&__init_array_end; a+=sizeof(void(*)()))
-		(*(void (**)(void))a)();
-}
-
-weak_alias(libc_start_init, __libc_start_init);
+extern void __libc_start_init(void);
+extern void __init_libc(char **envp, char *pn);
 
 int __libc_start_main(int (*main)(int,char **,char **), int argc, char **argv)
 {


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-07 14:51 [PATCH] split __libc_start_main.c into two files (Wasm) Nicholas Wilson
@ 2017-12-07 17:03 ` Rich Felker
  2017-12-15  4:19   ` Rich Felker
  0 siblings, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-07 17:03 UTC (permalink / raw)
  To: musl

On Thu, Dec 07, 2017 at 02:51:31PM +0000, Nicholas Wilson wrote:
> Hi,
> 
> I've got some more patches to try and get in for Wasm support.
> 
> Here's one that should be fairly straightforward? In Musl, most
> (non-static) functions are in a file of their own, which is good
> from a linkage point of view.
> 
> I'd like to have __libc_start_main.c split into two files, because
> for Wasm I'd like to be able to call __libc_start_init (from within
> the CRT directory) but without having to link in exit(), since many
> Wasm applications will never call exit() and won't necessarily use
> main.
> 
> There's no pollution of the codebase, it's purely splitting a file.
> 
> Patch below.

__libc_start_init is intentionally not a public interface but part of
musl internals. There is no reason to assume it will continue to exist
with the same name or interface in future versions of musl. The public
interface for the entry point is __libc_start_main.

exit() is literally 9 instructions on x86_64, and likely comparably
small elsewhere. I don't see how trying to optimize it out makes
sense. The bulk of the code that runs at exit() when there's
nontrivial work to do at exit time is linked through dependencies from
other sources like stdio and atexit, and would be linked even if you
succeeded in optimizing exit out.

Rich

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-07 17:03 ` Rich Felker
@ 2017-12-15  4:19   ` Rich Felker
  2017-12-15 11:34     ` Nicholas Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-15  4:19 UTC (permalink / raw)
  To: musl

On Thu, Dec 07, 2017 at 12:03:56PM -0500, Rich Felker wrote:
> On Thu, Dec 07, 2017 at 02:51:31PM +0000, Nicholas Wilson wrote:
> > Hi,
> > 
> > I've got some more patches to try and get in for Wasm support.
> > 
> > Here's one that should be fairly straightforward? In Musl, most
> > (non-static) functions are in a file of their own, which is good
> > from a linkage point of view.
> > 
> > I'd like to have __libc_start_main.c split into two files, because
> > for Wasm I'd like to be able to call __libc_start_init (from within
> > the CRT directory) but without having to link in exit(), since many
> > Wasm applications will never call exit() and won't necessarily use
> > main.
> > 
> > There's no pollution of the codebase, it's purely splitting a file.
> > 
> > Patch below.
> 
> __libc_start_init is intentionally not a public interface but part of
> musl internals. There is no reason to assume it will continue to exist
> with the same name or interface in future versions of musl. The public
> interface for the entry point is __libc_start_main.
> 
> exit() is literally 9 instructions on x86_64, and likely comparably
> small elsewhere. I don't see how trying to optimize it out makes
> sense. The bulk of the code that runs at exit() when there's
> nontrivial work to do at exit time is linked through dependencies from
> other sources like stdio and atexit, and would be linked even if you
> succeeded in optimizing exit out.

Another bug I overlooked here was that, by moving the code to a new
file, it would no longer be affected by $(NOSSP_SRCS) in Makefile,
thereby breaking builds with -fstack-protector or where the compiler
has it on by default. That could have been fixed if needed; I just
bring it up to show that there are subtle possibilities for breakage
like this that we should really either document rig up some sort of
static assertion to catch if there's a regression.

Rich


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15  4:19   ` Rich Felker
@ 2017-12-15 11:34     ` Nicholas Wilson
  2017-12-15 12:33       ` Szabolcs Nagy
  0 siblings, 1 reply; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-15 11:34 UTC (permalink / raw)
  To: musl

Hi Rich,

I've only just noticed your replies - sorry! (Some went in my spam, oops, maybe because of "dalias@aerifal.cx on behalf of dalias@libc.org".)

Thanks for the feedback, on this and the other patches.

On Thu, Dec 07, 2017 at 12:03:56PM -0500, Rich Felker wrote:
> __libc_start_init is intentionally not a public interface but part of
> musl internals. There is no reason to assume it will continue to exist
> with the same name or interface in future versions of musl. The public
> interface for the entry point is __libc_start_main.

That's right - it's a Musl internal. What I was planning to do though was to call it from within Musl, in the arch/wasm code. When I said it's a "public" symbol I meant "non-static/non-local" ie a symbol exposed for use within Musl.

> exit() is literally 9 instructions on x86_64, and likely comparably
> small elsewhere. I don't see how trying to optimize it out makes
> sense. The bulk of the code that runs at exit() when there's
> nontrivial work to do at exit time is linked through dependencies from
> other sources like stdio and atexit, and would be linked even if you
> succeeded in optimizing exit out.

To clarify, it's not exit() itself that's a problem. Remember we're using statically-linked syscalls, so linking in exit() introduces a dependency on SYS_exit_group/SYS_exit, which Wasm pulls in as external dependencies in the host emulation environment. It would be nice to avoid linking in syscalls that aren't actually used, especially ones like SYS_exit that are a bit ugly to emulate.

On Fri, Dec 15, 2017 at 04:19, Rich Felker wrote:
> Another bug I overlooked here was that, by moving the code to a new
> file, it would no longer be affected by $(NOSSP_SRCS) in Makefile,
> thereby breaking builds with -fstack-protector or where the compiler
> has it on by default. That could have been fixed if needed; I just
> bring it up to show that there are subtle possibilities for breakage
> like this that we should really either document rig up some sort of
> static assertion to catch if there's a regression.

Good point, I noticed that as well when splitting the file; I should have mentioned it. I misunderstood how the stack-protection stuff worked, and I looked at __init_libc and __libc_start_init and thought "these functions have normal/valid stack usage, no need to turn off the protection for these". But now I realise that it has to be turned off for all functions before __init_ssp.

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 11:34     ` Nicholas Wilson
@ 2017-12-15 12:33       ` Szabolcs Nagy
  2017-12-15 13:04         ` Nicholas Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Szabolcs Nagy @ 2017-12-15 12:33 UTC (permalink / raw)
  To: musl

* Nicholas Wilson <nicholas.wilson@realvnc.com> [2017-12-15 11:34:10 +0000]:
> 
> To clarify, it's not exit() itself that's a problem. Remember we're using statically-linked syscalls, so linking in exit() introduces a dependency on SYS_exit_group/SYS_exit, which Wasm pulls in as external dependencies in the host emulation environment. It would be nice to avoid linking in syscalls that aren't actually used, especially ones like SYS_exit that are a bit ugly to emulate.
> 

why is exit ugly?
halting execution should not be complicated on whatever platform

how do you implement a_crash?
the c runtime uses that all over the place anyway so it's unlikely
that you can get away without process termination code.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 12:33       ` Szabolcs Nagy
@ 2017-12-15 13:04         ` Nicholas Wilson
  2017-12-15 17:23           ` Rich Felker
  0 siblings, 1 reply; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-15 13:04 UTC (permalink / raw)
  To: musl

Hi,

On 15 December 2017 12:33, Szabolcs Nagy wrote:
> why is exit ugly?
> halting execution should not be complicated on whatever platform

In WebAssembly, the runtime environment is (typically) a webpage. It's not clear what exit() would be expected to do to the webpage or Wasm module. The Wasm interpreter offered by browsers doesn't offer a way to cleanly exit/terminate a running WebAssembly function; the "trap" instruction to abort is the closest you could get.

If you have some existing C code that you want to compile/port to WebAssembly, whatever the code previously expects to happen on exit() won't happen. It will need to be changed when ported to the web, the semantics of a Unix process and a web page are unavoidably different. For brand-new code specifically targeting a web page, you wouldn't deliberately add a dependency on exit(), because it's not clearly meaningful for a webpage.

So, it's "ugly" to me because the lifecycle of the webpage is an awkward fit for exit(). We could bodge in some emulation, but it's just ugly to pull in the emulation when we don't expect it to be used. My preference is "you should only link in the hacks you rely on".

It's not too bad - we will have to implement exit() emulation for legacy code, so I could live with it being linked in to every application. I just dislike hacks, and don't enjoy seeing them bundled into my own applications!

> how do you implement a_crash?
> the c runtime uses that all over the place anyway so it's unlikely
> that you can get away without process termination code.

a_crash will invoke the Wasm "trap" instruction, which throws an exception up to the WebAssembly interpreter. It will have proper "noreturn" semantics, but it's then up to the interpreter to do whatever it feels is most helpful to the developer - probably print a message.

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 13:04         ` Nicholas Wilson
@ 2017-12-15 17:23           ` Rich Felker
  2017-12-15 17:43             ` Nicholas Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-15 17:23 UTC (permalink / raw)
  To: musl

On Fri, Dec 15, 2017 at 01:04:14PM +0000, Nicholas Wilson wrote:
> Hi,
> 
> On 15 December 2017 12:33, Szabolcs Nagy wrote:
> > why is exit ugly?
> > halting execution should not be complicated on whatever platform
> 
> In WebAssembly, the runtime environment is (typically) a webpage.
> It's not clear what exit() would be expected to do to the webpage or
> Wasm module. The Wasm interpreter offered by browsers doesn't offer
> a way to cleanly exit/terminate a running WebAssembly function; the
> "trap" instruction to abort is the closest you could get.
> 
> If you have some existing C code that you want to compile/port to
> WebAssembly, whatever the code previously expects to happen on
> exit() won't happen. It will need to be changed when ported to the
> web, the semantics of a Unix process and a web page are unavoidably
> different. For brand-new code specifically targeting a web page, you
> wouldn't deliberately add a dependency on exit(), because it's not
> clearly meaningful for a webpage.
> 
> So, it's "ugly" to me because the lifecycle of the webpage is an
> awkward fit for exit(). We could bodge in some emulation, but it's
> just ugly to pull in the emulation when we don't expect it to be
> used. My preference is "you should only link in the hacks you rely
> on".
> 
> It's not too bad - we will have to implement exit() emulation for
> legacy code, so I could live with it being linked in to every
> application. I just dislike hacks, and don't enjoy seeing them
> bundled into my own applications!

I don't see this as ugly at all -- the obvious behavior for exit is
for it to just go into a "halted" state. This is what you generally
expect on embedded C implementations with no OS, or if pid 1 exits on
a unix-like system. If you don't yet support thread creation, all it
has to do is something like for(;;) wait_for_something(); or similar.
Once you have threads, you may need some nontrivial work to ensure
that all threads enter a halted state, or it might just be as simple
as stopping the interpreter. None of this should involve significant
size/cost.

Rich


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 17:23           ` Rich Felker
@ 2017-12-15 17:43             ` Nicholas Wilson
  2017-12-15 17:56               ` Rich Felker
  0 siblings, 1 reply; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-15 17:43 UTC (permalink / raw)
  To: musl

On 15 December 2017 17:23, Rich Felker wrote:
> I don't see this as ugly at all -- the obvious behavior for exit is
> for it to just go into a "halted" state.

There is no builtin lifecycle for that in web browsers, but we can emulate it.

> This is what you generally
> expect on embedded C implementations with no OS, or if pid 1 exits on
> a unix-like system. If you don't yet support thread creation, all it
> has to do is something like for(;;) wait_for_something(); or similar.
> Once you have threads, you may need some nontrivial work to ensure
> that all threads enter a halted state, or it might just be as simple
> as stopping the interpreter.

We certainly can't do an infinite loop (who wants tabs in their browser to lock their CPU at 100%?) - and there's no way to wait on anything in Wasm since it's single-threaded.

A trap is the only way to report to the interpreter that it should stop execution. It's workable, although awkward. (JavaScript can also throw exceptions, but Wasm code will soon be able to catch those, so a trap is I believe the only uncatchable exceptional condition.)

It will look something like defining "__syscall_exit() { __builtin_trap(); }" for Wasm.

If you really don't want to split the translation unit, I can live with it, although it doesn't seem like a big request, since it's not changing the code at all, just giving independent linkage to two different functions.

I'll revert the change on my branch. Thanks for discussing it.

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 17:43             ` Nicholas Wilson
@ 2017-12-15 17:56               ` Rich Felker
  2017-12-16 13:21                 ` Nicholas Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-15 17:56 UTC (permalink / raw)
  To: musl

On Fri, Dec 15, 2017 at 05:43:33PM +0000, Nicholas Wilson wrote:
> On 15 December 2017 17:23, Rich Felker wrote:
> > I don't see this as ugly at all -- the obvious behavior for exit is
> > for it to just go into a "halted" state.
> 
> There is no builtin lifecycle for that in web browsers, but we can emulate it.

There's always a trivial halted state: for(;;); Presumably there are
also more efficient ones. :-)

> > This is what you generally
> > expect on embedded C implementations with no OS, or if pid 1 exits on
> > a unix-like system. If you don't yet support thread creation, all it
> > has to do is something like for(;;) wait_for_something(); or similar.
> > Once you have threads, you may need some nontrivial work to ensure
> > that all threads enter a halted state, or it might just be as simple
> > as stopping the interpreter.
> 
> We certainly can't do an infinite loop (who wants tabs in their
> browser to lock their CPU at 100%?) - and there's no way to wait on
> anything in Wasm since it's single-threaded.

Wait doesn't require a thread. It could be waiting for data from a
source that will never have data, waiting for elapse of a certain
period of time (e.g. for(;;) sleep(1000000000);), etc.

> A trap is the only way to report to the interpreter that it should
> stop execution. It's workable, although awkward. (JavaScript can
> also throw exceptions, but Wasm code will soon be able to catch
> those, so a trap is I believe the only uncatchable exceptional
> condition.)
> 
> It will look something like defining "__syscall_exit() {
> __builtin_trap(); }" for Wasm.

Sounds ok.

> If you really don't want to split the translation unit, I can live
> with it, although it doesn't seem like a big request, since it's not
> changing the code at all, just giving independent linkage to two
> different functions.

Adding a new interface boundary/contract for a particular arch _is_ a
big request, one of the biggest types. It's a permanent added
constraint that has to be considered in future modifications to the
code. Probably the only bigger type is adding a new public (to
application) interface boundary/contract.

Rich


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-15 17:56               ` Rich Felker
@ 2017-12-16 13:21                 ` Nicholas Wilson
  2017-12-19  1:08                   ` Rich Felker
  0 siblings, 1 reply; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-16 13:21 UTC (permalink / raw)
  To: musl

On 15 December 2017 17:56, Rich Felker wrote:
> Adding a new interface boundary/contract for a particular arch _is_ a
> big request, one of the biggest types. It's a permanent added
> constraint that has to be considered in future modifications to the
> code. Probably the only bigger type is adding a new public (to
> application) interface boundary/contract.

Thanks for clarifying, I understand now. In our fork/branch, we will have to call *something* on Wasm, since initialisation is different to ELF and the default crt1.c doesn't really work for non-ELF archs.

For your information, what we'll do is call __init_libc directly from crt/wasm/crt1.c, since for the time being we need to do that to keep the prototype working. It would be nice if that were to become an accepted internal Musl interface, so that becomes "legal".

(I have though reverted the split into two files of __libc_start_main.c.)

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-16 13:21                 ` Nicholas Wilson
@ 2017-12-19  1:08                   ` Rich Felker
  2017-12-19 11:04                     ` Nicholas Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-19  1:08 UTC (permalink / raw)
  To: musl

On Sat, Dec 16, 2017 at 01:21:39PM +0000, Nicholas Wilson wrote:
> On 15 December 2017 17:56, Rich Felker wrote:
> > Adding a new interface boundary/contract for a particular arch _is_ a
> > big request, one of the biggest types. It's a permanent added
> > constraint that has to be considered in future modifications to the
> > code. Probably the only bigger type is adding a new public (to
> > application) interface boundary/contract.
> 
> Thanks for clarifying, I understand now. In our fork/branch, we will
> have to call *something* on Wasm, since initialisation is different
> to ELF and the default crt1.c doesn't really work for non-ELF archs.
> 
> For your information, what we'll do is call __init_libc directly
> from crt/wasm/crt1.c, since for the time being we need to do that to
> keep the prototype working. It would be nice if that were to become
> an accepted internal Musl interface, so that becomes "legal".
> 
> (I have though reverted the split into two files of __libc_start_main.c.)

I still don't see any good reason to call __init_libc instead of
__libc_start_main. While the crt1 entry point (_start) itself is a not
a normal C function but something specific to the ELF entry
conventions, __libc_start_main is a perfectly good C function that is
not "ELF specific". It does require a pointer to the args/environment
formatted as an array of:

	{ argc, argv[0], ..., argv[argc-1], 0, environ[0], ..., 0,
	auxv[0], ... 0 }

but __init_libc and other code in musl also requires such an array to
be present. I really don't think you're gaining anything by bypassing
__libc_start_main, but you are losing the property of interfacing with
a stable interface that will never change out from under you.

Rich


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19  1:08                   ` Rich Felker
@ 2017-12-19 11:04                     ` Nicholas Wilson
  2017-12-19 15:27                       ` Szabolcs Nagy
  2017-12-19 15:56                       ` Rich Felker
  0 siblings, 2 replies; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-19 11:04 UTC (permalink / raw)
  To: musl

On 19 December 2017 01:08, Rich Felker wrote:
> I still don't see any good reason to call __init_libc instead of
> __libc_start_main. While the crt1 entry point (_start) itself is a not
> a normal C function but something specific to the ELF entry
> conventions, __libc_start_main is a perfectly good C function that is
> not "ELF specific".

There are two ELF-specific bits to __libc_start_main:

1) The first thing is that it calls main and exit! Wasm users don't want to call exit. (This is different from the discussion we had before as to whether it's OK to *link in* exit when it's never called. This is about whether it's actually OK to call exit.) exit destroys C++ global variables, and Wasm users don't want that. We need to able to call into the Wasm module repeatedly from a JavaScript web page.

Maybe think of a Wasm (statically-linked) binary as being a bit more like a shared library: instead of exporting a single "entrypoint" for the kernel to run (like a Unix process), it exports a list of functions for the JavaScript environment to run repeatedly. We're not using the language of "shared library" though, because we're using that terminology to mean Wasm libraries calling each other, and we don't have all the tooling in place for that in web browsers yet.

2) This second point is just for your interest. It doesn't affect the rest of Musl, since Wasm can provide its own crt/wasm/crt1.c.  (See: https://github.com/NWilson/musl/blob/musl-wasm-native/crt/wasm/crt1.c)

Another difference is that __libc_start_init uses the __init_array_start/end symbols, which come from special compiler-provided crtbegin.o/crtend.o objects on ELF platforms. The linker has to arrange for a table of function pointers for Musl to call, and it does that using the magic .init_array ELF section (plus some priority fiddling).

In Wasm, most functions are not callable by pointer: a function pointer is only assigned to code that has its address taken. For security, a decision was taken *not* to use a table of function pointers for the startup code, since that would mean taking the address of those functions, and they otherwise would be unlikely to need to go in the function pointer table. It's a hardening measure, to reduce the number of functions that can be used as the target of an indirect call.

Hence, the linker synthesises the init-function as a list of call-instructions, rather than synthesising a list of function pointers for a pre-written function to iterate over.

This decision wasn't mine, but I'm sympathetic to it. We did discuss whether it would be unhelpful to diverge from ELF like this, but the consensus was that keeping the table of function pointers small was more worthwhile.

Note - I realise that we could override the ELF-y __libc_start_init since it's weak. However, given that we need to call __init_libc directly anyway, we may as well save some code and just register __init_libc with the Wasm start mechanism.

> It does require a pointer to the args/environment
> formatted as an array of:
> ...
> but __init_libc and other code in musl also requires such an array to
> be present.

Yes, I've got an array of argv/envp for keeping __init_libc happy. That's OK, that's an ELF convention that we're quite happy to follow to reduce friction with Musl. In general, where possible I don't have a problem with emulating ELF.

Thanks for your patience and for asking questions. I hope the answers help!

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 11:04                     ` Nicholas Wilson
@ 2017-12-19 15:27                       ` Szabolcs Nagy
  2017-12-19 15:56                       ` Rich Felker
  1 sibling, 0 replies; 18+ messages in thread
From: Szabolcs Nagy @ 2017-12-19 15:27 UTC (permalink / raw)
  To: musl

* Nicholas Wilson <nicholas.wilson@realvnc.com> [2017-12-19 11:04:33 +0000]:
> On 19 December 2017 01:08, Rich Felker wrote:
> > I still don't see any good reason to call __init_libc instead of
> > __libc_start_main. While the crt1 entry point (_start) itself is a not
> > a normal C function but something specific to the ELF entry
> > conventions, __libc_start_main is a perfectly good C function that is
> > not "ELF specific".
> 
> There are two ELF-specific bits to __libc_start_main:
> 
> 1) The first thing is that it calls main and exit! Wasm users don't want to call exit. (This is different from the discussion we had before as to whether it's OK to *link in* exit when it's never called. This is about whether it's actually OK to call exit.) exit destroys C++ global variables, and Wasm users don't want that. We need to able to call into the Wasm module repeatedly from a JavaScript web page.
> 

i don't know much about the wasm environment, but surely
there are many ways to map c concepts to the wasm ones.
you should choose a mapping that makes wasm less different
from other targets.

> Maybe think of a Wasm (statically-linked) binary as being a bit more like a shared library: instead of exporting a single "entrypoint" for the kernel to run (like a Unix process), it exports a list of functions for the JavaScript environment to run repeatedly. We're not using the language of "shared library" though, because we're using that terminology to mean Wasm libraries calling each other, and we don't have all the tooling in place for that in web browsers yet.
> 
> 2) This second point is just for your interest. It doesn't affect the rest of Musl, since Wasm can provide its own crt/wasm/crt1.c.  (See: https://github.com/NWilson/musl/blob/musl-wasm-native/crt/wasm/crt1.c)
> 

the correctness of the runtime is only guaranteed if you
go via the symbol __libc_start_main otherwise future
changes to that function can easily break your wasm port.

> Another difference is that __libc_start_init uses the __init_array_start/end symbols, which come from special compiler-provided crtbegin.o/crtend.o objects on ELF platforms. The linker has to arrange for a table of function pointers for Musl to call, and it does that using the magic .init_array ELF section (plus some priority fiddling).
> 
> In Wasm, most functions are not callable by pointer: a function pointer is only assigned to code that has its address taken. For security, a decision was taken *not* to use a table of function pointers for the startup code, since that would mean taking the address of those functions, and they otherwise would be unlikely to need to go in the function pointer table. It's a hardening measure, to reduce the number of functions that can be used as the target of an indirect call.
> 
> Hence, the linker synthesises the init-function as a list of call-instructions, rather than synthesising a list of function pointers for a pre-written function to iterate over.
> 

you can use a single function pointer in init_array (to the
linker generated magic init function) or override _init or
let the wasm environment provide main which does the init.
in any case there seems to be no need to do things
differently on wasm (at least for an initial port)

> This decision wasn't mine, but I'm sympathetic to it. We did discuss whether it would be unhelpful to diverge from ELF like this, but the consensus was that keeping the table of function pointers small was more worthwhile.
> 
> Note - I realise that we could override the ELF-y __libc_start_init since it's weak. However, given that we need to call __init_libc directly anyway, we may as well save some code and just register __init_libc with the Wasm start mechanism.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 11:04                     ` Nicholas Wilson
  2017-12-19 15:27                       ` Szabolcs Nagy
@ 2017-12-19 15:56                       ` Rich Felker
  2017-12-19 17:46                         ` Nicholas Wilson
  1 sibling, 1 reply; 18+ messages in thread
From: Rich Felker @ 2017-12-19 15:56 UTC (permalink / raw)
  To: musl

On Tue, Dec 19, 2017 at 11:04:33AM +0000, Nicholas Wilson wrote:
> On 19 December 2017 01:08, Rich Felker wrote:
> > I still don't see any good reason to call __init_libc instead of
> > __libc_start_main. While the crt1 entry point (_start) itself is a not
> > a normal C function but something specific to the ELF entry
> > conventions, __libc_start_main is a perfectly good C function that is
> > not "ELF specific".
> 
> There are two ELF-specific bits to __libc_start_main:
> 
> 1) The first thing is that it calls main and exit! Wasm users don't
> want to call exit. (This is different from the discussion we had
> before as to whether it's OK to *link in* exit when it's never
> called. This is about whether it's actually OK to call exit.) exit
> destroys C++ global variables, and Wasm users don't want that. We
> need to able to call into the Wasm module repeatedly from a
> JavaScript web page.

This is not ELF-specific, and it's not different from the discussion
we had before. You seem to be under the impression that exit "gets
called" because __libc_start_main is called. This is not true. exit is
only called if main returns. (In fact, if you have LTO, the linker
will even optimize out exit like you wanted if main does not return.)

As for why it's not ELF-specific, returning from main producing
behavior as if exit were called is part of the C language.

> Maybe think of a Wasm (statically-linked) binary as being a bit more
> like a shared library: instead of exporting a single "entrypoint"
> for the kernel to run (like a Unix process), it exports a list of
> functions for the JavaScript environment to run repeatedly. We're
> not using the language of "shared library" though, because we're
> using that terminology to mean Wasm libraries calling each other,
> and we don't have all the tooling in place for that in web browsers
> yet.

Are you saying you don't want main to get called at all?

> 2) This second point is just for your interest. It doesn't affect
> the rest of Musl, since Wasm can provide its own crt/wasm/crt1.c.
> (See:
> https://github.com/NWilson/musl/blob/musl-wasm-native/crt/wasm/crt1.c)
> 
> Another difference is that __libc_start_init uses the
> __init_array_start/end symbols, which come from special
> compiler-provided crtbegin.o/crtend.o objects on ELF platforms. The
> linker has to arrange for a table of function pointers for Musl to
> call, and it does that using the magic .init_array ELF section (plus
> some priority fiddling).

These are provided by the linker itself, not by crtbegin/crtend. (The
latter are legacy cruft used by the compiler for implementing the Java
runtime and an old, deprecated way of invoking C++ ctors. They're not
needed by musl or modern toolchains.)

> In Wasm, most functions are not callable by pointer: a function
> pointer is only assigned to code that has its address taken. For
> security, a decision was taken *not* to use a table of function
> pointers for the startup code, since that would mean taking the
> address of those functions, and they otherwise would be unlikely to
> need to go in the function pointer table. It's a hardening measure,
> to reduce the number of functions that can be used as the target of
> an indirect call.
> 
> Hence, the linker synthesises the init-function as a list of
> call-instructions, rather than synthesising a list of function
> pointers for a pre-written function to iterate over.
> 
> This decision wasn't mine, but I'm sympathetic to it. We did discuss
> whether it would be unhelpful to diverge from ELF like this, but the
> consensus was that keeping the table of function pointers small was
> more worthwhile.

I'm not sure it's a good decision, but it doesn't mean you have to
replace __libc_start_init. Note that __libc_start_init also calls
_init() and this is thee symbol that's expected to call into the
legacy "string of init function calls" mechanism you're using. The
__init_array_{start,end} symbols can just be defined both as null, or
as aliases for the same dummy object, and the init array loop becomes
a no-op. (And with LTO it would even be removed completely.)

> Note - I realise that we could override the ELF-y __libc_start_init
> since it's weak.

This is the intended usage (for musl internals, not as an interface
boundary between musl proper and arch code) and how the dynamic linker
works. But as stated above I don't think there's any reason to
override it here.

> However, given that we need to call __init_libc
> directly anyway, we may as well save some code and just register
> __init_libc with the Wasm start mechanism.

From my perspective, doing things in gratuitously arch-dependent ways
to "save some code" doesn't make sense when you're trading a small
(trivial relative size) amount of code for a permanent interface
boundary and maintenance burden.

> > It does require a pointer to the args/environment
> > formatted as an array of:
> > ...
> > but __init_libc and other code in musl also requires such an array to
> > be present.
> 
> Yes, I've got an array of argv/envp for keeping __init_libc happy.
> That's OK, that's an ELF convention that we're quite happy to follow
> to reduce friction with Musl. In general, where possible I don't
> have a problem with emulating ELF.
> 
> Thanks for your patience and for asking questions. I hope the
> answers help!

No problem. Hope this continues to be helpful.

Rich


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 15:56                       ` Rich Felker
@ 2017-12-19 17:46                         ` Nicholas Wilson
  2017-12-19 17:54                           ` Alexander Monakov
  2017-12-19 21:03                           ` Rich Felker
  0 siblings, 2 replies; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-19 17:46 UTC (permalink / raw)
  To: musl

On 19 December 2017 15:56, Rich Felker wrote:
> This is not ELF-specific, and it's not different from the discussion
> we had before. You seem to be under the impression that exit "gets
> called" because __libc_start_main is called. This is not true. exit is
> only called if main returns. (In fact, if you have LTO, the linker
> will even optimize out exit like you wanted if main does not return.)

From our point of view, exit being called *is* a consequence of __libc_start_main. Everything must return; there's no way to hold up the JavaScript main loop in a browser (unless you spin at 100% CPU and freeze the entire browser page, which is totally unacceptable). Wasm is run in a single-threaded asynchronous environment, really just like normal JavaScript, where every function is non-blocking.

(The difference between JavaScript and Wasm is that Wasm doesn't use the JS heap, is guaranteed to generate no JS garbage, and is strongly-typed so can be compiled to native x86/ARM code as soon as the browser receives it, so it runs at the same speed as a native executable. A typical C function compiled the "normal" way to a native C executable will have nearly identical x86 assembly to the x86 code that a browser produces for the corresponding Wasm function. Neat! But, it does mean no non-blocking functions, since the JavaScript environment is not able to handle that. The JavaScript host environment is event-driven: when I/O happens or some other event occurs, we'll call into the Wasm module again, and expect to return promptly so that the browser's event loop isn't blocked.)

> As for why it's not ELF-specific, returning from main producing
> behavior as if exit were called is part of the C language.

Yes, I agree - main must call exit when it returns. But in Wasm we don't want main to get called at all - since immediately afterwards the Wasm module becomes unusable for anything else (because of exit cleanup). For people with one-shot applications, there will be provision for calling main if they want it. However, I don't expect it to be common for people to use that functionality.

>> However, given that we need to call __init_libc
>> directly anyway, we may as well save some code and just register
>> __init_libc with the Wasm start mechanism.
> From my perspective, doing things in gratuitously arch-dependent ways
> to "save some code" doesn't make sense when you're trading a small
> (trivial relative size) amount of code for a permanent interface
> boundary and maintenance burden.

I mean, we have to call __init_libc directly anyway, so may as well just depend on a single internal interface, rather than using a second internal interface as well like __libc_start_init. By "short" I mean "use the absolute minimum of internal Musl dependencies". Because of the requirement not to call exit, I can't see a way of not calling *any* internal Musl functions, so getting it down to just __init_libc is the least-intrusive we can be.

Responding to Szabolcs:

On 19 December 2017 15:27, Szabolcs Nagy wrote:
> the correctness of the runtime is only guaranteed if you
> go via the symbol __libc_start_main otherwise future
> changes to that function can easily break your wasm port.

Yes, we'll have to make sure each time we update our fork that it still works. That's acceptable to me, we understand that the dependency is purely internal to libc, it's a not a public interface that Musl has to support forever. I accept that as long as we have a fork, we risk having to do some maintenance whenever we update to the latest Musl release. (But, with many people at Google and Mozilla working on Wasm long-term, I'm not worried that Wasm support will stagnate.)

That's why I was hoping to eventually merge the Wasm support upstream, so that changes to interfaces used internally within Musl can take Wasm into account, rather than risk using internal interfaces without your knowledge.

All the best,
Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 17:46                         ` Nicholas Wilson
@ 2017-12-19 17:54                           ` Alexander Monakov
  2017-12-19 18:03                             ` Nicholas Wilson
  2017-12-19 21:03                           ` Rich Felker
  1 sibling, 1 reply; 18+ messages in thread
From: Alexander Monakov @ 2017-12-19 17:54 UTC (permalink / raw)
  To: musl

It sounds to me that for practical use you'd want a Wasm module to correspond
roughly to a shared library (with possibly multiple externally callable
functions) that doesn't carry a 'main' function at all. Is that correct?

Alexander

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 17:54                           ` Alexander Monakov
@ 2017-12-19 18:03                             ` Nicholas Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Nicholas Wilson @ 2017-12-19 18:03 UTC (permalink / raw)
  To: musl

On 19 December 2017 17:54, Alexander Monakov wrote:
> It sounds to me that for practical use you'd want a Wasm module to correspond
> roughly to a shared library (with possibly multiple externally callable
> functions) that doesn't carry a 'main' function at all. Is that correct?

Yes, that's right. Maybe think of it though as a statically-linked shared library. We don't have dynamic linking yet, and when we do it's going to look very different to ELF (possibly/probably?). I want to get simple statically-linked modules working first, before adding a dynamic linker to the mix! In fact I know some people from Mozilla are working on dynamic linking, but I haven't looked into what they've done in detail.

Nick

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] split __libc_start_main.c into two files (Wasm)
  2017-12-19 17:46                         ` Nicholas Wilson
  2017-12-19 17:54                           ` Alexander Monakov
@ 2017-12-19 21:03                           ` Rich Felker
  1 sibling, 0 replies; 18+ messages in thread
From: Rich Felker @ 2017-12-19 21:03 UTC (permalink / raw)
  To: musl

On Tue, Dec 19, 2017 at 05:46:22PM +0000, Nicholas Wilson wrote:
> On 19 December 2017 15:56, Rich Felker wrote:
> > This is not ELF-specific, and it's not different from the discussion
> > we had before. You seem to be under the impression that exit "gets
> > called" because __libc_start_main is called. This is not true. exit is
> > only called if main returns. (In fact, if you have LTO, the linker
> > will even optimize out exit like you wanted if main does not return.)
> 
> From our point of view, exit being called *is* a consequence of
> __libc_start_main. Everything must return; there's no way to hold up
> the JavaScript main loop in a browser (unless you spin at 100% CPU
> and freeze the entire browser page, which is totally unacceptable).
> Wasm is run in a single-threaded asynchronous environment, really
> just like normal JavaScript, where every function is non-blocking.

OK. That's why I asked if you intend for main not to be called at all.
My idea was that main would be called, but would be expected to go
into some sort of event loop that yields (implemented by recording
state and returning back to JS control), allowing execution of an
actual C application in the browser JS context. But it seems at
present you're only interested in use of library code written in C.

> >> However, given that we need to call __init_libc
> >> directly anyway, we may as well save some code and just register
> >> __init_libc with the Wasm start mechanism.
> > From my perspective, doing things in gratuitously arch-dependent ways
> > to "save some code" doesn't make sense when you're trading a small
> > (trivial relative size) amount of code for a permanent interface
> > boundary and maintenance burden.
> 
> I mean, we have to call __init_libc directly anyway, so may as well
> just depend on a single internal interface, rather than using a
> second internal interface as well like __libc_start_init. By "short"
> I mean "use the absolute minimum of internal Musl dependencies".
> Because of the requirement not to call exit, I can't see a way of
> not calling *any* internal Musl functions, so getting it down to
> just __init_libc is the least-intrusive we can be.

Yes. Calling just __init_libc is what you would do in that case, and I
think it works as long as there are no interface boundaries subject to
version mismatch. That is, if you're ok with the situation where, if
the internal contract for __init_libc changed or if it got refactored
differently, the corresponding wasm glue would have to be changed
accordingly, and they're always static-linked together as part of the
same module so that you couldn't end up with mismatched versions.

> Responding to Szabolcs:
> 
> On 19 December 2017 15:27, Szabolcs Nagy wrote:
> > the correctness of the runtime is only guaranteed if you
> > go via the symbol __libc_start_main otherwise future
> > changes to that function can easily break your wasm port.
> 
> Yes, we'll have to make sure each time we update our fork that it
> still works. That's acceptable to me, we understand that the
> dependency is purely internal to libc, it's a not a public interface
> that Musl has to support forever. I accept that as long as we have a
> fork, we risk having to do some maintenance whenever we update to
> the latest Musl release. (But, with many people at Google and
> Mozilla working on Wasm long-term, I'm not worried that Wasm support
> will stagnate.)
> 
> That's why I was hoping to eventually merge the Wasm support
> upstream, so that changes to interfaces used internally within Musl
> can take Wasm into account, rather than risk using internal
> interfaces without your knowledge.

I think merging upstream is unlikely, at least in the short term, for
at least two reasons:

1. Cost (to me as maintainer). Having it upstream implies a burden to
   ensure that changes to musl proper don't introduce regressions to
   any of the supported archs, which requires ability to test, etc.
   Need for specialized tooling to build and test, along with poking
   through some interface boundaries (__init_libc), make these issues
   rather more impactful than for an average arch. On the other hand
   if the wasm port is treated as a consumer of musl releases, it's
   easy to have any of those issues resolved after the release, when
   syncing up to a new upstream.

2. Policy/scope. The stated (e.g. on website) principle of musl arch
   support is that (short of fenv which is only hardfloat and
   differences in long double type and ILP32 vs LP64) they all provide
   the same C- and POSIX-conforming environment and that application
   compatibility should not differ significantly across archs. Aside
   from being a nice guarantee to users, it's a convenient limitation
   of scope of what needs to be considered for maintenance in the
   upstream project, and in particular omits things like ports to
   specific bare-metal targets, which while interesting, would become
   huge scope creep if I were involved in handling them.

While the convenience of doing this probably isn't yet at the level it
ideally should be at, it's my intent that musl can fairly easily use
"overlay" style arch glue for exotic targets that aren't upstream.
This is the model midipix (midipix.org) is using, where all the arch
files are provided by a separately-maintained package that can be
extracted into the musl source tree. (Note: we do have nt32/nt64 arch
targets in upstream configure so that patching of configure is not
needed; same for wasm would make sense I think.)

How does this sound?

Rich

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-12-19 21:03 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-07 14:51 [PATCH] split __libc_start_main.c into two files (Wasm) Nicholas Wilson
2017-12-07 17:03 ` Rich Felker
2017-12-15  4:19   ` Rich Felker
2017-12-15 11:34     ` Nicholas Wilson
2017-12-15 12:33       ` Szabolcs Nagy
2017-12-15 13:04         ` Nicholas Wilson
2017-12-15 17:23           ` Rich Felker
2017-12-15 17:43             ` Nicholas Wilson
2017-12-15 17:56               ` Rich Felker
2017-12-16 13:21                 ` Nicholas Wilson
2017-12-19  1:08                   ` Rich Felker
2017-12-19 11:04                     ` Nicholas Wilson
2017-12-19 15:27                       ` Szabolcs Nagy
2017-12-19 15:56                       ` Rich Felker
2017-12-19 17:46                         ` Nicholas Wilson
2017-12-19 17:54                           ` Alexander Monakov
2017-12-19 18:03                             ` Nicholas Wilson
2017-12-19 21:03                           ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).