mailing list of musl libc
 help / color / mirror / code / Atom feed
* [musl] Initial xtensa/fdpic port review
@ 2024-02-27 23:24 Rich Felker
  2024-02-28  0:13 ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Rich Felker @ 2024-02-27 23:24 UTC (permalink / raw)
  To: musl; +Cc: Max Filippov

[-- Attachment #1: Type: text/plain, Size: 133 bytes --]

Attached patches were pulled from
https://github.com/jcmvbkbc/musl-xtensa/commits/xtensa-1.2.4-fdpic/

I'll reply to comment inline.

[-- Attachment #2: 0001-xtensa-add-port.patch --]
[-- Type: text/plain, Size: 53991 bytes --]

From 868b34272f414323f10528d5b9988886541cb1c0 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Tue, 22 Mar 2016 02:35:58 +0300
Subject: [PATCH 1/2] xtensa: add port

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 arch/xtensa/arch.mak                  |   1 +
 arch/xtensa/atomic_arch.h             |  27 ++
 arch/xtensa/bits/alltypes.h.in        |  27 ++
 arch/xtensa/bits/float.h              |  16 ++
 arch/xtensa/bits/ioctl.h              | 219 ++++++++++++++
 arch/xtensa/bits/limits.h             |   1 +
 arch/xtensa/bits/mman.h               |  68 +++++
 arch/xtensa/bits/posix.h              |   2 +
 arch/xtensa/bits/reg.h                |   2 +
 arch/xtensa/bits/setjmp.h             |   5 +
 arch/xtensa/bits/signal.h             |  88 ++++++
 arch/xtensa/bits/stat.h               |  24 ++
 arch/xtensa/bits/stdint.h             |  20 ++
 arch/xtensa/bits/syscall.h.in         | 396 ++++++++++++++++++++++++++
 arch/xtensa/bits/termios.h            | 167 +++++++++++
 arch/xtensa/bits/user.h               |   4 +
 arch/xtensa/bits/xtensa-config.h      |   8 +
 arch/xtensa/crt_arch.h                |  48 ++++
 arch/xtensa/kstat.h                   |  18 ++
 arch/xtensa/pthread_arch.h            |  11 +
 arch/xtensa/reloc.h                   |  38 +++
 arch/xtensa/syscall_arch.h            | 104 +++++++
 configure                             |   1 +
 crt/xtensa/crti.S                     |  29 ++
 crt/xtensa/crtn.S                     |  23 ++
 include/elf.h                         |  70 +++++
 src/internal/xtensa/syscall.S         |  25 ++
 src/ldso/xtensa/tlsdesc.S             |  36 +++
 src/process/xtensa/vfork.S            |  21 ++
 src/setjmp/xtensa/longjmp.S           |  22 ++
 src/setjmp/xtensa/setjmp.S            |  25 ++
 src/signal/xtensa/restore.s           |  10 +
 src/signal/xtensa/sigsetjmp.S         |  24 ++
 src/thread/xtensa/__set_thread_area.S |  16 ++
 src/thread/xtensa/__unmapself.S       |  13 +
 src/thread/xtensa/clone.S             |  46 +++
 src/thread/xtensa/syscall_cp.S        |  38 +++
 37 files changed, 1693 insertions(+)
 create mode 100644 arch/xtensa/arch.mak
 create mode 100644 arch/xtensa/atomic_arch.h
 create mode 100644 arch/xtensa/bits/alltypes.h.in
 create mode 100644 arch/xtensa/bits/float.h
 create mode 100644 arch/xtensa/bits/ioctl.h
 create mode 100644 arch/xtensa/bits/limits.h
 create mode 100644 arch/xtensa/bits/mman.h
 create mode 100644 arch/xtensa/bits/posix.h
 create mode 100644 arch/xtensa/bits/reg.h
 create mode 100644 arch/xtensa/bits/setjmp.h
 create mode 100644 arch/xtensa/bits/signal.h
 create mode 100644 arch/xtensa/bits/stat.h
 create mode 100644 arch/xtensa/bits/stdint.h
 create mode 100644 arch/xtensa/bits/syscall.h.in
 create mode 100644 arch/xtensa/bits/termios.h
 create mode 100644 arch/xtensa/bits/user.h
 create mode 100644 arch/xtensa/bits/xtensa-config.h
 create mode 100644 arch/xtensa/crt_arch.h
 create mode 100644 arch/xtensa/kstat.h
 create mode 100644 arch/xtensa/pthread_arch.h
 create mode 100644 arch/xtensa/reloc.h
 create mode 100644 arch/xtensa/syscall_arch.h
 create mode 100644 crt/xtensa/crti.S
 create mode 100644 crt/xtensa/crtn.S
 create mode 100644 src/internal/xtensa/syscall.S
 create mode 100644 src/ldso/xtensa/tlsdesc.S
 create mode 100644 src/process/xtensa/vfork.S
 create mode 100644 src/setjmp/xtensa/longjmp.S
 create mode 100644 src/setjmp/xtensa/setjmp.S
 create mode 100644 src/signal/xtensa/restore.s
 create mode 100644 src/signal/xtensa/sigsetjmp.S
 create mode 100644 src/thread/xtensa/__set_thread_area.S
 create mode 100644 src/thread/xtensa/__unmapself.S
 create mode 100644 src/thread/xtensa/clone.S
 create mode 100644 src/thread/xtensa/syscall_cp.S

diff --git a/arch/xtensa/arch.mak b/arch/xtensa/arch.mak
new file mode 100644
index 00000000..aa4d05ce
--- /dev/null
+++ b/arch/xtensa/arch.mak
@@ -0,0 +1 @@
+COMPAT_SRC_DIRS = compat/time32
diff --git a/arch/xtensa/atomic_arch.h b/arch/xtensa/atomic_arch.h
new file mode 100644
index 00000000..34bf0704
--- /dev/null
+++ b/arch/xtensa/atomic_arch.h
@@ -0,0 +1,27 @@
+#include "bits/xtensa-config.h"
+
+#if XCHAL_HAVE_S32C1I
+#define a_cas a_cas
+static inline int a_cas(volatile int *p, int t, int s)
+{
+	__asm__ __volatile__ (
+		"	wsr	%2, scompare1\n"
+		"	s32c1i	%0, %1\n"
+		: "+a"(s), "+m"(*p)
+		: "a"(t)
+		: "memory" );
+        return s;
+}
+#endif
+
+#define a_barrier a_barrier
+static inline void a_barrier()
+{
+	__asm__ __volatile__ ("memw" : : : "memory");
+}
+
+#define a_crash a_crash
+static inline void a_crash()
+{
+	__asm__ __volatile__ ("ill" : : : "memory");
+}
diff --git a/arch/xtensa/bits/alltypes.h.in b/arch/xtensa/bits/alltypes.h.in
new file mode 100644
index 00000000..21221ce5
--- /dev/null
+++ b/arch/xtensa/bits/alltypes.h.in
@@ -0,0 +1,27 @@
+#define _REDIR_TIME64 1
+#define _Addr int
+#define _Int64 long long
+#define _Reg int
+
+#if __XTENSA_EB__
+#define __BYTE_ORDER 4321
+#elif __XTENSA_EL__
+#define __BYTE_ORDER 1234
+#else
+#error Unknown endianness
+#endif
+
+#define __LONG_MAX 0x7fffffffL
+
+#ifndef __cplusplus
+#ifdef __WCHAR_TYPE__
+TYPEDEF __WCHAR_TYPE__ wchar_t;
+#else
+TYPEDEF unsigned wchar_t;
+#endif
+#endif
+
+TYPEDEF float float_t;
+TYPEDEF double double_t;
+
+TYPEDEF struct { long __l __attribute__((aligned(__BIGGEST_ALIGNMENT__))); } max_align_t;
diff --git a/arch/xtensa/bits/float.h b/arch/xtensa/bits/float.h
new file mode 100644
index 00000000..c4a655e7
--- /dev/null
+++ b/arch/xtensa/bits/float.h
@@ -0,0 +1,16 @@
+#define FLT_EVAL_METHOD 0
+
+#define LDBL_TRUE_MIN 4.94065645841246544177e-324L
+#define LDBL_MIN 2.22507385850720138309e-308L
+#define LDBL_MAX 1.79769313486231570815e+308L
+#define LDBL_EPSILON 2.22044604925031308085e-16L
+
+#define LDBL_MANT_DIG 53
+#define LDBL_MIN_EXP (-1021)
+#define LDBL_MAX_EXP 1024
+
+#define LDBL_DIG 15
+#define LDBL_MIN_10_EXP (-307)
+#define LDBL_MAX_10_EXP 308
+
+#define DECIMAL_DIG 17
diff --git a/arch/xtensa/bits/ioctl.h b/arch/xtensa/bits/ioctl.h
new file mode 100644
index 00000000..f30e3a69
--- /dev/null
+++ b/arch/xtensa/bits/ioctl.h
@@ -0,0 +1,219 @@
+#define _IOC(a,b,c,d) ( ((a)<<30) | ((b)<<8) | (c) | ((d)<<16) )
+#define _IOC_NONE  0U
+#define _IOC_READ  2U
+#define _IOC_WRITE 1U
+
+#define _IO(a,b) _IOC(_IOC_NONE,(a),(b),0)
+#define _IOW(a,b,c) _IOC(_IOC_WRITE,(a),(b),sizeof(c))
+#define _IOR(a,b,c) _IOC(_IOC_READ,(a),(b),sizeof(c))
+#define _IOWR(a,b,c) _IOC(_IOC_READ|_IOC_WRITE,(a),(b),sizeof(c))
+
+#define FIOCLEX		_IO('f', 1)
+#define FIONCLEX	_IO('f', 2)
+#define FIOASYNC	_IOW('f', 125, int)
+#define FIONBIO		_IOW('f', 126, int)
+#define FIONREAD	_IOR('f', 127, int)
+#define TIOCINQ		FIONREAD
+#define FIOQSIZE	_IOR('f', 128, loff_t)
+
+#define TCGETS		0x5401
+#define TCSETS		0x5402
+#define TCSETSW		0x5403
+#define TCSETSF		0x5404
+
+#define TCGETA		0x80127417	/* _IOR('t', 23, struct termio) */
+#define TCSETA		0x40127418	/* _IOW('t', 24, struct termio) */
+#define TCSETAW		0x40127419	/* _IOW('t', 25, struct termio) */
+#define TCSETAF		0x4012741C	/* _IOW('t', 28, struct termio) */
+
+#define TCSBRK		_IO('t', 29)
+#define TCXONC		_IO('t', 30)
+#define TCFLSH		_IO('t', 31)
+
+#define TIOCSWINSZ	0x40087467	/* _IOW('t', 103, struct winsize) */
+#define TIOCGWINSZ	0x80087468	/* _IOR('t', 104, struct winsize) */
+#define	TIOCSTART	_IO('t', 110)		/* start output, like ^Q */
+#define	TIOCSTOP	_IO('t', 111)		/* stop output, like ^S */
+#define TIOCOUTQ        _IOR('t', 115, int)     /* output queue size */
+
+#define TIOCSPGRP	_IOW('t', 118, int)
+#define TIOCGPGRP	_IOR('t', 119, int)
+
+#define TIOCEXCL	_IO('T', 12)
+#define TIOCNXCL	_IO('T', 13)
+#define TIOCSCTTY	_IO('T', 14)
+
+#define TIOCSTI		_IOW('T', 18, char)
+#define TIOCMGET	_IOR('T', 21, unsigned int)
+#define TIOCMBIS	_IOW('T', 22, unsigned int)
+#define TIOCMBIC	_IOW('T', 23, unsigned int)
+#define TIOCMSET	_IOW('T', 24, unsigned int)
+# define TIOCM_LE	0x001
+# define TIOCM_DTR	0x002
+# define TIOCM_RTS	0x004
+# define TIOCM_ST	0x008
+# define TIOCM_SR	0x010
+# define TIOCM_CTS	0x020
+# define TIOCM_CAR	0x040
+# define TIOCM_RNG	0x080
+# define TIOCM_DSR	0x100
+# define TIOCM_CD	TIOCM_CAR
+# define TIOCM_RI	TIOCM_RNG
+
+#define TIOCGSOFTCAR	_IOR('T', 25, unsigned int)
+#define TIOCSSOFTCAR	_IOW('T', 26, unsigned int)
+#define TIOCLINUX	_IOW('T', 28, char)
+#define TIOCCONS	_IO('T', 29)
+#define TIOCGSERIAL	0x803C541E	/*_IOR('T', 30, struct serial_struct)*/
+#define TIOCSSERIAL	0x403C541F	/*_IOW('T', 31, struct serial_struct)*/
+#define TIOCPKT		_IOW('T', 32, int)
+# define TIOCPKT_DATA		 0
+# define TIOCPKT_FLUSHREAD	 1
+# define TIOCPKT_FLUSHWRITE	 2
+# define TIOCPKT_STOP		 4
+# define TIOCPKT_START		 8
+# define TIOCPKT_NOSTOP		16
+# define TIOCPKT_DOSTOP		32
+# define TIOCPKT_IOCTL		64
+
+
+#define TIOCNOTTY	_IO('T', 34)
+#define TIOCSETD	_IOW('T', 35, int)
+#define TIOCGETD	_IOR('T', 36, int)
+#define TCSBRKP		_IOW('T', 37, int)   /* Needed for POSIX tcsendbreak()*/
+#define TIOCSBRK	_IO('T', 39) 	     /* BSD compatibility */
+#define TIOCCBRK	_IO('T', 40)	     /* BSD compatibility */
+#define TIOCGSID	_IOR('T', 41, pid_t) /* Return the session ID of FD*/
+#define TCGETS2		_IOR('T', 42, struct termios2)
+#define TCSETS2		_IOW('T', 43, struct termios2)
+#define TCSETSW2	_IOW('T', 44, struct termios2)
+#define TCSETSF2	_IOW('T', 45, struct termios2)
+#define TIOCGRS485	_IOR('T', 46, struct serial_rs485)
+#define TIOCSRS485	_IOWR('T', 47, struct serial_rs485)
+#define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
+#define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+#define TIOCGDEV	_IOR('T',0x32, unsigned int) /* Get primary device node of /dev/console */
+#define TIOCSIG		_IOW('T',0x36, int)  /* Generate signal on Pty slave */
+#define TIOCVHANGUP	_IO('T', 0x37)
+#define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
+#define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
+#define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
+
+#define TIOCSERCONFIG	_IO('T', 83)
+#define TIOCSERGWILD	_IOR('T', 84,  int)
+#define TIOCSERSWILD	_IOW('T', 85,  int)
+#define TIOCGLCKTRMIOS	0x5456
+#define TIOCSLCKTRMIOS	0x5457
+#define TIOCSERGSTRUCT	0x5458		     /* For debugging only */
+#define TIOCSERGETLSR   _IOR('T', 89, unsigned int) /* Get line status reg. */
+  /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
+#define TIOCSERGETMULTI 0x80a8545a /* Get multiport config  */
+			/* _IOR('T', 90, struct serial_multiport_struct) */
+#define TIOCSERSETMULTI 0x40a8545b /* Set multiport config */
+			/* _IOW('T', 91, struct serial_multiport_struct) */
+
+#define TIOCMIWAIT	_IO('T', 92) /* wait for a change on serial input line(s) */
+#define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
+
+#define TIOCM_LE        0x001
+#define TIOCM_DTR       0x002
+#define TIOCM_RTS       0x004
+#define TIOCM_ST        0x008
+#define TIOCM_SR        0x010
+#define TIOCM_CTS       0x020
+#define TIOCM_CAR       0x040
+#define TIOCM_RNG       0x080
+#define TIOCM_DSR       0x100
+#define TIOCM_CD        TIOCM_CAR
+#define TIOCM_RI        TIOCM_RNG
+#define TIOCM_OUT1      0x2000
+#define TIOCM_OUT2      0x4000
+#define TIOCM_LOOP      0x8000
+#define TIOCM_MODEM_BITS TIOCM_OUT2
+
+#define N_TTY           0
+#define N_SLIP          1
+#define N_MOUSE         2
+#define N_PPP           3
+#define N_STRIP         4
+#define N_AX25          5
+#define N_X25           6
+#define N_6PACK         7
+#define N_MASC          8
+#define N_R3964         9
+#define N_PROFIBUS_FDL  10
+#define N_IRDA          11
+#define N_SMSBLOCK      12
+#define N_HDLC          13
+#define N_SYNC_PPP      14
+#define N_HCI           15
+
+#define FIOGETOWN	_IOR('f', 123, int)
+#define FIOSETOWN 	_IOW('f', 124, int)
+#define SIOCATMARK	_IOR('s', 7, int)
+#define SIOCSPGRP	_IOW('s', 8, pid_t)
+#define SIOCGPGRP	_IOR('s', 9, pid_t)
+#define SIOCGSTAMP	0x8906		/* Get stamp (timeval) */
+#define SIOCGSTAMPNS	0x8907		/* Get stamp (timespec) */
+
+#define SIOCADDRT       0x890B
+#define SIOCDELRT       0x890C
+#define SIOCRTMSG       0x890D
+
+#define SIOCGIFNAME     0x8910
+#define SIOCSIFLINK     0x8911
+#define SIOCGIFCONF     0x8912
+#define SIOCGIFFLAGS    0x8913
+#define SIOCSIFFLAGS    0x8914
+#define SIOCGIFADDR     0x8915
+#define SIOCSIFADDR     0x8916
+#define SIOCGIFDSTADDR  0x8917
+#define SIOCSIFDSTADDR  0x8918
+#define SIOCGIFBRDADDR  0x8919
+#define SIOCSIFBRDADDR  0x891a
+#define SIOCGIFNETMASK  0x891b
+#define SIOCSIFNETMASK  0x891c
+#define SIOCGIFMETRIC   0x891d
+#define SIOCSIFMETRIC   0x891e
+#define SIOCGIFMEM      0x891f
+#define SIOCSIFMEM      0x8920
+#define SIOCGIFMTU      0x8921
+#define SIOCSIFMTU      0x8922
+#define SIOCSIFHWADDR   0x8924
+#define SIOCGIFENCAP    0x8925
+#define SIOCSIFENCAP    0x8926
+#define SIOCGIFHWADDR   0x8927
+#define SIOCGIFSLAVE    0x8929
+#define SIOCSIFSLAVE    0x8930
+#define SIOCADDMULTI    0x8931
+#define SIOCDELMULTI    0x8932
+#define SIOCGIFINDEX    0x8933
+#define SIOGIFINDEX     SIOCGIFINDEX
+#define SIOCSIFPFLAGS   0x8934
+#define SIOCGIFPFLAGS   0x8935
+#define SIOCDIFADDR     0x8936
+#define SIOCSIFHWBROADCAST 0x8937
+#define SIOCGIFCOUNT    0x8938
+
+#define SIOCGIFBR       0x8940
+#define SIOCSIFBR       0x8941
+
+#define SIOCGIFTXQLEN   0x8942
+#define SIOCSIFTXQLEN   0x8943
+
+#define SIOCDARP        0x8953
+#define SIOCGARP        0x8954
+#define SIOCSARP        0x8955
+
+#define SIOCDRARP       0x8960
+#define SIOCGRARP       0x8961
+#define SIOCSRARP       0x8962
+
+#define SIOCGIFMAP      0x8970
+#define SIOCSIFMAP      0x8971
+
+#define SIOCADDDLCI     0x8980
+#define SIOCDELDLCI     0x8981
+
+#define SIOCDEVPRIVATE		0x89F0
+#define SIOCPROTOPRIVATE	0x89E0
diff --git a/arch/xtensa/bits/limits.h b/arch/xtensa/bits/limits.h
new file mode 100644
index 00000000..07743b6f
--- /dev/null
+++ b/arch/xtensa/bits/limits.h
@@ -0,0 +1 @@
+#define PAGESIZE 4096
diff --git a/arch/xtensa/bits/mman.h b/arch/xtensa/bits/mman.h
new file mode 100644
index 00000000..d2f58eb1
--- /dev/null
+++ b/arch/xtensa/bits/mman.h
@@ -0,0 +1,68 @@
+#define MAP_FAILED ((void *) -1)
+
+#define	PROT_NONE      0
+#define	PROT_READ      1
+#define	PROT_WRITE     2
+#define	PROT_EXEC      4
+#define	PROT_GROWSDOWN 0x01000000
+#define	PROT_GROWSUP   0x02000000
+
+#define	MAP_SHARED     0x01
+#define	MAP_PRIVATE    0x02
+#define	MAP_FIXED      0x10
+
+#define MAP_TYPE       0x0f
+#undef MAP_FILE
+#define MAP_FILE       0x00
+#undef MAP_ANON
+#define MAP_ANON       0x800
+#define MAP_ANONYMOUS  MAP_ANON
+#undef MAP_NORESERVE
+#define MAP_NORESERVE  0x0400
+#undef MAP_GROWSDOWN
+#define MAP_GROWSDOWN  0x1000
+#undef MAP_DENYWRITE
+#define MAP_DENYWRITE  0x2000
+#undef MAP_EXECUTABLE
+#define MAP_EXECUTABLE 0x4000
+#undef MAP_LOCKED
+#define MAP_LOCKED     0x8000
+#undef MAP_POPULATE
+#define MAP_POPULATE   0x10000
+#undef MAP_NONBLOCK
+#define MAP_NONBLOCK   0x20000
+#undef MAP_STACK
+#define MAP_STACK      0x40000
+#undef MAP_HUGETLB
+#define MAP_HUGETLB    0x80000
+
+#define POSIX_MADV_NORMAL       0
+#define POSIX_MADV_RANDOM       1
+#define POSIX_MADV_SEQUENTIAL   2
+#define POSIX_MADV_WILLNEED     3
+#define POSIX_MADV_DONTNEED     4
+
+#define MS_ASYNC        1
+#define MS_INVALIDATE   2
+#define MS_SYNC         4
+
+#define MCL_CURRENT     1
+#define MCL_FUTURE      2
+
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+#define MADV_NORMAL      0
+#define MADV_RANDOM      1
+#define MADV_SEQUENTIAL  2
+#define MADV_WILLNEED    3
+#define MADV_DONTNEED    4
+#define MADV_REMOVE      9
+#define MADV_DONTFORK    10
+#define MADV_DOFORK      11
+#define MADV_MERGEABLE   12
+#define MADV_UNMERGEABLE 13
+#define MADV_HUGEPAGE    14
+#define MADV_NOHUGEPAGE  15
+#define MADV_DONTDUMP    16
+#define MADV_DODUMP      17
+#define MADV_HWPOISON    100
+#endif
diff --git a/arch/xtensa/bits/posix.h b/arch/xtensa/bits/posix.h
new file mode 100644
index 00000000..30a38714
--- /dev/null
+++ b/arch/xtensa/bits/posix.h
@@ -0,0 +1,2 @@
+#define _POSIX_V6_ILP32_OFFBIG  1
+#define _POSIX_V7_ILP32_OFFBIG  1
diff --git a/arch/xtensa/bits/reg.h b/arch/xtensa/bits/reg.h
new file mode 100644
index 00000000..0192a293
--- /dev/null
+++ b/arch/xtensa/bits/reg.h
@@ -0,0 +1,2 @@
+#undef __WORDSIZE
+#define __WORDSIZE 32
diff --git a/arch/xtensa/bits/setjmp.h b/arch/xtensa/bits/setjmp.h
new file mode 100644
index 00000000..e1a6dd97
--- /dev/null
+++ b/arch/xtensa/bits/setjmp.h
@@ -0,0 +1,5 @@
+#if defined(__XTENSA_CALL0_ABI__)
+typedef unsigned long __jmp_buf[6];
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/arch/xtensa/bits/signal.h b/arch/xtensa/bits/signal.h
new file mode 100644
index 00000000..545ffd36
--- /dev/null
+++ b/arch/xtensa/bits/signal.h
@@ -0,0 +1,88 @@
+#if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
+ || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+
+#if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+#define MINSIGSTKSZ 2048
+#define SIGSTKSZ 8192
+#endif
+
+typedef struct sigcontext
+{
+	unsigned long sc_pc;
+	unsigned long sc_ps;
+	unsigned long sc_lbeg;
+	unsigned long sc_lend;
+	unsigned long sc_lcount;
+	unsigned long sc_sar;
+	unsigned long sc_acclo;
+	unsigned long sc_acchi;
+	unsigned long sc_a[16];
+	void *sc_xtregs;
+} mcontext_t;
+
+struct sigaltstack {
+	void *ss_sp;
+	int ss_flags;
+	size_t ss_size;
+};
+
+typedef struct __ucontext {
+	unsigned long uc_flags;
+	struct __ucontext *uc_link;
+	stack_t uc_stack;
+	mcontext_t uc_mcontext;
+	sigset_t uc_sigmask;
+} ucontext_t;
+
+#define SA_NOCLDSTOP	0x00000001
+#define SA_NOCLDWAIT	0x00000002 /* not supported yet */
+#define SA_SIGINFO	0x00000004
+#define SA_ONSTACK	0x08000000
+#define SA_RESTART	0x10000000
+#define SA_NODEFER	0x40000000
+#define SA_RESETHAND	0x80000000
+
+#define SA_NOMASK	SA_NODEFER
+#define SA_ONESHOT	SA_RESETHAND
+
+#define SA_RESTORER	0x04000000
+
+#endif
+
+#define SIGHUP		 1
+#define SIGINT		 2
+#define SIGQUIT		 3
+#define SIGILL		 4
+#define SIGTRAP		 5
+#define SIGABRT		 6
+#define SIGIOT		 6
+#define SIGBUS		 7
+#define SIGFPE		 8
+#define SIGKILL		 9
+#define SIGUSR1		10
+#define SIGSEGV		11
+#define SIGUSR2		12
+#define SIGPIPE		13
+#define SIGALRM		14
+#define SIGTERM		15
+#define SIGSTKFLT	16
+#define SIGCHLD		17
+#define SIGCONT		18
+#define SIGSTOP		19
+#define SIGTSTP		20
+#define SIGTTIN		21
+#define SIGTTOU		22
+#define SIGURG		23
+#define SIGXCPU		24
+#define SIGXFSZ		25
+#define SIGVTALRM	26
+#define SIGPROF		27
+#define SIGWINCH	28
+#define SIGIO		29
+#define SIGPOLL		SIGIO
+/* #define SIGLOST		29 */
+#define SIGPWR		30
+#define SIGSYS		31
+#define	SIGUNUSED	31
+
+#define _NSIG 64
diff --git a/arch/xtensa/bits/stat.h b/arch/xtensa/bits/stat.h
new file mode 100644
index 00000000..58881366
--- /dev/null
+++ b/arch/xtensa/bits/stat.h
@@ -0,0 +1,24 @@
+/* copied from kernel definition, but with padding replaced
+ * by the corresponding correctly-sized userspace types. */
+
+struct stat
+{
+	dev_t st_dev;
+	ino_t st_ino;
+	mode_t st_mode;
+	nlink_t st_nlink;
+	uid_t st_uid;
+	gid_t st_gid;
+	dev_t st_rdev;
+	off_t st_size;
+	blksize_t st_blksize;
+	long __st_padding1;
+	blkcnt_t st_blocks;
+	struct {
+		long tv_sec;
+		long tv_nsec;
+	} __st_atim32, __st_mtim32, __st_ctim32;
+	struct timespec st_atim;
+	struct timespec st_mtim;
+	struct timespec st_ctim;
+};
diff --git a/arch/xtensa/bits/stdint.h b/arch/xtensa/bits/stdint.h
new file mode 100644
index 00000000..d1b27121
--- /dev/null
+++ b/arch/xtensa/bits/stdint.h
@@ -0,0 +1,20 @@
+typedef int32_t int_fast16_t;
+typedef int32_t int_fast32_t;
+typedef uint32_t uint_fast16_t;
+typedef uint32_t uint_fast32_t;
+
+#define INT_FAST16_MIN  INT32_MIN
+#define INT_FAST32_MIN  INT32_MIN
+
+#define INT_FAST16_MAX  INT32_MAX
+#define INT_FAST32_MAX  INT32_MAX
+
+#define UINT_FAST16_MAX UINT32_MAX
+#define UINT_FAST32_MAX UINT32_MAX
+
+#define INTPTR_MIN      INT32_MIN
+#define INTPTR_MAX      INT32_MAX
+#define UINTPTR_MAX     UINT32_MAX
+#define PTRDIFF_MIN     INT32_MIN
+#define PTRDIFF_MAX     INT32_MAX
+#define SIZE_MAX        UINT32_MAX
diff --git a/arch/xtensa/bits/syscall.h.in b/arch/xtensa/bits/syscall.h.in
new file mode 100644
index 00000000..06f8ca2e
--- /dev/null
+++ b/arch/xtensa/bits/syscall.h.in
@@ -0,0 +1,396 @@
+#define __NR_spill				  0
+#define __NR_xtensa				  1
+#define __NR_available4				  2
+#define __NR_available5				  3
+#define __NR_available6				  4
+#define __NR_available7				  5
+#define __NR_available8				  6
+#define __NR_available9				  7
+#define __NR_open				  8
+#define __NR_close				  9
+#define __NR_dup				 10
+#define __NR_dup2				 11
+#define __NR_read				 12
+#define __NR_write				 13
+#define __NR_select				 14
+#define __NR_lseek				 15
+#define __NR_poll				 16
+#define __NR__llseek				 17
+#define __NR_epoll_wait				 18
+#define __NR_epoll_ctl				 19
+#define __NR_epoll_create			 20
+#define __NR_creat				 21
+#define __NR_truncate				 22
+#define __NR_ftruncate				 23
+#define __NR_readv				 24
+#define __NR_writev				 25
+#define __NR_fsync				 26
+#define __NR_fdatasync				 27
+#define __NR_truncate64				 28
+#define __NR_ftruncate64			 29
+#define __NR_pread64				 30
+#define __NR_pwrite64				 31
+#define __NR_link				 32
+#define __NR_rename				 33
+#define __NR_symlink				 34
+#define __NR_readlink				 35
+#define __NR_mknod				 36
+#define __NR_pipe				 37
+#define __NR_unlink				 38
+#define __NR_rmdir				 39
+#define __NR_mkdir				 40
+#define __NR_chdir				 41
+#define __NR_fchdir				 42
+#define __NR_getcwd				 43
+#define __NR_chmod				 44
+#define __NR_chown				 45
+#define __NR_stat				 46
+#define __NR_stat64				 47
+#define __NR_lchown				 48
+#define __NR_lstat				 49
+#define __NR_lstat64				 50
+#define __NR_available51			 51
+#define __NR_fchmod				 52
+#define __NR_fchown				 53
+#define __NR_fstat				 54
+#define __NR_fstat64				 55
+#define __NR_flock				 56
+#define __NR_access				 57
+#define __NR_umask				 58
+#define __NR_getdents				 59
+#define __NR_getdents64				 60
+#define __NR_fcntl64				 61
+#define __NR_fallocate				 62
+#define __NR_fadvise64_64			 63
+#define __NR_utime				 64	/* glibc 2.3.3 ?? */
+#define __NR_utimes				 65
+#define __NR_ioctl				 66
+#define __NR_fcntl				 67
+#define __NR_setxattr				 68
+#define __NR_getxattr				 69
+#define __NR_listxattr				 70
+#define __NR_removexattr			 71
+#define __NR_lsetxattr				 72
+#define __NR_lgetxattr				 73
+#define __NR_llistxattr				 74
+#define __NR_lremovexattr			 75
+#define __NR_fsetxattr				 76
+#define __NR_fgetxattr				 77
+#define __NR_flistxattr				 78
+#define __NR_fremovexattr			 79
+#define __NR_mmap2				 80
+#define __NR_munmap				 81
+#define __NR_mprotect				 82
+#define __NR_brk				 83
+#define __NR_mlock				 84
+#define __NR_munlock				 85
+#define __NR_mlockall				 86
+#define __NR_munlockall				 87
+#define __NR_mremap				 88
+#define __NR_msync				 89
+#define __NR_mincore				 90
+#define __NR_madvise				 91
+#define __NR_shmget				 92
+#define __NR_shmat				 93
+#define __NR_shmctl				 94
+#define __NR_shmdt				 95
+#define __NR_socket				 96
+#define __NR_setsockopt				 97
+#define __NR_getsockopt				 98
+#define __NR_shutdown				 99
+#define __NR_bind				100
+#define __NR_connect				101
+#define __NR_listen				102
+#define __NR_accept				103
+#define __NR_getsockname			104
+#define __NR_getpeername			105
+#define __NR_sendmsg				106
+#define __NR_recvmsg				107
+#define __NR_send				108
+#define __NR_recv				109
+#define __NR_sendto				110
+#define __NR_recvfrom				111
+#define __NR_socketpair				112
+#define __NR_sendfile				113
+#define __NR_sendfile64				114
+#define __NR_sendmmsg				115
+#define __NR_clone				116
+#define __NR_execve				117
+#define __NR_exit				118
+#define __NR_exit_group				119
+#define __NR_getpid				120
+#define __NR_wait4				121
+#define __NR_waitid				122
+#define __NR_kill				123
+#define __NR_tkill				124
+#define __NR_tgkill				125
+#define __NR_set_tid_address			126
+#define __NR_gettid				127
+#define __NR_setsid				128
+#define __NR_getsid				129
+#define __NR_prctl				130
+#define __NR_personality			131
+#define __NR_getpriority			132
+#define __NR_setpriority			133
+#define __NR_setitimer				134
+#define __NR_getitimer				135
+#define __NR_setuid				136
+#define __NR_getuid				137
+#define __NR_setgid				138
+#define __NR_getgid				139
+#define __NR_geteuid				140
+#define __NR_getegid				141
+#define __NR_setreuid				142
+#define __NR_setregid				143
+#define __NR_setresuid				144
+#define __NR_getresuid				145
+#define __NR_setresgid				146
+#define __NR_getresgid				147
+#define __NR_setpgid				148
+#define __NR_getpgid				149
+#define __NR_getppid				150
+#define __NR_getpgrp				151
+#define __NR_reserved152			152	/* set_thread_area */
+#define __NR_reserved153			153	/* get_thread_area */
+#define __NR_times				154
+#define __NR_acct				155
+#define __NR_sched_setaffinity			156
+#define __NR_sched_getaffinity			157
+#define __NR_capget				158
+#define __NR_capset				159
+#define __NR_ptrace				160
+#define __NR_semtimedop				161
+#define __NR_semget				162
+#define __NR_semop				163
+#define __NR_semctl				164
+#define __NR_available165			165
+#define __NR_msgget				166
+#define __NR_msgsnd				167
+#define __NR_msgrcv				168
+#define __NR_msgctl				169
+#define __NR_available170			170
+#define __NR_umount2				171
+#define __NR_mount				172
+#define __NR_swapon				173
+#define __NR_chroot				174
+#define __NR_pivot_root				175
+#define __NR_umount				176
+#define __NR_swapoff				177
+#define __NR_sync				178
+#define __NR_syncfs				179
+#define __NR_setfsuid				180
+#define __NR_setfsgid				181
+#define __NR_sysfs				182
+#define __NR_ustat				183
+#define __NR_statfs				184
+#define __NR_fstatfs				185
+#define __NR_statfs64				186
+#define __NR_fstatfs64				187
+#define __NR_setrlimit				188
+#define __NR_getrlimit				189
+#define __NR_getrusage				190
+#define __NR_futex				191
+#define __NR_gettimeofday			192
+#define __NR_settimeofday			193
+#define __NR_adjtimex				194
+#define __NR_nanosleep				195
+#define __NR_getgroups				196
+#define __NR_setgroups				197
+#define __NR_sethostname			198
+#define __NR_setdomainname			199
+#define __NR_syslog				200
+#define __NR_vhangup				201
+#define __NR_uselib				202
+#define __NR_reboot				203
+#define __NR_quotactl				204
+#define __NR_nfsservctl				205
+#define __NR__sysctl				206
+#define __NR_bdflush				207
+#define __NR_uname				208
+#define __NR_sysinfo				209
+#define __NR_init_module			210
+#define __NR_delete_module			211
+#define __NR_sched_setparam			212
+#define __NR_sched_getparam			213
+#define __NR_sched_setscheduler			214
+#define __NR_sched_getscheduler			215
+#define __NR_sched_get_priority_max		216
+#define __NR_sched_get_priority_min		217
+#define __NR_sched_rr_get_interval		218
+#define __NR_sched_yield			219
+#define __NR_available222			222
+#define __NR_restart_syscall			223
+#define __NR_sigaltstack			224
+#define __NR_rt_sigreturn			225
+#define __NR_rt_sigaction			226
+#define __NR_rt_sigprocmask			227
+#define __NR_rt_sigpending			228
+#define __NR_rt_sigtimedwait			229
+#define __NR_rt_sigqueueinfo			230
+#define __NR_rt_sigsuspend			231
+#define __NR_mq_open				232
+#define __NR_mq_unlink				233
+#define __NR_mq_timedsend			234
+#define __NR_mq_timedreceive			235
+#define __NR_mq_notify				236
+#define __NR_mq_getsetattr			237
+#define __NR_available238			238
+#define __NR_io_setup				239
+#define __NR_io_destroy				240
+#define __NR_io_submit				241
+#define __NR_io_getevents			242
+#define __NR_io_cancel				243
+#define __NR_clock_settime			244
+#define __NR_clock_gettime			245
+#define __NR_clock_getres			246
+#define __NR_clock_nanosleep			247
+#define __NR_timer_create			248
+#define __NR_timer_delete			249
+#define __NR_timer_settime			250
+#define __NR_timer_gettime			251
+#define __NR_timer_getoverrun			252
+#define __NR_reserved253			253
+#define __NR_lookup_dcookie			254
+#define __NR_available255			255
+#define __NR_add_key				256
+#define __NR_request_key			257
+#define __NR_keyctl				258
+#define __NR_available259			259
+#define __NR_readahead				260
+#define __NR_remap_file_pages			261
+#define __NR_migrate_pages			262
+#define __NR_mbind				263
+#define __NR_get_mempolicy			264
+#define __NR_set_mempolicy			265
+#define __NR_unshare				266
+#define __NR_move_pages				267
+#define __NR_splice				268
+#define __NR_tee				269
+#define __NR_vmsplice				270
+#define __NR_available271			271
+#define __NR_pselect6				272
+#define __NR_ppoll				273
+#define __NR_epoll_pwait			274
+#define __NR_epoll_create1			275
+#define __NR_inotify_init			276
+#define __NR_inotify_add_watch			277
+#define __NR_inotify_rm_watch			278
+#define __NR_inotify_init1			279
+#define __NR_getcpu				280
+#define __NR_kexec_load				281
+#define __NR_ioprio_set				282
+#define __NR_ioprio_get				283
+#define __NR_set_robust_list			284
+#define __NR_get_robust_list			285
+#define __NR_available286			286
+#define __NR_available287			287
+#define __NR_openat				288
+#define __NR_mkdirat				289
+#define __NR_mknodat				290
+#define __NR_unlinkat				291
+#define __NR_renameat				292
+#define __NR_linkat				293
+#define __NR_symlinkat				294
+#define __NR_readlinkat				295
+#define __NR_utimensat				296
+#define __NR_fchownat				297
+#define __NR_futimesat				298
+#define __NR_fstatat64				299
+#define __NR_fchmodat				300
+#define __NR_faccessat				301
+#define __NR_available302			302
+#define __NR_available303			303
+#define __NR_signalfd				304
+#define __NR_eventfd				306
+#define __NR_recvmmsg				307
+#define __NR_setns				308
+#define __NR_signalfd4				309
+#define __NR_dup3				310
+#define __NR_pipe2				311
+#define __NR_timerfd_create			312
+#define __NR_timerfd_settime			313
+#define __NR_timerfd_gettime			314
+#define __NR_available315			315
+#define __NR_eventfd2				316
+#define __NR_preadv				317
+#define __NR_pwritev				318
+#define __NR_available319			319
+#define __NR_fanotify_init			320
+#define __NR_fanotify_mark			321
+#define __NR_process_vm_readv			322
+#define __NR_process_vm_writev			323
+#define __NR_name_to_handle_at			324
+#define __NR_open_by_handle_at			325
+#define __NR_sync_file_range2			326
+#define __NR_perf_event_open			327
+#define __NR_rt_tgsigqueueinfo			328
+#define __NR_clock_adjtime			329
+#define __NR_prlimit64				330
+#define __NR_kcmp				331
+#define __NR_finit_module			332
+#define __NR_accept4				333
+#define __NR_sched_setattr			334
+#define __NR_sched_getattr			335
+#define __NR_renameat2				336
+#define __NR_seccomp				337
+#define __NR_getrandom				338
+#define __NR_memfd_create			339
+#define __NR_bpf				340
+#define __NR_execveat				341
+#define __NR_userfaultfd			342
+#define __NR_membarrier				343
+#define __NR_mlock2				344
+#define __NR_copy_file_range			345
+#define __NR_preadv2				346
+#define __NR_pwritev2				347
+#define __NR_pkey_mprotect			348
+#define __NR_pkey_alloc				349
+#define __NR_pkey_free				350
+#define __NR_statx				351
+#define __NR_rseq				352
+#define __NR_clock_gettime64			403
+#define __NR_clock_settime64			404
+#define __NR_clock_adjtime64			405
+#define __NR_clock_getres_time64		406
+#define __NR_clock_nanosleep_time64		407
+#define __NR_timer_gettime64			408
+#define __NR_timer_settime64			409
+#define __NR_timerfd_gettime64			410
+#define __NR_timerfd_settime64			411
+#define __NR_utimensat_time64			412
+#define __NR_pselect6_time64			413
+#define __NR_ppoll_time64			414
+#define __NR_io_pgetevents_time64		416
+#define __NR_recvmmsg_time64			417
+#define __NR_mq_timedsend_time64		418
+#define __NR_mq_timedreceive_time64		419
+#define __NR_semtimedop_time64			420
+#define __NR_rt_sigtimedwait_time64		421
+#define __NR_futex_time64			422
+#define __NR_sched_rr_get_interval_time64	423
+#define __NR_pidfd_send_signal			424
+#define __NR_io_uring_setup			425
+#define __NR_io_uring_enter			426
+#define __NR_io_uring_register			427
+#define __NR_open_tree				428
+#define __NR_move_mount				429
+#define __NR_fsopen				430
+#define __NR_fsconfig				431
+#define __NR_fsmount				432
+#define __NR_fspick				433
+#define __NR_pidfd_open				434
+#define __NR_clone3				435
+#define __NR_close_range			436
+#define __NR_openat2				437
+#define __NR_pidfd_getfd			438
+#define __NR_faccessat2				439
+#define __NR_process_madvise			440
+#define __NR_epoll_pwait2			441
+#define __NR_mount_setattr			442
+#define __NR_quotactl_fd			443
+#define __NR_landlock_create_ruleset		444
+#define __NR_landlock_add_rule			445
+#define __NR_landlock_restrict_self		446
+#define __NR_process_mrelease			448
+#define __NR_futex_waitv			449
+#define __NR_set_mempolicy_home_node		450
diff --git a/arch/xtensa/bits/termios.h b/arch/xtensa/bits/termios.h
new file mode 100644
index 00000000..ff1cbb25
--- /dev/null
+++ b/arch/xtensa/bits/termios.h
@@ -0,0 +1,167 @@
+struct termios
+{
+	tcflag_t c_iflag;
+	tcflag_t c_oflag;
+	tcflag_t c_cflag;
+	tcflag_t c_lflag;
+	cc_t c_line;
+	cc_t c_cc[NCCS];
+	speed_t __c_ispeed;
+	speed_t __c_ospeed;
+};
+
+#define VINTR 0
+#define VQUIT 1
+#define VERASE 2
+#define VKILL 3
+#define VEOF 4
+#define VTIME 5
+#define VMIN 6
+#define VSWTC 7
+#define VSTART 8
+#define VSTOP 9
+#define VSUSP 10
+#define VEOL 11
+#define VREPRINT 12
+#define VDISCARD 13
+#define VWERASE 14
+#define VLNEXT 15
+#define VEOL2 16
+
+#define IGNBRK	0000001
+#define BRKINT	0000002
+#define IGNPAR	0000004
+#define PARMRK	0000010
+#define INPCK	0000020
+#define ISTRIP	0000040
+#define INLCR	0000100
+#define IGNCR	0000200
+#define ICRNL	0000400
+#define IUCLC	0001000
+#define IXON	0002000
+#define IXANY	0004000
+#define IXOFF	0010000
+#define IMAXBEL	0020000
+#define IUTF8	0040000
+
+#define OPOST	0000001
+#define OLCUC	0000002
+#define ONLCR	0000004
+#define OCRNL	0000010
+#define ONOCR	0000020
+#define ONLRET	0000040
+#define OFILL	0000100
+#define OFDEL	0000200
+#define NLDLY	0000400
+#define   NL0	0000000
+#define   NL1	0000400
+#define CRDLY	0003000
+#define   CR0	0000000
+#define   CR1	0001000
+#define   CR2	0002000
+#define   CR3	0003000
+#define TABDLY	0014000
+#define   TAB0	0000000
+#define   TAB1	0004000
+#define   TAB2	0010000
+#define   TAB3	0014000
+#define   XTABS	0014000
+#define BSDLY	0020000
+#define   BS0	0000000
+#define   BS1	0020000
+#define VTDLY	0040000
+#define   VT0	0000000
+#define   VT1	0040000
+#define FFDLY	0100000
+#define   FF0	0000000
+#define   FF1	0100000
+
+#define B0       0000000
+#define B50      0000001
+#define B75      0000002
+#define B110     0000003
+#define B134     0000004
+#define B150     0000005
+#define B200     0000006
+#define B300     0000007
+#define B600     0000010
+#define B1200    0000011
+#define B1800    0000012
+#define B2400    0000013
+#define B4800    0000014
+#define B9600    0000015
+#define B19200   0000016
+#define B38400   0000017
+#define EXTA     0000016
+#define EXTB     0000017
+
+#define BOTHER   0010000
+#define B57600   0010001
+#define B115200  0010002
+#define B230400  0010003
+#define B460800  0010004
+#define B500000  0010005
+#define B576000  0010006
+#define B921600  0010007
+#define B1000000 0010010
+#define B1152000 0010011
+#define B1500000 0010012
+#define B2000000 0010013
+#define B2500000 0010014
+#define B3000000 0010015
+#define B3500000 0010016
+#define B4000000 0010017
+
+#define CBAUD    0010017
+
+#define CSIZE  0000060
+#define CS5    0000000
+#define CS6    0000020
+#define CS7    0000040
+#define CS8    0000060
+#define CSTOPB 0000100
+#define CREAD  0000200
+#define PARENB 0000400
+#define PARODD 0001000
+#define HUPCL  0002000
+#define CLOCAL 0004000
+
+#define ISIG	0000001
+#define ICANON	0000002
+#define XCASE	0000004
+#define ECHO	0000010
+#define ECHOE	0000020
+#define ECHOK	0000040
+#define ECHONL	0000100
+#define NOFLSH	0000200
+#define TOSTOP	0000400
+#define ECHOCTL	0001000
+#define ECHOPRT	0002000
+#define ECHOKE	0004000
+#define FLUSHO	0010000
+#define PENDIN	0040000
+#define IEXTEN	0100000
+#define EXTPROC	0200000
+
+#define TCOOFF 0
+#define TCOON  1
+#define TCIOFF 2
+#define TCION  3
+
+#define TCIFLUSH  0
+#define TCOFLUSH  1
+#define TCIOFLUSH 2
+
+#define TCSANOW   0
+#define TCSADRAIN 1
+#define TCSAFLUSH 2
+
+#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
+#define CBAUDEX 0010000
+#define CIBAUD   002003600000
+#define IBSHIFT 16
+#define CMSPAR   010000000000
+#define CRTSCTS  020000000000
+#define EXTPROC 0200000
+#define XTABS  0014000
+#endif
diff --git a/arch/xtensa/bits/user.h b/arch/xtensa/bits/user.h
new file mode 100644
index 00000000..8ac7526f
--- /dev/null
+++ b/arch/xtensa/bits/user.h
@@ -0,0 +1,4 @@
+#define ELF_NGREG 128
+#define ELF_NFPREG 18
+typedef unsigned long elf_greg_t, elf_gregset_t[ELF_NGREG];
+typedef unsigned int elf_fpreg_t, elf_fpregset_t[ELF_NFPREG];
diff --git a/arch/xtensa/bits/xtensa-config.h b/arch/xtensa/bits/xtensa-config.h
new file mode 100644
index 00000000..a2e95904
--- /dev/null
+++ b/arch/xtensa/bits/xtensa-config.h
@@ -0,0 +1,8 @@
+#undef XCHAL_NUM_AREGS
+#define XCHAL_NUM_AREGS			__XCHAL_NUM_AREGS
+
+#undef XCHAL_HAVE_S32C1I
+#define XCHAL_HAVE_S32C1I		__XCHAL_HAVE_S32C1I
+
+#undef XCHAL_HAVE_EXCLUSIVE
+#define XCHAL_HAVE_EXCLUSIVE		__XCHAL_HAVE_EXCLUSIVE
diff --git a/arch/xtensa/crt_arch.h b/arch/xtensa/crt_arch.h
new file mode 100644
index 00000000..1019c186
--- /dev/null
+++ b/arch/xtensa/crt_arch.h
@@ -0,0 +1,48 @@
+#ifdef __FDPIC__
+
+__asm__(
+".text \n"
+".global " START " \n"
+START ": \n"
+"	.begin	no-transform\n"
+"	call0	1f\n"
+"2:\n"
+"	.end	no-transform\n"
+"	.align	4\n"
+"	.literal_position\n"
+"1:\n"
+"	movi	a15, 2b\n"
+"	sub	a15, a0, a15\n"
+
+"	mov	a12, a4\n"
+"	mov	a13, a5\n"
+"	mov	a14, a6\n"
+#ifndef SHARED
+"	mov	a2, a4\n"
+"	movi	a3, __ROFIXUP_LIST__\n"
+"	add	a3, a3, a15\n"
+"	movi	a4, __ROFIXUP_END__\n"
+"	add	a4, a4, a15\n"
+"	movi	a0, __fdpic_fixup\n"
+"	add	a0, a0, a15\n"
+"	callx0	a0\n"
+"	mov	a11, a2\n"
+"	s32i	a15, a11, 12\n"
+#endif
+"	addi	a7, a1, -8\n"
+"	s32i	a12, a7, 0\n"
+"	s32i	a13, a7, 4\n"
+"	mov	a2, a1\n"
+"	mov	a3, a14\n"
+"	movi	a4, -16\n"
+"	and	a1, a7, a4\n"
+"	movi	a0, "START"_c\n"
+"	add	a0, a0, a15\n"
+"	callx0	a0\n"
+);
+
+#ifndef SHARED
+#include "fdpic_crt.h"
+#endif
+
+#endif
diff --git a/arch/xtensa/kstat.h b/arch/xtensa/kstat.h
new file mode 100644
index 00000000..9aeaf7e3
--- /dev/null
+++ b/arch/xtensa/kstat.h
@@ -0,0 +1,18 @@
+struct kstat {
+	dev_t st_dev;
+	ino_t st_ino;
+	mode_t st_mode;
+	nlink_t st_nlink;
+	uid_t st_uid;
+	gid_t st_gid;
+	dev_t st_rdev;
+	off_t st_size;
+	blksize_t st_blksize;
+	blkcnt_t st_blocks;
+	long st_atime_sec;
+	long st_atime_nsec;
+	long st_mtime_sec;
+	long st_mtime_nsec;
+	long st_ctime_sec;
+	long st_ctime_nsec;
+};
diff --git a/arch/xtensa/pthread_arch.h b/arch/xtensa/pthread_arch.h
new file mode 100644
index 00000000..79f53ddd
--- /dev/null
+++ b/arch/xtensa/pthread_arch.h
@@ -0,0 +1,11 @@
+static inline uintptr_t __get_tp()
+{
+	uintptr_t tp;
+	__asm__ __volatile__ ("rur %0, threadptr" : "=a" (tp));
+	return tp;
+}
+
+#define TLS_ABOVE_TP
+#define GAP_ABOVE_TP 8
+
+#define MC_PC sc_pc
diff --git a/arch/xtensa/reloc.h b/arch/xtensa/reloc.h
new file mode 100644
index 00000000..03482f18
--- /dev/null
+++ b/arch/xtensa/reloc.h
@@ -0,0 +1,38 @@
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN_SUFFIX "eb"
+#else
+#define ENDIAN_SUFFIX ""
+#endif
+
+#if __FDPIC__
+#define ABI_SUFFIX "-fdpic"
+#else
+#define ABI_SUFFIX ""
+#endif
+
+#define LDSO_ARCH "xtensa" ENDIAN_SUFFIX ABI_SUFFIX
+
+#define TPOFF_K 8
+
+#define REL_PLT         R_XTENSA_JMP_SLOT
+#define REL_RELATIVE    R_XTENSA_RELATIVE
+#define REL_GOT         R_XTENSA_GLOB_DAT
+#define REL_COPY        R_XTENSA_32
+#define REL_TLSDESC     R_XTENSA_TLSDESC_FN
+
+#if __FDPIC__
+#define REL_FUNCDESC	R_XTENSA_FUNCDESC
+#define REL_FUNCDESC_VAL R_XTENSA_FUNCDESC_VALUE
+
+#define DL_FDPIC 1
+#define DL_NOMMU_SUPPORT 1
+
+#define CRTJMP(pc,sp) do { \
+	register size_t a4 __asm__("a4") = ((size_t *)(sp))[-2]; \
+	__asm__ __volatile__( "mov a1, %1 ; jx %0" \
+	: : "r"(pc), "r"(sp), "r"(a4) : "memory" ); } while(0)
+
+#define GETFUNCSYM(fp, sym, got) __asm__ ( \
+	"movi %0, " #sym "@GOTOFFFUNCDESC ; add %0, %0, %1" \
+	: "=&a"(*fp) : "a"(got) : "memory" )
+#endif
diff --git a/arch/xtensa/syscall_arch.h b/arch/xtensa/syscall_arch.h
new file mode 100644
index 00000000..63ee4e96
--- /dev/null
+++ b/arch/xtensa/syscall_arch.h
@@ -0,0 +1,104 @@
+#define __SYSCALL_LL_E(x) \
+((union { long long ll; long l[2]; }){ .ll = x }).l[0], \
+((union { long long ll; long l[2]; }){ .ll = x }).l[1]
+#define __SYSCALL_LL_O(x) 0, __SYSCALL_LL_E((x))
+
+static inline long __syscall0(long n)
+{
+	register long a2 __asm__("a2") = n;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      :
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall1(long n, long a)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6)
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall2(long n, long a, long b)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+	register long a3 __asm__("a3") = b;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6), "a"(a3)
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall3(long n, long a, long b, long c)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+	register long a3 __asm__("a3") = b;
+	register long a4 __asm__("a4") = c;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6), "a"(a3), "a"(a4)
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall4(long n, long a, long b, long c, long d)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+	register long a3 __asm__("a3") = b;
+	register long a4 __asm__("a4") = c;
+	register long a5 __asm__("a5") = d;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6), "a"(a3), "a"(a4), "a"(a5)
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall5(long n, long a, long b, long c, long d, long e)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+	register long a3 __asm__("a3") = b;
+	register long a4 __asm__("a4") = c;
+	register long a5 __asm__("a5") = d;
+	register long a8 __asm__("a8") = e;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6), "a"(a3), "a"(a4), "a"(a5), "a"(a8)
+			      : "memory");
+	return a2;
+}
+
+static inline long __syscall6(long n, long a, long b, long c, long d, long e, long f)
+{
+	register long a2 __asm__("a2") = n;
+	register long a6 __asm__("a6") = a;
+	register long a3 __asm__("a3") = b;
+	register long a4 __asm__("a4") = c;
+	register long a5 __asm__("a5") = d;
+	register long a8 __asm__("a8") = e;
+	register long a9 __asm__("a9") = f;
+
+	__asm__ __volatile__ ("syscall"
+			      : "+&a"(a2)
+			      : "a"(a6), "a"(a3), "a"(a4), "a"(a5), "a"(a8), "a"(a9)
+			      : "memory");
+	return a2;
+}
+
+#define SYSCALL_FADVISE_6_ARG
diff --git a/configure b/configure
index 853bf05e..8cfc3111 100755
--- a/configure
+++ b/configure
@@ -338,6 +338,7 @@ powerpc*|ppc*) ARCH=powerpc ;;
 riscv64*) ARCH=riscv64 ;;
 sh[1-9bel-]*|sh|superh*) ARCH=sh ;;
 s390x*) ARCH=s390x ;;
+xtensa*) ARCH=xtensa ;;
 unknown) fail "$0: unable to detect target arch; try $0 --target=..." ;;
 *) fail "$0: unknown or unsupported target \"$target\"" ;;
 esac
diff --git a/crt/xtensa/crti.S b/crt/xtensa/crti.S
new file mode 100644
index 00000000..82a837e7
--- /dev/null
+++ b/crt/xtensa/crti.S
@@ -0,0 +1,29 @@
+.section .init
+.global  _init
+.type    _init, @function
+_init:
+#if defined(__XTENSA_CALL0_ABI__)
+	addi	sp, sp, -16
+	s32i	a0, sp, 0
+#ifdef __FDPIC__
+	s32i	a12, sp, 4
+	mov	a12, a11
+#endif
+#else
+#error Unsupported Xtensa ABI
+#endif
+
+.section .fini
+.global  _fini
+.type    _fini, @function
+_fini:
+#if defined(__XTENSA_CALL0_ABI__)
+	addi	sp, sp, -16
+	s32i	a0, sp, 0
+#ifdef __FDPIC__
+	s32i	a12, sp, 4
+	mov	a12, a11
+#endif
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/crt/xtensa/crtn.S b/crt/xtensa/crtn.S
new file mode 100644
index 00000000..5f93b193
--- /dev/null
+++ b/crt/xtensa/crtn.S
@@ -0,0 +1,23 @@
+.section .init
+#if defined(__XTENSA_CALL0_ABI__)
+#ifdef __FDPIC__
+	l32i	a12, sp, 4
+#endif
+	l32i	a0, sp, 0
+	addi	sp, sp, 16
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
+
+.section .fini
+#if defined(__XTENSA_CALL0_ABI__)
+#ifdef __FDPIC__
+	l32i	a12, sp, 4
+#endif
+	l32i	a0, sp, 0
+	addi	sp, sp, 16
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/include/elf.h b/include/elf.h
index 23f2c4bc..f472c808 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -3293,6 +3293,76 @@ enum
 #define R_RISCV_SET32           56
 #define R_RISCV_32_PCREL        57
 
+#define R_XTENSA_NONE		0
+#define R_XTENSA_32		1
+#define R_XTENSA_RTLD		2
+#define R_XTENSA_GLOB_DAT	3
+#define R_XTENSA_JMP_SLOT	4
+#define R_XTENSA_RELATIVE	5
+#define R_XTENSA_PLT		6
+#define R_XTENSA_OP0		8
+#define R_XTENSA_OP1		9
+#define R_XTENSA_OP2		10
+#define R_XTENSA_ASM_EXPAND	11
+#define R_XTENSA_ASM_SIMPLIFY	12
+#define R_XTENSA_32_PCREL	14
+#define R_XTENSA_GNU_VTINHERIT	15
+#define R_XTENSA_GNU_VTENTRY	16
+#define R_XTENSA_DIFF8		17
+#define R_XTENSA_DIFF16		18
+#define R_XTENSA_DIFF32		19
+#define R_XTENSA_SLOT0_OP	20
+#define R_XTENSA_SLOT1_OP	21
+#define R_XTENSA_SLOT2_OP	22
+#define R_XTENSA_SLOT3_OP	23
+#define R_XTENSA_SLOT4_OP	24
+#define R_XTENSA_SLOT5_OP	25
+#define R_XTENSA_SLOT6_OP	26
+#define R_XTENSA_SLOT7_OP	27
+#define R_XTENSA_SLOT8_OP	28
+#define R_XTENSA_SLOT9_OP	29
+#define R_XTENSA_SLOT10_OP	30
+#define R_XTENSA_SLOT11_OP	31
+#define R_XTENSA_SLOT12_OP	32
+#define R_XTENSA_SLOT13_OP	33
+#define R_XTENSA_SLOT14_OP	34
+#define R_XTENSA_SLOT0_ALT	35
+#define R_XTENSA_SLOT1_ALT	36
+#define R_XTENSA_SLOT2_ALT	37
+#define R_XTENSA_SLOT3_ALT	38
+#define R_XTENSA_SLOT4_ALT	39
+#define R_XTENSA_SLOT5_ALT	40
+#define R_XTENSA_SLOT6_ALT	41
+#define R_XTENSA_SLOT7_ALT	42
+#define R_XTENSA_SLOT8_ALT	43
+#define R_XTENSA_SLOT9_ALT	44
+#define R_XTENSA_SLOT10_ALT	45
+#define R_XTENSA_SLOT11_ALT	46
+#define R_XTENSA_SLOT12_ALT	47
+#define R_XTENSA_SLOT13_ALT	48
+#define R_XTENSA_SLOT14_ALT	49
+#define R_XTENSA_TLSDESC_FN	50
+#define R_XTENSA_TLSDESC_ARG	51
+#define R_XTENSA_TLS_DTPOFF	52
+#define R_XTENSA_TLS_TPOFF	53
+#define R_XTENSA_TLS_FUNC	54
+#define R_XTENSA_TLS_ARG	55
+#define R_XTENSA_TLS_CALL	56
+#define R_XTENSA_PDIFF8		57
+#define R_XTENSA_PDIFF16	58
+#define R_XTENSA_PDIFF32	59
+#define R_XTENSA_NDIFF8		60
+#define R_XTENSA_NDIFF16	61
+#define R_XTENSA_NDIFF32	62
+#define R_XTENSA_GOT		63
+#define R_XTENSA_GOTOFF		64
+#define R_XTENSA_GOTFUNCDESC	65
+#define R_XTENSA_GOTOFFFUNCDESC	66
+#define R_XTENSA_FUNCDESC	67
+#define R_XTENSA_FUNCDESC_VALUE	68
+
+#define R_XTENSA_NUM		69
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/internal/xtensa/syscall.S b/src/internal/xtensa/syscall.S
new file mode 100644
index 00000000..f09a46e8
--- /dev/null
+++ b/src/internal/xtensa/syscall.S
@@ -0,0 +1,25 @@
+.global __syscall
+.hidden __syscall
+.type   __syscall,@function
+.align 4
+__syscall:
+#ifdef __XTENSA_WINDOWED_ABI__
+	entry	a1, 16
+#endif
+	mov	a8, a3
+	mov	a3, a4
+	mov	a4, a5
+	mov	a5, a6
+	mov	a6, a8
+	mov	a8, a7
+#if defined(__XTENSA_WINDOWED_ABI__)
+	l32i	a9, a1, 16
+	syscall
+	retw
+#elif defined(__XTENSA_CALL0_ABI__)
+	l32i	a9, a1, 0
+	syscall
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/ldso/xtensa/tlsdesc.S b/src/ldso/xtensa/tlsdesc.S
new file mode 100644
index 00000000..084d5f3e
--- /dev/null
+++ b/src/ldso/xtensa/tlsdesc.S
@@ -0,0 +1,36 @@
+.global __tlsdesc_static
+.hidden __tlsdesc_static
+.type __tlsdesc_static,@function
+.align 4
+__tlsdesc_static:
+#ifdef __XTENSA_WINDOWED_ABI__
+	entry	a1, 16
+#endif
+	rur	a3, threadptr
+	add	a2, a2, a3
+#if defined(__XTENSA_WINDOWED_ABI__)
+	retw
+#elif defined(__XTENSA_CALL0_ABI__)
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
+
+.hidden __tls_get_new
+
+.global __tlsdesc_dynamic
+.hidden __tlsdesc_dynamic
+.type __tlsdesc_dynamic,@function
+.align 4
+__tlsdesc_dynamic:
+#if defined(__XTENSA_WINDOWED_ABI__)
+	entry	a1, 16
+	mov	a6, a2
+	call4	__tls_get_addr
+	mov	a2, a6
+	retw
+#elif defined(__XTENSA_CALL0_ABI__)
+	j	__tls_get_addr
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/process/xtensa/vfork.S b/src/process/xtensa/vfork.S
new file mode 100644
index 00000000..25478b87
--- /dev/null
+++ b/src/process/xtensa/vfork.S
@@ -0,0 +1,21 @@
+.global vfork
+.type vfork,@function
+vfork:
+#if defined(__XTENSA_CALL0_ABI__)
+	movi	a2, 116 # __NR_clone
+	movi	a3, 0
+	movi	a6, 0x4111 # CLONE_VM | CLONE_VFORK | SIGCHLD
+	syscall
+
+#ifdef __FDPIC__
+.hidden __syscall_ret
+	movi	a9, __syscall_ret@GOT
+	add	a9, a9, a11
+	l32i	a9, a9, 0
+	jx	a9
+#else
+#error Unsupported Xtensa ABI
+#endif
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/setjmp/xtensa/longjmp.S b/src/setjmp/xtensa/longjmp.S
new file mode 100644
index 00000000..602093c9
--- /dev/null
+++ b/src/setjmp/xtensa/longjmp.S
@@ -0,0 +1,22 @@
+.global _longjmp
+.global longjmp
+.type _longjmp,@function
+.type longjmp,@function
+.align 4
+_longjmp:
+longjmp:
+#if defined(__XTENSA_CALL0_ABI__)
+	l32i	a0, a2, 0
+	l32i	a1, a2, 4
+	l32i	a12, a2, 8
+	l32i	a13, a2, 12
+	l32i	a14, a2, 16
+	l32i	a15, a2, 20
+
+	movi	a2, 1
+	movnez	a2, a3, a3
+
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/setjmp/xtensa/setjmp.S b/src/setjmp/xtensa/setjmp.S
new file mode 100644
index 00000000..3a735a6a
--- /dev/null
+++ b/src/setjmp/xtensa/setjmp.S
@@ -0,0 +1,25 @@
+.global ___setjmp
+.hidden ___setjmp
+.global __setjmp
+.global _setjmp
+.global setjmp
+.type __setjmp,@function
+.type _setjmp,@function
+.type setjmp,@function
+.align 4
+___setjmp:
+__setjmp:
+_setjmp:
+setjmp:
+#if defined(__XTENSA_CALL0_ABI__)
+	s32i	a0, a2, 0
+	s32i	a1, a2, 4
+	s32i	a12, a2, 8
+	s32i	a13, a2, 12
+	s32i	a14, a2, 16
+	s32i	a15, a2, 20
+	movi	a2, 0
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/signal/xtensa/restore.s b/src/signal/xtensa/restore.s
new file mode 100644
index 00000000..e03e0fb7
--- /dev/null
+++ b/src/signal/xtensa/restore.s
@@ -0,0 +1,10 @@
+.global __restore
+.global __restore_rt
+.type __restore,@function
+.type __restore_rt,@function
+.space 1
+.align 4
+__restore:
+__restore_rt:
+	movi	a2, 225 # SYS_rt_sigreturn
+	syscall
diff --git a/src/signal/xtensa/sigsetjmp.S b/src/signal/xtensa/sigsetjmp.S
new file mode 100644
index 00000000..e44bba88
--- /dev/null
+++ b/src/signal/xtensa/sigsetjmp.S
@@ -0,0 +1,24 @@
+.global sigsetjmp
+.global __sigsetjmp
+.type sigsetjmp,@function
+.type __sigsetjmp,@function
+.align 4
+sigsetjmp:
+__sigsetjmp:
+#if defined(__XTENSA_CALL0_ABI__)
+	bnez	a3, 1f
+.hidden ___setjmp
+	j	___setjmp
+1:
+	mov	a3, a0
+	mov	a4, a2
+	call0	___setjmp
+	mov	a0, a3
+	mov	a3, a2
+	mov	a2, a4
+
+.hidden __sigsetjmp_tail
+	j	__sigsetjmp_tail
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/thread/xtensa/__set_thread_area.S b/src/thread/xtensa/__set_thread_area.S
new file mode 100644
index 00000000..1f402125
--- /dev/null
+++ b/src/thread/xtensa/__set_thread_area.S
@@ -0,0 +1,16 @@
+.global __set_thread_area
+.type   __set_thread_area,@function
+.align 4
+__set_thread_area:
+#ifdef __XTENSA_WINDOWED_ABI__
+	entry	a1, 16
+#endif
+	wur	a2, threadptr
+	movi	a2, 0
+#if defined(__XTENSA_WINDOWED_ABI__)
+	retw
+#elif defined(__XTENSA_CALL0_ABI__)
+	ret
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/thread/xtensa/__unmapself.S b/src/thread/xtensa/__unmapself.S
new file mode 100644
index 00000000..15bf2bdf
--- /dev/null
+++ b/src/thread/xtensa/__unmapself.S
@@ -0,0 +1,13 @@
+.global __unmapself
+.type   __unmapself,@function
+.align 4
+__unmapself:
+#if defined(__XTENSA_CALL0_ABI__)
+	mov	a6, a2
+	movi	a2, 81 # SYS_munmap
+	syscall
+	movi	a2, 118 # SYS_exit
+	syscall
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/thread/xtensa/clone.S b/src/thread/xtensa/clone.S
new file mode 100644
index 00000000..8e8514d6
--- /dev/null
+++ b/src/thread/xtensa/clone.S
@@ -0,0 +1,46 @@
+# __clone(func, stack, flags, arg, ptid, tls, ctid)
+#         a2,   a3,    a4,    a5,  a6,   a7,  [sp]
+
+# syscall(SYS_clone, flags, stack, ptid, tls, ctid)
+#         a2,        a6,    a3,    a4,   a5,  a8
+
+.global __clone
+.type   __clone,@function
+.align 4
+__clone:
+#if defined(__XTENSA_CALL0_ABI__)
+	# align stack and save func,arg
+	srli	a3, a3, 4
+	slli	a3, a3, 4
+	addi	a3, a3, -16
+	s32i	a2, a3, 0
+	s32i	a5, a3, 4
+
+	# syscall
+	mov	a2, a4
+	mov	a4, a6
+	mov	a6, a2
+	mov	a5, a7
+	l32i	a8, a1, 16
+	movi	a2, 116 # SYS_clone
+	syscall
+
+	beqz	a2, 1f
+	# parent
+	ret
+
+	# child
+1:
+	l32i	a0, a1, 0
+	l32i	a2, a1, 4
+#ifdef __FDPIC__
+	l32i	a11, a0, 4
+	l32i	a0, a0, 0
+#endif
+	callx0	a0
+	mov	a6, a2
+	movi	a2, 118 # SYS_exit
+	syscall
+#else
+#error Unsupported Xtensa ABI
+#endif
diff --git a/src/thread/xtensa/syscall_cp.S b/src/thread/xtensa/syscall_cp.S
new file mode 100644
index 00000000..de7c3e66
--- /dev/null
+++ b/src/thread/xtensa/syscall_cp.S
@@ -0,0 +1,38 @@
+# __syscall_cp_asm(&self->cancel, nr, u, v, w, x, y, z)
+#                  a2             a3  a4 a5 a6 a7 [sp] [sp+4]
+
+# syscall(nr, u, v, w, x, y, z)
+#         a2  a6 a3 a4 a5 a8 a9
+
+.global __cp_begin
+.hidden __cp_begin
+.global __cp_end
+.hidden __cp_end
+.global __cp_cancel
+.hidden __cp_cancel
+.hidden __cancel
+.global __syscall_cp_asm
+.hidden __syscall_cp_asm
+.type __syscall_cp_asm,@function
+.align 4
+#if defined(__XTENSA_CALL0_ABI__)
+__syscall_cp_asm:
+__cp_begin:
+	l32i	a2, a2, 0
+	bnez	a2, __cp_cancel
+	mov	a2, a4
+	mov	a4, a6
+	mov	a6, a2
+	mov	a2, a3
+	mov	a3, a5
+	mov	a5, a7
+	l32i	a8, a1, 0
+	l32i	a9, a1, 4
+	syscall
+__cp_end:
+	ret
+__cp_cancel:
+	j	__cancel
+#else
+#error Unsupported Xtensa ABI
+#endif
-- 
2.21.0


[-- Attachment #3: 0002-WIP.patch --]
[-- Type: text/plain, Size: 3181 bytes --]

From f8c5953ecdb948282cc8e573b729c25db60a95a8 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Wed, 21 Feb 2024 08:27:37 -0800
Subject: [PATCH 2/2] WIP

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 ldso/dlstart.c         | 7 +++++++
 ldso/dynlink.c         | 6 ++++--
 src/internal/dynlink.h | 5 +++--
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/ldso/dlstart.c b/ldso/dlstart.c
index 259f5e18..beca953f 100644
--- a/ldso/dlstart.c
+++ b/ldso/dlstart.c
@@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
 				- segs[rel_addr[1]].p_vaddr
 				+ syms[R_SYM(rel[1])].st_value;
 			rel_addr[1] = dyn[DT_PLTGOT];
+		} else if (R_TYPE(rel[1]) == REL_RELATIVE) {
+			size_t val = *rel_addr;
+			for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
+			*rel_addr += segs[j].addr - segs[j].p_vaddr;
 		} else {
 			size_t val = syms[R_SYM(rel[1])].st_value;
 			for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
 			*rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
 		}
 	}
+#ifdef __xtensa__
+	((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
+#endif
 #else
 	/* If the dynamic linker is invoked as a command, its load
 	 * address is not available in the aux vector. Instead, compute
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index ceca3c98..25563af3 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
 		if (!DL_FDPIC)
 			do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
 
+#if 0
 		if (head != &ldso && p->relro_start != p->relro_end) {
 			long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
 				p->relro_end-p->relro_start, PROT_READ);
@@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
 				if (runtime) longjmp(*rtld_fail, 1);
 			}
 		}
+#endif
 
 		p->relocated = 1;
 	}
@@ -1485,7 +1487,7 @@ void __libc_exit_fini()
 		if (dyn[0] & (1<<DT_FINI_ARRAY)) {
 			size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
 			size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
-			while (n--) ((void (*)(void))*--fn)();
+			while (n--) fpaddr(p, *--fn)();
 		}
 #ifndef NO_LEGACY_INITFINI
 		if ((dyn[0] & (1<<DT_FINI)) && dyn[DT_FINI])
@@ -1603,7 +1605,7 @@ static void do_init_fini(struct dso **queue)
 		if (dyn[0] & (1<<DT_INIT_ARRAY)) {
 			size_t n = dyn[DT_INIT_ARRAYSZ]/sizeof(size_t);
 			size_t *fn = laddr(p, dyn[DT_INIT_ARRAY]);
-			while (n--) ((void (*)(void))*fn++)();
+			while (n--) fpaddr(p, *fn++)();
 		}
 
 		pthread_mutex_lock(&init_fini_lock);
diff --git a/src/internal/dynlink.h b/src/internal/dynlink.h
index 06f41d09..6485e883 100644
--- a/src/internal/dynlink.h
+++ b/src/internal/dynlink.h
@@ -78,10 +78,11 @@ struct fdpic_dummy_loadmap {
 	(R_TYPE(x) == REL_RELATIVE) || \
 	(R_TYPE(x) == REL_SYM_OR_REL && !R_SYM(x)) )
 #else
-#define IS_RELATIVE(x,s) ( ( \
+#define IS_RELATIVE(x,s) ( \
+	(R_TYPE(x) == REL_RELATIVE) || ( ( \
 	(R_TYPE(x) == REL_FUNCDESC_VAL) || \
 	(R_TYPE(x) == REL_SYMBOLIC) ) \
-	&& (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) )
+	&& (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) ) )
 #endif
 
 #ifndef NEED_MIPS_GOT_RELOCS
-- 
2.21.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-27 23:24 [musl] Initial xtensa/fdpic port review Rich Felker
@ 2024-02-28  0:13 ` Rich Felker
  2024-02-28 17:20   ` Max Filippov
  0 siblings, 1 reply; 17+ messages in thread
From: Rich Felker @ 2024-02-28  0:13 UTC (permalink / raw)
  To: musl; +Cc: Max Filippov

First of all, I'm excited to see new work on this and especially the
FDPIC ABI. Not only is xtensa a very relevant arch I've been hoping we
could support for some time now, but having FDPIC will help getting
the parts of musl's FDPIC support that are currently making
sh-specific assumptions worked out so that we can get other new FDPIC
ports added too (particularly, arm and riscv).

I'm not up-to-date on what toolchain status/stability is or how close
this is to being intended as upstreamable, but since I was pointed to
it and took a look, I wanted to record my initial review thoughts here
so they don't get lost and to hopefully move this closer to ready for
upstream.

Review follows inline:

On Tue, Feb 27, 2024 at 06:24:31PM -0500, Rich Felker wrote:
> Attached patches were pulled from
> https://github.com/jcmvbkbc/musl-xtensa/commits/xtensa-1.2.4-fdpic/
> 
> I'll reply to comment inline.

> >From 868b34272f414323f10528d5b9988886541cb1c0 Mon Sep 17 00:00:00 2001
> From: Max Filippov <jcmvbkbc@gmail.com>
> Date: Tue, 22 Mar 2016 02:35:58 +0300
> Subject: [PATCH 1/2] xtensa: add port
> 
> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> ---
>  arch/xtensa/arch.mak                  |   1 +
>  arch/xtensa/atomic_arch.h             |  27 ++
>  arch/xtensa/bits/alltypes.h.in        |  27 ++
>  arch/xtensa/bits/float.h              |  16 ++
>  arch/xtensa/bits/ioctl.h              | 219 ++++++++++++++
>  arch/xtensa/bits/limits.h             |   1 +
>  arch/xtensa/bits/mman.h               |  68 +++++
>  arch/xtensa/bits/posix.h              |   2 +
>  arch/xtensa/bits/reg.h                |   2 +
>  arch/xtensa/bits/setjmp.h             |   5 +
>  arch/xtensa/bits/signal.h             |  88 ++++++
>  arch/xtensa/bits/stat.h               |  24 ++
>  arch/xtensa/bits/stdint.h             |  20 ++
>  arch/xtensa/bits/syscall.h.in         | 396 ++++++++++++++++++++++++++
>  arch/xtensa/bits/termios.h            | 167 +++++++++++
>  arch/xtensa/bits/user.h               |   4 +
>  arch/xtensa/bits/xtensa-config.h      |   8 +
>  arch/xtensa/crt_arch.h                |  48 ++++
>  arch/xtensa/kstat.h                   |  18 ++
>  arch/xtensa/pthread_arch.h            |  11 +
>  arch/xtensa/reloc.h                   |  38 +++
>  arch/xtensa/syscall_arch.h            | 104 +++++++
>  configure                             |   1 +
>  crt/xtensa/crti.S                     |  29 ++
>  crt/xtensa/crtn.S                     |  23 ++
>  include/elf.h                         |  70 +++++
>  src/internal/xtensa/syscall.S         |  25 ++
>  src/ldso/xtensa/tlsdesc.S             |  36 +++
>  src/process/xtensa/vfork.S            |  21 ++
>  src/setjmp/xtensa/longjmp.S           |  22 ++
>  src/setjmp/xtensa/setjmp.S            |  25 ++
>  src/signal/xtensa/restore.s           |  10 +
>  src/signal/xtensa/sigsetjmp.S         |  24 ++
>  src/thread/xtensa/__set_thread_area.S |  16 ++
>  src/thread/xtensa/__unmapself.S       |  13 +
>  src/thread/xtensa/clone.S             |  46 +++
>  src/thread/xtensa/syscall_cp.S        |  38 +++
>  37 files changed, 1693 insertions(+)
>  create mode 100644 arch/xtensa/arch.mak
>  create mode 100644 arch/xtensa/atomic_arch.h
>  create mode 100644 arch/xtensa/bits/alltypes.h.in
>  create mode 100644 arch/xtensa/bits/float.h
>  create mode 100644 arch/xtensa/bits/ioctl.h
>  create mode 100644 arch/xtensa/bits/limits.h
>  create mode 100644 arch/xtensa/bits/mman.h
>  create mode 100644 arch/xtensa/bits/posix.h
>  create mode 100644 arch/xtensa/bits/reg.h
>  create mode 100644 arch/xtensa/bits/setjmp.h
>  create mode 100644 arch/xtensa/bits/signal.h
>  create mode 100644 arch/xtensa/bits/stat.h
>  create mode 100644 arch/xtensa/bits/stdint.h
>  create mode 100644 arch/xtensa/bits/syscall.h.in
>  create mode 100644 arch/xtensa/bits/termios.h
>  create mode 100644 arch/xtensa/bits/user.h
>  create mode 100644 arch/xtensa/bits/xtensa-config.h
>  create mode 100644 arch/xtensa/crt_arch.h
>  create mode 100644 arch/xtensa/kstat.h
>  create mode 100644 arch/xtensa/pthread_arch.h
>  create mode 100644 arch/xtensa/reloc.h
>  create mode 100644 arch/xtensa/syscall_arch.h
>  create mode 100644 crt/xtensa/crti.S
>  create mode 100644 crt/xtensa/crtn.S
>  create mode 100644 src/internal/xtensa/syscall.S
>  create mode 100644 src/ldso/xtensa/tlsdesc.S
>  create mode 100644 src/process/xtensa/vfork.S
>  create mode 100644 src/setjmp/xtensa/longjmp.S
>  create mode 100644 src/setjmp/xtensa/setjmp.S
>  create mode 100644 src/signal/xtensa/restore.s
>  create mode 100644 src/signal/xtensa/sigsetjmp.S
>  create mode 100644 src/thread/xtensa/__set_thread_area.S
>  create mode 100644 src/thread/xtensa/__unmapself.S
>  create mode 100644 src/thread/xtensa/clone.S
>  create mode 100644 src/thread/xtensa/syscall_cp.S
> 
> diff --git a/arch/xtensa/arch.mak b/arch/xtensa/arch.mak
> new file mode 100644
> index 00000000..aa4d05ce
> --- /dev/null
> +++ b/arch/xtensa/arch.mak
> @@ -0,0 +1 @@
> +COMPAT_SRC_DIRS = compat/time32
> diff --git a/arch/xtensa/atomic_arch.h b/arch/xtensa/atomic_arch.h
> new file mode 100644
> index 00000000..34bf0704
> --- /dev/null
> +++ b/arch/xtensa/atomic_arch.h
> @@ -0,0 +1,27 @@
> +#include "bits/xtensa-config.h"

This is not what bits headers are for; they are public-installed
headers that provide parts of the standard public headers that vary by
arch, and are never directly #include'able.

If you really need arch-internal headers like this just put them in
the top-level dir of the arch. But in the case of these macros, either
just use the compiler-provided, __-prefixed ones directly, or if
they're not actually variable, don't inspect them at all.

> +
> +#if XCHAL_HAVE_S32C1I
> +#define a_cas a_cas
> +static inline int a_cas(volatile int *p, int t, int s)
> +{
> +	__asm__ __volatile__ (
> +		"	wsr	%2, scompare1\n"
> +		"	s32c1i	%0, %1\n"
> +		: "+a"(s), "+m"(*p)
> +		: "a"(t)
> +		: "memory" );
> +        return s;
> +}
> +#endif

For example this is not a useful test, because there's nothing you can
do in the case where it's not defined; it's just not supportable
(unless there's some kernel fallback mechanism like ARM and SH have;
then it would make sense to hard-code the cas instruction as above if
the ISA level is known to have it, and otherwise do conditional
runtime selection (again, like ARM and SH).

> diff --git a/arch/xtensa/bits/alltypes.h.in b/arch/xtensa/bits/alltypes.h.in
> new file mode 100644
> index 00000000..21221ce5
> --- /dev/null
> +++ b/arch/xtensa/bits/alltypes.h.in
> @@ -0,0 +1,27 @@
> +#define _REDIR_TIME64 1
> +#define _Addr int
> +#define _Int64 long long
> +#define _Reg int
> +
> +#if __XTENSA_EB__
> +#define __BYTE_ORDER 4321
> +#elif __XTENSA_EL__
> +#define __BYTE_ORDER 1234
> +#else
> +#error Unknown endianness
> +#endif
> +
> +#define __LONG_MAX 0x7fffffffL
> +
> +#ifndef __cplusplus
> +#ifdef __WCHAR_TYPE__
> +TYPEDEF __WCHAR_TYPE__ wchar_t;
> +#else
> +TYPEDEF unsigned wchar_t;
> +#endif
> +#endif
> +
> +TYPEDEF float float_t;
> +TYPEDEF double double_t;
> +
> +TYPEDEF struct { long __l __attribute__((aligned(__BIGGEST_ALIGNMENT__))); } max_align_t;

It's best to avoid macros like __BIGGEST_ALIGNMENT__ here that could
change out from under us in the future, thereby quietly breaking the
ABI. Whatever the ABI is, that's what the alignment here should be,
and just using a type that naturally has the alignment if possible
rather than attributes.

> diff --git a/arch/xtensa/bits/mman.h b/arch/xtensa/bits/mman.h
> new file mode 100644
> index 00000000..d2f58eb1
> --- /dev/null
> +++ b/arch/xtensa/bits/mman.h
> @@ -0,0 +1,68 @@
> +#define MAP_FAILED ((void *) -1)
> +
> +#define	PROT_NONE      0
> +#define	PROT_READ      1
> +#define	PROT_WRITE     2
> +#define	PROT_EXEC      4
> +#define	PROT_GROWSDOWN 0x01000000
> +#define	PROT_GROWSUP   0x02000000
> +
> +#define	MAP_SHARED     0x01
> +#define	MAP_PRIVATE    0x02
> +#define	MAP_FIXED      0x10
> +
> +#define MAP_TYPE       0x0f
> +#undef MAP_FILE
> +#define MAP_FILE       0x00
> +#undef MAP_ANON
> +#define MAP_ANON       0x800
> +#define MAP_ANONYMOUS  MAP_ANON
> +#undef MAP_NORESERVE
> +#define MAP_NORESERVE  0x0400
> +#undef MAP_GROWSDOWN
> +#define MAP_GROWSDOWN  0x1000
> +#undef MAP_DENYWRITE
> +#define MAP_DENYWRITE  0x2000
> +#undef MAP_EXECUTABLE
> +#define MAP_EXECUTABLE 0x4000
> +#undef MAP_LOCKED
> +#define MAP_LOCKED     0x8000
> +#undef MAP_POPULATE
> +#define MAP_POPULATE   0x10000
> +#undef MAP_NONBLOCK
> +#define MAP_NONBLOCK   0x20000
> +#undef MAP_STACK
> +#define MAP_STACK      0x40000
> +#undef MAP_HUGETLB
> +#define MAP_HUGETLB    0x80000
> +
> +#define POSIX_MADV_NORMAL       0
> +#define POSIX_MADV_RANDOM       1
> +#define POSIX_MADV_SEQUENTIAL   2
> +#define POSIX_MADV_WILLNEED     3
> +#define POSIX_MADV_DONTNEED     4
> +
> +#define MS_ASYNC        1
> +#define MS_INVALIDATE   2
> +#define MS_SYNC         4
> +
> +#define MCL_CURRENT     1
> +#define MCL_FUTURE      2
> +
> +#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> +#define MADV_NORMAL      0
> +#define MADV_RANDOM      1
> +#define MADV_SEQUENTIAL  2
> +#define MADV_WILLNEED    3
> +#define MADV_DONTNEED    4
> +#define MADV_REMOVE      9
> +#define MADV_DONTFORK    10
> +#define MADV_DOFORK      11
> +#define MADV_MERGEABLE   12
> +#define MADV_UNMERGEABLE 13
> +#define MADV_HUGEPAGE    14
> +#define MADV_NOHUGEPAGE  15
> +#define MADV_DONTDUMP    16
> +#define MADV_DODUMP      17
> +#define MADV_HWPOISON    100
> +#endif

A lot of these don't vary from the standard values, and including them
(especially since most will be duplicate/re- definitions) obscures
what actually is arch-specific here.

> diff --git a/arch/xtensa/bits/setjmp.h b/arch/xtensa/bits/setjmp.h
> new file mode 100644
> index 00000000..e1a6dd97
> --- /dev/null
> +++ b/arch/xtensa/bits/setjmp.h
> @@ -0,0 +1,5 @@
> +#if defined(__XTENSA_CALL0_ABI__)
> +typedef unsigned long __jmp_buf[6];
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

See notes later on reloc.h regarding ABIs.

> diff --git a/arch/xtensa/bits/signal.h b/arch/xtensa/bits/signal.h
> new file mode 100644
> index 00000000..545ffd36
> --- /dev/null
> +++ b/arch/xtensa/bits/signal.h
> @@ -0,0 +1,88 @@
> +#if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
> + || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> +
> +#if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> +#define MINSIGSTKSZ 2048
> +#define SIGSTKSZ 8192
> +#endif
> +
> +typedef struct sigcontext
> +{
> +	unsigned long sc_pc;
> +	unsigned long sc_ps;
> +	unsigned long sc_lbeg;
> +	unsigned long sc_lend;
> +	unsigned long sc_lcount;
> +	unsigned long sc_sar;
> +	unsigned long sc_acclo;
> +	unsigned long sc_acchi;
> +	unsigned long sc_a[16];
> +	void *sc_xtregs;
> +} mcontext_t;

This needs a namespace-safe alternate definition when in
standards-conforming profile. See how it's done on most other archs.
(It can just be a dummy struct of the right size if you like.)

> diff --git a/arch/xtensa/bits/xtensa-config.h b/arch/xtensa/bits/xtensa-config.h
> new file mode 100644
> index 00000000..a2e95904
> --- /dev/null
> +++ b/arch/xtensa/bits/xtensa-config.h
> @@ -0,0 +1,8 @@
> +#undef XCHAL_NUM_AREGS
> +#define XCHAL_NUM_AREGS			__XCHAL_NUM_AREGS
> +
> +#undef XCHAL_HAVE_S32C1I
> +#define XCHAL_HAVE_S32C1I		__XCHAL_HAVE_S32C1I
> +
> +#undef XCHAL_HAVE_EXCLUSIVE
> +#define XCHAL_HAVE_EXCLUSIVE		__XCHAL_HAVE_EXCLUSIVE

See notes above on this header.

> diff --git a/arch/xtensa/reloc.h b/arch/xtensa/reloc.h
> new file mode 100644
> index 00000000..03482f18
> --- /dev/null
> +++ b/arch/xtensa/reloc.h
> @@ -0,0 +1,38 @@
> +#if __BYTE_ORDER == __BIG_ENDIAN
> +#define ENDIAN_SUFFIX "eb"
> +#else
> +#define ENDIAN_SUFFIX ""
> +#endif
> +
> +#if __FDPIC__
> +#define ABI_SUFFIX "-fdpic"
> +#else
> +#define ABI_SUFFIX ""
> +#endif
> +
> +#define LDSO_ARCH "xtensa" ENDIAN_SUFFIX ABI_SUFFIX

Any actual incompatible ABI (as opposed to ISA level) needs its own
LDSO name. So if there are alternate (call0/windowed?) ABIs you want
to support, different names need to be defined for them here and in
configure. Unless there's a good reason to support more than one ABI,
it probably just makes sense to pick the preferred one and have
configure error-out if the toolchain's ABI is incompatible.

> diff --git a/crt/xtensa/crti.S b/crt/xtensa/crti.S
> new file mode 100644
> index 00000000..82a837e7
> --- /dev/null
> +++ b/crt/xtensa/crti.S
> @@ -0,0 +1,29 @@
> +.section .init
> +.global  _init
> +.type    _init, @function
> +_init:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	addi	sp, sp, -16
> +	s32i	a0, sp, 0
> +#ifdef __FDPIC__
> +	s32i	a12, sp, 4
> +	mov	a12, a11
> +#endif
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

Same thing regarding ABIs.

> diff --git a/src/internal/xtensa/syscall.S b/src/internal/xtensa/syscall.S
> new file mode 100644
> index 00000000..f09a46e8
> --- /dev/null
> +++ b/src/internal/xtensa/syscall.S
> @@ -0,0 +1,25 @@
> +.global __syscall
> +.hidden __syscall
> +.type   __syscall,@function
> +.align 4
> +__syscall:
> +#ifdef __XTENSA_WINDOWED_ABI__
> +	entry	a1, 16
> +#endif
> +	mov	a8, a3
> +	mov	a3, a4
> +	mov	a4, a5
> +	mov	a5, a6
> +	mov	a6, a8
> +	mov	a8, a7
> +#if defined(__XTENSA_WINDOWED_ABI__)
> +	l32i	a9, a1, 16
> +	syscall
> +	retw
> +#elif defined(__XTENSA_CALL0_ABI__)
> +	l32i	a9, a1, 0
> +	syscall
> +	ret
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

And here too.

> diff --git a/src/ldso/xtensa/tlsdesc.S b/src/ldso/xtensa/tlsdesc.S
> new file mode 100644
> index 00000000..084d5f3e
> --- /dev/null
> +++ b/src/ldso/xtensa/tlsdesc.S
> @@ -0,0 +1,36 @@
> +.global __tlsdesc_static
> +.hidden __tlsdesc_static
> +.type __tlsdesc_static,@function
> +.align 4
> +__tlsdesc_static:
> +#ifdef __XTENSA_WINDOWED_ABI__
> +	entry	a1, 16
> +#endif
> +	rur	a3, threadptr
> +	add	a2, a2, a3
> +#if defined(__XTENSA_WINDOWED_ABI__)
> +	retw
> +#elif defined(__XTENSA_CALL0_ABI__)
> +	ret
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

Are you sure the entry step makes sense with tlsdesc? Is this honoring
the ABI for what the tlsdesc function is allowed to clobber?

> +.hidden __tls_get_new
> +
> +.global __tlsdesc_dynamic
> +.hidden __tlsdesc_dynamic
> +.type __tlsdesc_dynamic,@function
> +.align 4
> +__tlsdesc_dynamic:
> +#if defined(__XTENSA_WINDOWED_ABI__)
> +	entry	a1, 16
> +	mov	a6, a2
> +	call4	__tls_get_addr
> +	mov	a2, a6
> +	retw
> +#elif defined(__XTENSA_CALL0_ABI__)
> +	j	__tls_get_addr
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

This approach is no longer a valid implementation for
__tlsdesc_dynamic, because it calls C code which could clobber any
call-clobbered registers, not just the ones the tlsdesc function is
allowed to clobber.

Instead, it needs to actually do the dtv lookup in asm. musl does not
use any dynamic allocation here, so the lookup is guaranteed to just
work using the indices found with no conditional branches. See the
recently added riscv code for a clean example, in commit
407aea628af8c81d9e3f5a068568f2217db71bba.

> diff --git a/src/process/xtensa/vfork.S b/src/process/xtensa/vfork.S
> new file mode 100644
> index 00000000..25478b87
> --- /dev/null
> +++ b/src/process/xtensa/vfork.S
> @@ -0,0 +1,21 @@
> +.global vfork
> +.type vfork,@function
> +vfork:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	movi	a2, 116 # __NR_clone
> +	movi	a3, 0
> +	movi	a6, 0x4111 # CLONE_VM | CLONE_VFORK | SIGCHLD
> +	syscall
> +
> +#ifdef __FDPIC__
> +.hidden __syscall_ret
> +	movi	a9, __syscall_ret@GOT
> +	add	a9, a9, a11
> +	l32i	a9, a9, 0
> +	jx	a9
> +#else
> +#error Unsupported Xtensa ABI
> +#endif
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

Same re: ABIs. But maybe there's intent to support a non-FDPIC ABI
too?

> diff --git a/src/setjmp/xtensa/longjmp.S b/src/setjmp/xtensa/longjmp.S
> new file mode 100644
> index 00000000..602093c9
> --- /dev/null
> +++ b/src/setjmp/xtensa/longjmp.S
> @@ -0,0 +1,22 @@
> +.global _longjmp
> +.global longjmp
> +.type _longjmp,@function
> +.type longjmp,@function
> +.align 4
> +_longjmp:
> +longjmp:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	l32i	a0, a2, 0
> +	l32i	a1, a2, 4
> +	l32i	a12, a2, 8
> +	l32i	a13, a2, 12
> +	l32i	a14, a2, 16
> +	l32i	a15, a2, 20
> +
> +	movi	a2, 1
> +	movnez	a2, a3, a3
> +
> +	ret
> +#else
> +#error Unsupported Xtensa ABI
> +#endif
> diff --git a/src/setjmp/xtensa/setjmp.S b/src/setjmp/xtensa/setjmp.S
> new file mode 100644
> index 00000000..3a735a6a
> --- /dev/null
> +++ b/src/setjmp/xtensa/setjmp.S
> @@ -0,0 +1,25 @@
> +.global ___setjmp
> +.hidden ___setjmp
> +.global __setjmp
> +.global _setjmp
> +.global setjmp
> +.type __setjmp,@function
> +.type _setjmp,@function
> +.type setjmp,@function
> +.align 4
> +___setjmp:
> +__setjmp:
> +_setjmp:
> +setjmp:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	s32i	a0, a2, 0
> +	s32i	a1, a2, 4
> +	s32i	a12, a2, 8
> +	s32i	a13, a2, 12
> +	s32i	a14, a2, 16
> +	s32i	a15, a2, 20
> +	movi	a2, 0
> +	ret
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

Ditto.

> diff --git a/src/signal/xtensa/sigsetjmp.S b/src/signal/xtensa/sigsetjmp.S
> new file mode 100644
> index 00000000..e44bba88
> --- /dev/null
> +++ b/src/signal/xtensa/sigsetjmp.S
> @@ -0,0 +1,24 @@
> +.global sigsetjmp
> +.global __sigsetjmp
> +.type sigsetjmp,@function
> +.type __sigsetjmp,@function
> +.align 4
> +sigsetjmp:
> +__sigsetjmp:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	bnez	a3, 1f
> +.hidden ___setjmp
> +	j	___setjmp
> +1:
> +	mov	a3, a0
> +	mov	a4, a2
> +	call0	___setjmp
> +	mov	a0, a3
> +	mov	a3, a2
> +	mov	a2, a4
> +
> +.hidden __sigsetjmp_tail
> +	j	__sigsetjmp_tail
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

I don't think this code works. It does not seem to save the return
address and argument address before calling ___setjmp, so they will be
clobbered after it returns.

> diff --git a/src/thread/xtensa/__set_thread_area.S b/src/thread/xtensa/__set_thread_area.S
> new file mode 100644
> index 00000000..1f402125
> --- /dev/null
> +++ b/src/thread/xtensa/__set_thread_area.S
> @@ -0,0 +1,16 @@
> +.global __set_thread_area
> +.type   __set_thread_area,@function
> +.align 4
> +__set_thread_area:
> +#ifdef __XTENSA_WINDOWED_ABI__
> +	entry	a1, 16
> +#endif
> +	wur	a2, threadptr
> +	movi	a2, 0
> +#if defined(__XTENSA_WINDOWED_ABI__)
> +	retw
> +#elif defined(__XTENSA_CALL0_ABI__)
> +	ret
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

FWIW, few archs do this yet, but this function can be written in C
with inline asm to set the register, and that would avoid any ABI
conditionals if they're ever needed.

Eventually I'd like to move most functions like this, that are
currently external asm but have no good reason to be (unlike setjmp,
vfork, etc. which *do* have a good reason to be), to minimal inline
asm in C. Some archs' versions of this file would even just become
__syscall, no further asm.

> diff --git a/src/thread/xtensa/__unmapself.S b/src/thread/xtensa/__unmapself.S
> new file mode 100644
> index 00000000..15bf2bdf
> --- /dev/null
> +++ b/src/thread/xtensa/__unmapself.S
> @@ -0,0 +1,13 @@
> +.global __unmapself
> +.type   __unmapself,@function
> +.align 4
> +__unmapself:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	mov	a6, a2
> +	movi	a2, 81 # SYS_munmap
> +	syscall
> +	movi	a2, 118 # SYS_exit
> +	syscall
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

This is unsafe on fdpic/nommu if the kernel requires the task to
always have a valid stack. You might want to check how it's done on SH
and whether something similar is needed here. The idea is that it
temporarily switches to an alternate stack protected by the global
thread list lock (so only one thread can be using it at a time) in
order to make it safe to unmap the thread's own stack. The thread list
is then unlocked by the kernel when the thread dies (via the exit
futex).

> diff --git a/src/thread/xtensa/clone.S b/src/thread/xtensa/clone.S
> new file mode 100644
> index 00000000..8e8514d6
> --- /dev/null
> +++ b/src/thread/xtensa/clone.S
> @@ -0,0 +1,46 @@
> +# __clone(func, stack, flags, arg, ptid, tls, ctid)
> +#         a2,   a3,    a4,    a5,  a6,   a7,  [sp]
> +
> +# syscall(SYS_clone, flags, stack, ptid, tls, ctid)
> +#         a2,        a6,    a3,    a4,   a5,  a8
> +
> +.global __clone
> +.type   __clone,@function
> +.align 4
> +__clone:
> +#if defined(__XTENSA_CALL0_ABI__)
> +	# align stack and save func,arg
> +	srli	a3, a3, 4
> +	slli	a3, a3, 4
> +	addi	a3, a3, -16
> +	s32i	a2, a3, 0
> +	s32i	a5, a3, 4
> +
> +	# syscall
> +	mov	a2, a4
> +	mov	a4, a6
> +	mov	a6, a2
> +	mov	a5, a7
> +	l32i	a8, a1, 16
> +	movi	a2, 116 # SYS_clone
> +	syscall
> +
> +	beqz	a2, 1f
> +	# parent
> +	ret
> +
> +	# child
> +1:
> +	l32i	a0, a1, 0
> +	l32i	a2, a1, 4
> +#ifdef __FDPIC__
> +	l32i	a11, a0, 4
> +	l32i	a0, a0, 0
> +#endif
> +	callx0	a0
> +	mov	a6, a2
> +	movi	a2, 118 # SYS_exit
> +	syscall
> +#else
> +#error Unsupported Xtensa ABI
> +#endif

Same re: ABIs.

> diff --git a/src/thread/xtensa/syscall_cp.S b/src/thread/xtensa/syscall_cp.S
> new file mode 100644
> index 00000000..de7c3e66
> --- /dev/null
> +++ b/src/thread/xtensa/syscall_cp.S
> @@ -0,0 +1,38 @@
> +# __syscall_cp_asm(&self->cancel, nr, u, v, w, x, y, z)
> +#                  a2             a3  a4 a5 a6 a7 [sp] [sp+4]
> +
> +# syscall(nr, u, v, w, x, y, z)
> +#         a2  a6 a3 a4 a5 a8 a9
> +
> +.global __cp_begin
> +.hidden __cp_begin
> +.global __cp_end
> +.hidden __cp_end
> +.global __cp_cancel
> +.hidden __cp_cancel
> +.hidden __cancel
> +.global __syscall_cp_asm
> +.hidden __syscall_cp_asm
> +.type __syscall_cp_asm,@function
> +.align 4
> +#if defined(__XTENSA_CALL0_ABI__)
> +__syscall_cp_asm:
> +__cp_begin:
> +	l32i	a2, a2, 0
> +	bnez	a2, __cp_cancel
> +	mov	a2, a4
> +	mov	a4, a6
> +	mov	a6, a2
> +	mov	a2, a3
> +	mov	a3, a5
> +	mov	a5, a7
> +	l32i	a8, a1, 0
> +	l32i	a9, a1, 4
> +	syscall
> +__cp_end:
> +	ret
> +__cp_cancel:
> +	j	__cancel
> +#else
> +#error Unsupported Xtensa ABI
> +#endif
> -- 
> 2.21.0

And this one.


> >From f8c5953ecdb948282cc8e573b729c25db60a95a8 Mon Sep 17 00:00:00 2001
> From: Max Filippov <jcmvbkbc@gmail.com>
> Date: Wed, 21 Feb 2024 08:27:37 -0800
> Subject: [PATCH 2/2] WIP
> 
> Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> ---
>  ldso/dlstart.c         | 7 +++++++
>  ldso/dynlink.c         | 6 ++++--
>  src/internal/dynlink.h | 5 +++--
>  3 files changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/ldso/dlstart.c b/ldso/dlstart.c
> index 259f5e18..beca953f 100644
> --- a/ldso/dlstart.c
> +++ b/ldso/dlstart.c
> @@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
>  				- segs[rel_addr[1]].p_vaddr
>  				+ syms[R_SYM(rel[1])].st_value;
>  			rel_addr[1] = dyn[DT_PLTGOT];
> +		} else if (R_TYPE(rel[1]) == REL_RELATIVE) {
> +			size_t val = *rel_addr;
> +			for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> +			*rel_addr += segs[j].addr - segs[j].p_vaddr;

So xtensa has a "relative" reloc type that's just adjusted by the load
offset of the segment the relocation lives in, rather than needing to
use a symbolic relocation referencing a section symbol like other
fdpic archs do?

If so, this looks right and looks ok.

>  		} else {
>  			size_t val = syms[R_SYM(rel[1])].st_value;
>  			for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
>  			*rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
>  		}
>  	}
> +#ifdef __xtensa__
> +	((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
> +#endif

Is this actually needed for anything? Generally musl doesn't use the
reserved GOT slots itself, and on all the other archs I'm aware of,
they're essentially reserved to the dynamic linker implementation so
the dynamic linker is just free not to use them and not to set them
up.

>  #else
>  	/* If the dynamic linker is invoked as a command, its load
>  	 * address is not available in the aux vector. Instead, compute
> diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> index ceca3c98..25563af3 100644
> --- a/ldso/dynlink.c
> +++ b/ldso/dynlink.c
> @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
>  		if (!DL_FDPIC)
>  			do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
>  
> +#if 0
>  		if (head != &ldso && p->relro_start != p->relro_end) {
>  			long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
>  				p->relro_end-p->relro_start, PROT_READ);
> @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
>  				if (runtime) longjmp(*rtld_fail, 1);
>  			}
>  		}
> +#endif

Was this breaking something? Relro linking probably should have been
disabled on fdpic, and we ignore ENOSYS anyway for nommu.

>  		p->relocated = 1;
>  	}
> @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
>  		if (dyn[0] & (1<<DT_FINI_ARRAY)) {
>  			size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
>  			size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> -			while (n--) ((void (*)(void))*--fn)();
> +			while (n--) fpaddr(p, *--fn)();

If this is fixable on the tooling side it really should be fixed
there. init/fini arrays should have actual language-level function
addresses (descriptor addresses on fdpic), not instruction addresses.

If it's not fixable at this point, I guess we need the arch's reloc.h
to define a macro signaling this difference in behavior.

I have no idea how this would be expected to work on static linked
programs...

> diff --git a/src/internal/dynlink.h b/src/internal/dynlink.h
> index 06f41d09..6485e883 100644
> --- a/src/internal/dynlink.h
> +++ b/src/internal/dynlink.h
> @@ -78,10 +78,11 @@ struct fdpic_dummy_loadmap {
>  	(R_TYPE(x) == REL_RELATIVE) || \
>  	(R_TYPE(x) == REL_SYM_OR_REL && !R_SYM(x)) )
>  #else
> -#define IS_RELATIVE(x,s) ( ( \
> +#define IS_RELATIVE(x,s) ( \
> +	(R_TYPE(x) == REL_RELATIVE) || ( ( \
>  	(R_TYPE(x) == REL_FUNCDESC_VAL) || \
>  	(R_TYPE(x) == REL_SYMBOLIC) ) \
> -	&& (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) )
> +	&& (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) ) )
>  #endif
>  
>  #ifndef NEED_MIPS_GOT_RELOCS
> -- 
> 2.21.0
> 

This looks ok assuming the above about relative relocs.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28  0:13 ` Rich Felker
@ 2024-02-28 17:20   ` Max Filippov
  2024-02-28 18:30     ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-28 17:20 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

Hi Rich,

On Tue, Feb 27, 2024 at 4:12 PM Rich Felker <dalias@libc.org> wrote:
>
> First of all, I'm excited to see new work on this and especially the
> FDPIC ABI. Not only is xtensa a very relevant arch I've been hoping we
> could support for some time now, but having FDPIC will help getting
> the parts of musl's FDPIC support that are currently making
> sh-specific assumptions worked out so that we can get other new FDPIC
> ports added too (particularly, arm and riscv).
>
> I'm not up-to-date on what toolchain status/stability is or how close
> this is to being intended as upstreamable, but since I was pointed to
> it and took a look, I wanted to record my initial review thoughts here
> so they don't get lost and to hopefully move this closer to ready for
> upstream.

Thanks for your review.
I expect to get toolchain components into shape for upstreaming during
this year.

> Review follows inline:
>
> On Tue, Feb 27, 2024 at 06:24:31PM -0500, Rich Felker wrote:
> > Attached patches were pulled from
> > https://github.com/jcmvbkbc/musl-xtensa/commits/xtensa-1.2.4-fdpic/
> >
> > I'll reply to comment inline.
>
> > >From 868b34272f414323f10528d5b9988886541cb1c0 Mon Sep 17 00:00:00 2001
> > From: Max Filippov <jcmvbkbc@gmail.com>
> > Date: Tue, 22 Mar 2016 02:35:58 +0300
> > Subject: [PATCH 1/2] xtensa: add port
> >
> > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > ---
> >  arch/xtensa/arch.mak                  |   1 +
> >  arch/xtensa/atomic_arch.h             |  27 ++
> >  arch/xtensa/bits/alltypes.h.in        |  27 ++
> >  arch/xtensa/bits/float.h              |  16 ++
> >  arch/xtensa/bits/ioctl.h              | 219 ++++++++++++++
> >  arch/xtensa/bits/limits.h             |   1 +
> >  arch/xtensa/bits/mman.h               |  68 +++++
> >  arch/xtensa/bits/posix.h              |   2 +
> >  arch/xtensa/bits/reg.h                |   2 +
> >  arch/xtensa/bits/setjmp.h             |   5 +
> >  arch/xtensa/bits/signal.h             |  88 ++++++
> >  arch/xtensa/bits/stat.h               |  24 ++
> >  arch/xtensa/bits/stdint.h             |  20 ++
> >  arch/xtensa/bits/syscall.h.in         | 396 ++++++++++++++++++++++++++
> >  arch/xtensa/bits/termios.h            | 167 +++++++++++
> >  arch/xtensa/bits/user.h               |   4 +
> >  arch/xtensa/bits/xtensa-config.h      |   8 +
> >  arch/xtensa/crt_arch.h                |  48 ++++
> >  arch/xtensa/kstat.h                   |  18 ++
> >  arch/xtensa/pthread_arch.h            |  11 +
> >  arch/xtensa/reloc.h                   |  38 +++
> >  arch/xtensa/syscall_arch.h            | 104 +++++++
> >  configure                             |   1 +
> >  crt/xtensa/crti.S                     |  29 ++
> >  crt/xtensa/crtn.S                     |  23 ++
> >  include/elf.h                         |  70 +++++
> >  src/internal/xtensa/syscall.S         |  25 ++
> >  src/ldso/xtensa/tlsdesc.S             |  36 +++
> >  src/process/xtensa/vfork.S            |  21 ++
> >  src/setjmp/xtensa/longjmp.S           |  22 ++
> >  src/setjmp/xtensa/setjmp.S            |  25 ++
> >  src/signal/xtensa/restore.s           |  10 +
> >  src/signal/xtensa/sigsetjmp.S         |  24 ++
> >  src/thread/xtensa/__set_thread_area.S |  16 ++
> >  src/thread/xtensa/__unmapself.S       |  13 +
> >  src/thread/xtensa/clone.S             |  46 +++
> >  src/thread/xtensa/syscall_cp.S        |  38 +++
> >  37 files changed, 1693 insertions(+)
> >  create mode 100644 arch/xtensa/arch.mak
> >  create mode 100644 arch/xtensa/atomic_arch.h
> >  create mode 100644 arch/xtensa/bits/alltypes.h.in
> >  create mode 100644 arch/xtensa/bits/float.h
> >  create mode 100644 arch/xtensa/bits/ioctl.h
> >  create mode 100644 arch/xtensa/bits/limits.h
> >  create mode 100644 arch/xtensa/bits/mman.h
> >  create mode 100644 arch/xtensa/bits/posix.h
> >  create mode 100644 arch/xtensa/bits/reg.h
> >  create mode 100644 arch/xtensa/bits/setjmp.h
> >  create mode 100644 arch/xtensa/bits/signal.h
> >  create mode 100644 arch/xtensa/bits/stat.h
> >  create mode 100644 arch/xtensa/bits/stdint.h
> >  create mode 100644 arch/xtensa/bits/syscall.h.in
> >  create mode 100644 arch/xtensa/bits/termios.h
> >  create mode 100644 arch/xtensa/bits/user.h
> >  create mode 100644 arch/xtensa/bits/xtensa-config.h
> >  create mode 100644 arch/xtensa/crt_arch.h
> >  create mode 100644 arch/xtensa/kstat.h
> >  create mode 100644 arch/xtensa/pthread_arch.h
> >  create mode 100644 arch/xtensa/reloc.h
> >  create mode 100644 arch/xtensa/syscall_arch.h
> >  create mode 100644 crt/xtensa/crti.S
> >  create mode 100644 crt/xtensa/crtn.S
> >  create mode 100644 src/internal/xtensa/syscall.S
> >  create mode 100644 src/ldso/xtensa/tlsdesc.S
> >  create mode 100644 src/process/xtensa/vfork.S
> >  create mode 100644 src/setjmp/xtensa/longjmp.S
> >  create mode 100644 src/setjmp/xtensa/setjmp.S
> >  create mode 100644 src/signal/xtensa/restore.s
> >  create mode 100644 src/signal/xtensa/sigsetjmp.S
> >  create mode 100644 src/thread/xtensa/__set_thread_area.S
> >  create mode 100644 src/thread/xtensa/__unmapself.S
> >  create mode 100644 src/thread/xtensa/clone.S
> >  create mode 100644 src/thread/xtensa/syscall_cp.S
> >
> > diff --git a/arch/xtensa/arch.mak b/arch/xtensa/arch.mak
> > new file mode 100644
> > index 00000000..aa4d05ce
> > --- /dev/null
> > +++ b/arch/xtensa/arch.mak
> > @@ -0,0 +1 @@
> > +COMPAT_SRC_DIRS = compat/time32
> > diff --git a/arch/xtensa/atomic_arch.h b/arch/xtensa/atomic_arch.h
> > new file mode 100644
> > index 00000000..34bf0704
> > --- /dev/null
> > +++ b/arch/xtensa/atomic_arch.h
> > @@ -0,0 +1,27 @@
> > +#include "bits/xtensa-config.h"
>
> This is not what bits headers are for; they are public-installed
> headers that provide parts of the standard public headers that vary by
> arch, and are never directly #include'able.
>
> If you really need arch-internal headers like this just put them in
> the top-level dir of the arch. But in the case of these macros, either
> just use the compiler-provided, __-prefixed ones directly, or if
> they're not actually variable, don't inspect them at all.

I carry this header from the time when configuration-specific header
file was the only available configuration option. __XCHAL_* macros
only appeared in gcc-13, but since FDPIC support will only be
available on toolchains that have these macros I guess I'll drop the
header and use __XCHAL_* macros directly.

> > +
> > +#if XCHAL_HAVE_S32C1I
> > +#define a_cas a_cas
> > +static inline int a_cas(volatile int *p, int t, int s)
> > +{
> > +     __asm__ __volatile__ (
> > +             "       wsr     %2, scompare1\n"
> > +             "       s32c1i  %0, %1\n"
> > +             : "+a"(s), "+m"(*p)
> > +             : "a"(t)
> > +             : "memory" );
> > +        return s;
> > +}
> > +#endif
>
> For example this is not a useful test, because there's nothing you can
> do in the case where it's not defined;

This test is in the spirit of xtensa: it covers one possible
configuration option.
There's at least one other hardware option (exclusive access instructions)
available for this functionality and a fallback to syscalls. They're not listed
here because I tried to keep this initial implementation to a bare minimum.

> it's just not supportable
> (unless there's some kernel fallback mechanism like ARM and SH have;
> then it would make sense to hard-code the cas instruction as above if
> the ISA level is known to have it, and otherwise do conditional
> runtime selection (again, like ARM and SH).

Xtensa is quite limited on the runtime selection, because opcodes belonging
to the options not enabled for a particular core just would not assemble.

> > diff --git a/arch/xtensa/bits/alltypes.h.in b/arch/xtensa/bits/alltypes.h.in
> > new file mode 100644
> > index 00000000..21221ce5
> > --- /dev/null
> > +++ b/arch/xtensa/bits/alltypes.h.in
> > @@ -0,0 +1,27 @@
> > +#define _REDIR_TIME64 1
> > +#define _Addr int
> > +#define _Int64 long long
> > +#define _Reg int
> > +
> > +#if __XTENSA_EB__
> > +#define __BYTE_ORDER 4321
> > +#elif __XTENSA_EL__
> > +#define __BYTE_ORDER 1234
> > +#else
> > +#error Unknown endianness
> > +#endif
> > +
> > +#define __LONG_MAX 0x7fffffffL
> > +
> > +#ifndef __cplusplus
> > +#ifdef __WCHAR_TYPE__
> > +TYPEDEF __WCHAR_TYPE__ wchar_t;
> > +#else
> > +TYPEDEF unsigned wchar_t;
> > +#endif
> > +#endif
> > +
> > +TYPEDEF float float_t;
> > +TYPEDEF double double_t;
> > +
> > +TYPEDEF struct { long __l __attribute__((aligned(__BIGGEST_ALIGNMENT__))); } max_align_t;
>
> It's best to avoid macros like __BIGGEST_ALIGNMENT__ here that could
> change out from under us in the future, thereby quietly breaking the
> ABI. Whatever the ABI is, that's what the alignment here should be,
> and just using a type that naturally has the alignment if possible
> rather than attributes.

The thing is: we don't know the name of the type with the widest
alignment, as indeed in the future someone may come up with
a configuration with an arbitrarily named type with an alignment
as big as allowed physically. But then its their responsibility to
keep the __BIGGEST_ALIGNMENT__ consistent across the
updates of that configuration that intend to stay compatible with
each other.

> > diff --git a/arch/xtensa/bits/mman.h b/arch/xtensa/bits/mman.h
> > new file mode 100644
> > index 00000000..d2f58eb1
> > --- /dev/null
> > +++ b/arch/xtensa/bits/mman.h
> > @@ -0,0 +1,68 @@
> > +#define MAP_FAILED ((void *) -1)
> > +
> > +#define      PROT_NONE      0
> > +#define      PROT_READ      1
> > +#define      PROT_WRITE     2
> > +#define      PROT_EXEC      4
> > +#define      PROT_GROWSDOWN 0x01000000
> > +#define      PROT_GROWSUP   0x02000000
> > +
> > +#define      MAP_SHARED     0x01
> > +#define      MAP_PRIVATE    0x02
> > +#define      MAP_FIXED      0x10
> > +
> > +#define MAP_TYPE       0x0f
> > +#undef MAP_FILE
> > +#define MAP_FILE       0x00
> > +#undef MAP_ANON
> > +#define MAP_ANON       0x800
> > +#define MAP_ANONYMOUS  MAP_ANON
> > +#undef MAP_NORESERVE
> > +#define MAP_NORESERVE  0x0400
> > +#undef MAP_GROWSDOWN
> > +#define MAP_GROWSDOWN  0x1000
> > +#undef MAP_DENYWRITE
> > +#define MAP_DENYWRITE  0x2000
> > +#undef MAP_EXECUTABLE
> > +#define MAP_EXECUTABLE 0x4000
> > +#undef MAP_LOCKED
> > +#define MAP_LOCKED     0x8000
> > +#undef MAP_POPULATE
> > +#define MAP_POPULATE   0x10000
> > +#undef MAP_NONBLOCK
> > +#define MAP_NONBLOCK   0x20000
> > +#undef MAP_STACK
> > +#define MAP_STACK      0x40000
> > +#undef MAP_HUGETLB
> > +#define MAP_HUGETLB    0x80000
> > +
> > +#define POSIX_MADV_NORMAL       0
> > +#define POSIX_MADV_RANDOM       1
> > +#define POSIX_MADV_SEQUENTIAL   2
> > +#define POSIX_MADV_WILLNEED     3
> > +#define POSIX_MADV_DONTNEED     4
> > +
> > +#define MS_ASYNC        1
> > +#define MS_INVALIDATE   2
> > +#define MS_SYNC         4
> > +
> > +#define MCL_CURRENT     1
> > +#define MCL_FUTURE      2
> > +
> > +#if defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> > +#define MADV_NORMAL      0
> > +#define MADV_RANDOM      1
> > +#define MADV_SEQUENTIAL  2
> > +#define MADV_WILLNEED    3
> > +#define MADV_DONTNEED    4
> > +#define MADV_REMOVE      9
> > +#define MADV_DONTFORK    10
> > +#define MADV_DOFORK      11
> > +#define MADV_MERGEABLE   12
> > +#define MADV_UNMERGEABLE 13
> > +#define MADV_HUGEPAGE    14
> > +#define MADV_NOHUGEPAGE  15
> > +#define MADV_DONTDUMP    16
> > +#define MADV_DODUMP      17
> > +#define MADV_HWPOISON    100
> > +#endif
>
> A lot of these don't vary from the standard values, and including them
> (especially since most will be duplicate/re- definitions) obscures
> what actually is arch-specific here.

Ok, I'll drop the duplicates.

> > diff --git a/arch/xtensa/bits/setjmp.h b/arch/xtensa/bits/setjmp.h
> > new file mode 100644
> > index 00000000..e1a6dd97
> > --- /dev/null
> > +++ b/arch/xtensa/bits/setjmp.h
> > @@ -0,0 +1,5 @@
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +typedef unsigned long __jmp_buf[6];
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> See notes later on reloc.h regarding ABIs.
>
> > diff --git a/arch/xtensa/bits/signal.h b/arch/xtensa/bits/signal.h
> > new file mode 100644
> > index 00000000..545ffd36
> > --- /dev/null
> > +++ b/arch/xtensa/bits/signal.h
> > @@ -0,0 +1,88 @@
> > +#if defined(_POSIX_SOURCE) || defined(_POSIX_C_SOURCE) \
> > + || defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> > +
> > +#if defined(_XOPEN_SOURCE) || defined(_GNU_SOURCE) || defined(_BSD_SOURCE)
> > +#define MINSIGSTKSZ 2048
> > +#define SIGSTKSZ 8192
> > +#endif
> > +
> > +typedef struct sigcontext
> > +{
> > +     unsigned long sc_pc;
> > +     unsigned long sc_ps;
> > +     unsigned long sc_lbeg;
> > +     unsigned long sc_lend;
> > +     unsigned long sc_lcount;
> > +     unsigned long sc_sar;
> > +     unsigned long sc_acclo;
> > +     unsigned long sc_acchi;
> > +     unsigned long sc_a[16];
> > +     void *sc_xtregs;
> > +} mcontext_t;
>
> This needs a namespace-safe alternate definition when in
> standards-conforming profile. See how it's done on most other archs.
> (It can just be a dummy struct of the right size if you like.)

Ok.

> > diff --git a/arch/xtensa/bits/xtensa-config.h b/arch/xtensa/bits/xtensa-config.h
> > new file mode 100644
> > index 00000000..a2e95904
> > --- /dev/null
> > +++ b/arch/xtensa/bits/xtensa-config.h
> > @@ -0,0 +1,8 @@
> > +#undef XCHAL_NUM_AREGS
> > +#define XCHAL_NUM_AREGS                      __XCHAL_NUM_AREGS
> > +
> > +#undef XCHAL_HAVE_S32C1I
> > +#define XCHAL_HAVE_S32C1I            __XCHAL_HAVE_S32C1I
> > +
> > +#undef XCHAL_HAVE_EXCLUSIVE
> > +#define XCHAL_HAVE_EXCLUSIVE         __XCHAL_HAVE_EXCLUSIVE
>
> See notes above on this header.
>
> > diff --git a/arch/xtensa/reloc.h b/arch/xtensa/reloc.h
> > new file mode 100644
> > index 00000000..03482f18
> > --- /dev/null
> > +++ b/arch/xtensa/reloc.h
> > @@ -0,0 +1,38 @@
> > +#if __BYTE_ORDER == __BIG_ENDIAN
> > +#define ENDIAN_SUFFIX "eb"
> > +#else
> > +#define ENDIAN_SUFFIX ""
> > +#endif
> > +
> > +#if __FDPIC__
> > +#define ABI_SUFFIX "-fdpic"
> > +#else
> > +#define ABI_SUFFIX ""
> > +#endif
> > +
> > +#define LDSO_ARCH "xtensa" ENDIAN_SUFFIX ABI_SUFFIX
>
> Any actual incompatible ABI (as opposed to ISA level) needs its own
> LDSO name.

There's no real ISA levels in the xtensa. Every CPU core configuration
is its own unique ISA.

> So if there are alternate (call0/windowed?) ABIs you want
> to support, different names need to be defined for them here and in
> configure.

I understand that it means that I should drop the endianness suffix,
as the endianness of each core is fixed, but add windowed/call0
discriminator.

> Unless there's a good reason to support more than one ABI,
> it probably just makes sense to pick the preferred one and have
> configure error-out if the toolchain's ABI is incompatible.

I still have hope to support windowed FDPIC and then non-FDPIC
variants.

> > diff --git a/crt/xtensa/crti.S b/crt/xtensa/crti.S
> > new file mode 100644
> > index 00000000..82a837e7
> > --- /dev/null
> > +++ b/crt/xtensa/crti.S
> > @@ -0,0 +1,29 @@
> > +.section .init
> > +.global  _init
> > +.type    _init, @function
> > +_init:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     addi    sp, sp, -16
> > +     s32i    a0, sp, 0
> > +#ifdef __FDPIC__
> > +     s32i    a12, sp, 4
> > +     mov     a12, a11
> > +#endif
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> Same thing regarding ABIs.
>
> > diff --git a/src/internal/xtensa/syscall.S b/src/internal/xtensa/syscall.S
> > new file mode 100644
> > index 00000000..f09a46e8
> > --- /dev/null
> > +++ b/src/internal/xtensa/syscall.S
> > @@ -0,0 +1,25 @@
> > +.global __syscall
> > +.hidden __syscall
> > +.type   __syscall,@function
> > +.align 4
> > +__syscall:
> > +#ifdef __XTENSA_WINDOWED_ABI__
> > +     entry   a1, 16
> > +#endif
> > +     mov     a8, a3
> > +     mov     a3, a4
> > +     mov     a4, a5
> > +     mov     a5, a6
> > +     mov     a6, a8
> > +     mov     a8, a7
> > +#if defined(__XTENSA_WINDOWED_ABI__)
> > +     l32i    a9, a1, 16
> > +     syscall
> > +     retw
> > +#elif defined(__XTENSA_CALL0_ABI__)
> > +     l32i    a9, a1, 0
> > +     syscall
> > +     ret
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> And here too.
>
> > diff --git a/src/ldso/xtensa/tlsdesc.S b/src/ldso/xtensa/tlsdesc.S
> > new file mode 100644
> > index 00000000..084d5f3e
> > --- /dev/null
> > +++ b/src/ldso/xtensa/tlsdesc.S
> > @@ -0,0 +1,36 @@
> > +.global __tlsdesc_static
> > +.hidden __tlsdesc_static
> > +.type __tlsdesc_static,@function
> > +.align 4
> > +__tlsdesc_static:
> > +#ifdef __XTENSA_WINDOWED_ABI__
> > +     entry   a1, 16
> > +#endif
> > +     rur     a3, threadptr
> > +     add     a2, a2, a3
> > +#if defined(__XTENSA_WINDOWED_ABI__)
> > +     retw
> > +#elif defined(__XTENSA_CALL0_ABI__)
> > +     ret
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> Are you sure the entry step makes sense with tlsdesc? Is this honoring
> the ABI for what the tlsdesc function is allowed to clobber?

But the tlsdesc function is just a normal function, it can do whatever
a regular function can do AFAIU. In addition there are no relaxations
that can replace non-call code with a variant with a call.

'entry' is the right way to begin a regular function in windowed xtensa ABI.

> > +.hidden __tls_get_new
> > +
> > +.global __tlsdesc_dynamic
> > +.hidden __tlsdesc_dynamic
> > +.type __tlsdesc_dynamic,@function
> > +.align 4
> > +__tlsdesc_dynamic:
> > +#if defined(__XTENSA_WINDOWED_ABI__)
> > +     entry   a1, 16
> > +     mov     a6, a2
> > +     call4   __tls_get_addr
> > +     mov     a2, a6
> > +     retw
> > +#elif defined(__XTENSA_CALL0_ABI__)
> > +     j       __tls_get_addr
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> This approach is no longer a valid implementation for
> __tlsdesc_dynamic, because it calls C code which could clobber any
> call-clobbered registers, not just the ones the tlsdesc function is
> allowed to clobber.
>
> Instead, it needs to actually do the dtv lookup in asm. musl does not
> use any dynamic allocation here, so the lookup is guaranteed to just
> work using the indices found with no conditional branches. See the
> recently added riscv code for a clean example, in commit
> 407aea628af8c81d9e3f5a068568f2217db71bba.

Yeah, this whole xtensa TLS code should be in the WIP pile.
I'll rewrite it in assembly.

> > diff --git a/src/process/xtensa/vfork.S b/src/process/xtensa/vfork.S
> > new file mode 100644
> > index 00000000..25478b87
> > --- /dev/null
> > +++ b/src/process/xtensa/vfork.S
> > @@ -0,0 +1,21 @@
> > +.global vfork
> > +.type vfork,@function
> > +vfork:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     movi    a2, 116 # __NR_clone
> > +     movi    a3, 0
> > +     movi    a6, 0x4111 # CLONE_VM | CLONE_VFORK | SIGCHLD
> > +     syscall
> > +
> > +#ifdef __FDPIC__
> > +.hidden __syscall_ret
> > +     movi    a9, __syscall_ret@GOT
> > +     add     a9, a9, a11
> > +     l32i    a9, a9, 0
> > +     jx      a9
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> Same re: ABIs. But maybe there's intent to support a non-FDPIC ABI
> too?

Right, since there's no intent to support non-FDPIC ABI at this time
the FDPIC condition can be dropped here.

> > diff --git a/src/setjmp/xtensa/longjmp.S b/src/setjmp/xtensa/longjmp.S
> > new file mode 100644
> > index 00000000..602093c9
> > --- /dev/null
> > +++ b/src/setjmp/xtensa/longjmp.S
> > @@ -0,0 +1,22 @@
> > +.global _longjmp
> > +.global longjmp
> > +.type _longjmp,@function
> > +.type longjmp,@function
> > +.align 4
> > +_longjmp:
> > +longjmp:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     l32i    a0, a2, 0
> > +     l32i    a1, a2, 4
> > +     l32i    a12, a2, 8
> > +     l32i    a13, a2, 12
> > +     l32i    a14, a2, 16
> > +     l32i    a15, a2, 20
> > +
> > +     movi    a2, 1
> > +     movnez  a2, a3, a3
> > +
> > +     ret
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
> > diff --git a/src/setjmp/xtensa/setjmp.S b/src/setjmp/xtensa/setjmp.S
> > new file mode 100644
> > index 00000000..3a735a6a
> > --- /dev/null
> > +++ b/src/setjmp/xtensa/setjmp.S
> > @@ -0,0 +1,25 @@
> > +.global ___setjmp
> > +.hidden ___setjmp
> > +.global __setjmp
> > +.global _setjmp
> > +.global setjmp
> > +.type __setjmp,@function
> > +.type _setjmp,@function
> > +.type setjmp,@function
> > +.align 4
> > +___setjmp:
> > +__setjmp:
> > +_setjmp:
> > +setjmp:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     s32i    a0, a2, 0
> > +     s32i    a1, a2, 4
> > +     s32i    a12, a2, 8
> > +     s32i    a13, a2, 12
> > +     s32i    a14, a2, 16
> > +     s32i    a15, a2, 20
> > +     movi    a2, 0
> > +     ret
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> Ditto.
>
> > diff --git a/src/signal/xtensa/sigsetjmp.S b/src/signal/xtensa/sigsetjmp.S
> > new file mode 100644
> > index 00000000..e44bba88
> > --- /dev/null
> > +++ b/src/signal/xtensa/sigsetjmp.S
> > @@ -0,0 +1,24 @@
> > +.global sigsetjmp
> > +.global __sigsetjmp
> > +.type sigsetjmp,@function
> > +.type __sigsetjmp,@function
> > +.align 4
> > +sigsetjmp:
> > +__sigsetjmp:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     bnez    a3, 1f
> > +.hidden ___setjmp
> > +     j       ___setjmp
> > +1:
> > +     mov     a3, a0
> > +     mov     a4, a2
> > +     call0   ___setjmp
> > +     mov     a0, a3
> > +     mov     a3, a2
> > +     mov     a2, a4
> > +
> > +.hidden __sigsetjmp_tail
> > +     j       __sigsetjmp_tail
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> I don't think this code works. It does not seem to save the return
> address and argument address before calling ___setjmp, so they will be
> clobbered after it returns.

It does:

+     mov     a3, a0
+     mov     a4, a2

do exactly this and ___setjmp is hidden, so it's always going to be
the implementation above that doesn't clobber a3 and a4.

> > diff --git a/src/thread/xtensa/__set_thread_area.S b/src/thread/xtensa/__set_thread_area.S
> > new file mode 100644
> > index 00000000..1f402125
> > --- /dev/null
> > +++ b/src/thread/xtensa/__set_thread_area.S
> > @@ -0,0 +1,16 @@
> > +.global __set_thread_area
> > +.type   __set_thread_area,@function
> > +.align 4
> > +__set_thread_area:
> > +#ifdef __XTENSA_WINDOWED_ABI__
> > +     entry   a1, 16
> > +#endif
> > +     wur     a2, threadptr
> > +     movi    a2, 0
> > +#if defined(__XTENSA_WINDOWED_ABI__)
> > +     retw
> > +#elif defined(__XTENSA_CALL0_ABI__)
> > +     ret
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> FWIW, few archs do this yet, but this function can be written in C
> with inline asm to set the register, and that would avoid any ABI
> conditionals if they're ever needed.

Ok.

> Eventually I'd like to move most functions like this, that are
> currently external asm but have no good reason to be (unlike setjmp,
> vfork, etc. which *do* have a good reason to be), to minimal inline
> asm in C. Some archs' versions of this file would even just become
> __syscall, no further asm.
>
> > diff --git a/src/thread/xtensa/__unmapself.S b/src/thread/xtensa/__unmapself.S
> > new file mode 100644
> > index 00000000..15bf2bdf
> > --- /dev/null
> > +++ b/src/thread/xtensa/__unmapself.S
> > @@ -0,0 +1,13 @@
> > +.global __unmapself
> > +.type   __unmapself,@function
> > +.align 4
> > +__unmapself:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     mov     a6, a2
> > +     movi    a2, 81 # SYS_munmap
> > +     syscall
> > +     movi    a2, 118 # SYS_exit
> > +     syscall
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> This is unsafe on fdpic/nommu if the kernel requires the task to
> always have a valid stack.

How do I know if the kernel requires that?

> You might want to check how it's done on SH
> and whether something similar is needed here. The idea is that it
> temporarily switches to an alternate stack protected by the global
> thread list lock (so only one thread can be using it at a time) in
> order to make it safe to unmap the thread's own stack. The thread list
> is then unlocked by the kernel when the thread dies (via the exit
> futex).
>
> > diff --git a/src/thread/xtensa/clone.S b/src/thread/xtensa/clone.S
> > new file mode 100644
> > index 00000000..8e8514d6
> > --- /dev/null
> > +++ b/src/thread/xtensa/clone.S
> > @@ -0,0 +1,46 @@
> > +# __clone(func, stack, flags, arg, ptid, tls, ctid)
> > +#         a2,   a3,    a4,    a5,  a6,   a7,  [sp]
> > +
> > +# syscall(SYS_clone, flags, stack, ptid, tls, ctid)
> > +#         a2,        a6,    a3,    a4,   a5,  a8
> > +
> > +.global __clone
> > +.type   __clone,@function
> > +.align 4
> > +__clone:
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +     # align stack and save func,arg
> > +     srli    a3, a3, 4
> > +     slli    a3, a3, 4
> > +     addi    a3, a3, -16
> > +     s32i    a2, a3, 0
> > +     s32i    a5, a3, 4
> > +
> > +     # syscall
> > +     mov     a2, a4
> > +     mov     a4, a6
> > +     mov     a6, a2
> > +     mov     a5, a7
> > +     l32i    a8, a1, 16
> > +     movi    a2, 116 # SYS_clone
> > +     syscall
> > +
> > +     beqz    a2, 1f
> > +     # parent
> > +     ret
> > +
> > +     # child
> > +1:
> > +     l32i    a0, a1, 0
> > +     l32i    a2, a1, 4
> > +#ifdef __FDPIC__
> > +     l32i    a11, a0, 4
> > +     l32i    a0, a0, 0
> > +#endif
> > +     callx0  a0
> > +     mov     a6, a2
> > +     movi    a2, 118 # SYS_exit
> > +     syscall
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
>
> Same re: ABIs.
>
> > diff --git a/src/thread/xtensa/syscall_cp.S b/src/thread/xtensa/syscall_cp.S
> > new file mode 100644
> > index 00000000..de7c3e66
> > --- /dev/null
> > +++ b/src/thread/xtensa/syscall_cp.S
> > @@ -0,0 +1,38 @@
> > +# __syscall_cp_asm(&self->cancel, nr, u, v, w, x, y, z)
> > +#                  a2             a3  a4 a5 a6 a7 [sp] [sp+4]
> > +
> > +# syscall(nr, u, v, w, x, y, z)
> > +#         a2  a6 a3 a4 a5 a8 a9
> > +
> > +.global __cp_begin
> > +.hidden __cp_begin
> > +.global __cp_end
> > +.hidden __cp_end
> > +.global __cp_cancel
> > +.hidden __cp_cancel
> > +.hidden __cancel
> > +.global __syscall_cp_asm
> > +.hidden __syscall_cp_asm
> > +.type __syscall_cp_asm,@function
> > +.align 4
> > +#if defined(__XTENSA_CALL0_ABI__)
> > +__syscall_cp_asm:
> > +__cp_begin:
> > +     l32i    a2, a2, 0
> > +     bnez    a2, __cp_cancel
> > +     mov     a2, a4
> > +     mov     a4, a6
> > +     mov     a6, a2
> > +     mov     a2, a3
> > +     mov     a3, a5
> > +     mov     a5, a7
> > +     l32i    a8, a1, 0
> > +     l32i    a9, a1, 4
> > +     syscall
> > +__cp_end:
> > +     ret
> > +__cp_cancel:
> > +     j       __cancel
> > +#else
> > +#error Unsupported Xtensa ABI
> > +#endif
> > --
> > 2.21.0
>
> And this one.
>
>
> > >From f8c5953ecdb948282cc8e573b729c25db60a95a8 Mon Sep 17 00:00:00 2001
> > From: Max Filippov <jcmvbkbc@gmail.com>
> > Date: Wed, 21 Feb 2024 08:27:37 -0800
> > Subject: [PATCH 2/2] WIP
> >
> > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > ---
> >  ldso/dlstart.c         | 7 +++++++
> >  ldso/dynlink.c         | 6 ++++--
> >  src/internal/dynlink.h | 5 +++--
> >  3 files changed, 14 insertions(+), 4 deletions(-)
> >
> > diff --git a/ldso/dlstart.c b/ldso/dlstart.c
> > index 259f5e18..beca953f 100644
> > --- a/ldso/dlstart.c
> > +++ b/ldso/dlstart.c
> > @@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
> >                               - segs[rel_addr[1]].p_vaddr
> >                               + syms[R_SYM(rel[1])].st_value;
> >                       rel_addr[1] = dyn[DT_PLTGOT];
> > +             } else if (R_TYPE(rel[1]) == REL_RELATIVE) {
> > +                     size_t val = *rel_addr;
> > +                     for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > +                     *rel_addr += segs[j].addr - segs[j].p_vaddr;
>
> So xtensa has a "relative" reloc type that's just adjusted by the load
> offset of the segment the relocation lives in, rather than needing to
> use a symbolic relocation referencing a section symbol like other
> fdpic archs do?

I was looking at the ARM BFD code while doing that and
my impression was that they do the same.
Regardless, I wonder why either relocation form might be preferrable?

> If so, this looks right and looks ok.
>
> >               } else {
> >                       size_t val = syms[R_SYM(rel[1])].st_value;
> >                       for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> >                       *rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
> >               }
> >       }
> > +#ifdef __xtensa__
> > +     ((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
> > +#endif
>
> Is this actually needed for anything? Generally musl doesn't use the
> reserved GOT slots itself, and on all the other archs I'm aware of,
> they're essentially reserved to the dynamic linker implementation so
> the dynamic linker is just free not to use them and not to set them
> up.

xtensa doesn't have relative register jumps and calls, so local jumps
and calls to a far off locations need to use absolute target addresses.
One possible solution is to have the address in the GOT entry, the
other is to calculate the target address using the text segment load
offset at runtime. Both have the same instruction count, see
  http://wiki.osll.ru/doku.php/etc:users:jcmvbkbc:binutils-xtensa#local_call
for the details, but the latter doesn't waste GOT space and that saves
a noticeable amount of RAM.

> >  #else
> >       /* If the dynamic linker is invoked as a command, its load
> >        * address is not available in the aux vector. Instead, compute
> > diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> > index ceca3c98..25563af3 100644
> > --- a/ldso/dynlink.c
> > +++ b/ldso/dynlink.c
> > @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
> >               if (!DL_FDPIC)
> >                       do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
> >
> > +#if 0
> >               if (head != &ldso && p->relro_start != p->relro_end) {
> >                       long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
> >                               p->relro_end-p->relro_start, PROT_READ);
> > @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
> >                               if (runtime) longjmp(*rtld_fail, 1);
> >                       }
> >               }
> > +#endif
>
> Was this breaking something? Relro linking probably should have been
> disabled on fdpic, and we ignore ENOSYS anyway for nommu.

Yeah, I do some of the testing in linux-user QEMU, it has MMU
and mprotect calls actually succeed. Failures were coming from the
relocation code, but my impression was that relro should be applied
after the relocation completion and it should just work, hence the
WIP pile.

> >               p->relocated = 1;
> >       }
> > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > -                     while (n--) ((void (*)(void))*--fn)();
> > +                     while (n--) fpaddr(p, *--fn)();
>
> If this is fixable on the tooling side it really should be fixed
> there. init/fini arrays should have actual language-level function
> addresses (descriptor addresses on fdpic), not instruction addresses.

I read libgcc code at
  https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
and the way it's written suggests that this was done on purpose.
I put it into the WIP pile to figure out later what the purpose was.
I thought that SH might not have this issue because it just didn't
use the .array_init/.array_fini.

> If it's not fixable at this point, I guess we need the arch's reloc.h
> to define a macro signaling this difference in behavior.
>
> I have no idea how this would be expected to work on static linked
> programs...

That's true it's not working in this code version.

> > diff --git a/src/internal/dynlink.h b/src/internal/dynlink.h
> > index 06f41d09..6485e883 100644
> > --- a/src/internal/dynlink.h
> > +++ b/src/internal/dynlink.h
> > @@ -78,10 +78,11 @@ struct fdpic_dummy_loadmap {
> >       (R_TYPE(x) == REL_RELATIVE) || \
> >       (R_TYPE(x) == REL_SYM_OR_REL && !R_SYM(x)) )
> >  #else
> > -#define IS_RELATIVE(x,s) ( ( \
> > +#define IS_RELATIVE(x,s) ( \
> > +     (R_TYPE(x) == REL_RELATIVE) || ( ( \
> >       (R_TYPE(x) == REL_FUNCDESC_VAL) || \
> >       (R_TYPE(x) == REL_SYMBOLIC) ) \
> > -     && (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) )
> > +     && (((s)[R_SYM(x)].st_info & 0xf) == STT_SECTION) ) )
> >  #endif
> >
> >  #ifndef NEED_MIPS_GOT_RELOCS
> > --
> > 2.21.0
> >
>
> This looks ok assuming the above about relative relocs.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 17:20   ` Max Filippov
@ 2024-02-28 18:30     ` Rich Felker
  2024-02-28 18:37       ` Rich Felker
                         ` (4 more replies)
  0 siblings, 5 replies; 17+ messages in thread
From: Rich Felker @ 2024-02-28 18:30 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> Hi Rich,
> 
> On Tue, Feb 27, 2024 at 4:12 PM Rich Felker <dalias@libc.org> wrote:
> >
> > First of all, I'm excited to see new work on this and especially the
> > FDPIC ABI. Not only is xtensa a very relevant arch I've been hoping we
> > could support for some time now, but having FDPIC will help getting
> > the parts of musl's FDPIC support that are currently making
> > sh-specific assumptions worked out so that we can get other new FDPIC
> > ports added too (particularly, arm and riscv).
> >
> > I'm not up-to-date on what toolchain status/stability is or how close
> > this is to being intended as upstreamable, but since I was pointed to
> > it and took a look, I wanted to record my initial review thoughts here
> > so they don't get lost and to hopefully move this closer to ready for
> > upstream.
> 
> Thanks for your review.
> I expect to get toolchain components into shape for upstreaming during
> this year.
> 
> > Review follows inline:
> >
> > On Tue, Feb 27, 2024 at 06:24:31PM -0500, Rich Felker wrote:
> > > Attached patches were pulled from
> > > https://github.com/jcmvbkbc/musl-xtensa/commits/xtensa-1.2.4-fdpic/
> > >
> > > I'll reply to comment inline.
> >
> > > >From 868b34272f414323f10528d5b9988886541cb1c0 Mon Sep 17 00:00:00 2001
> > > From: Max Filippov <jcmvbkbc@gmail.com>
> > > Date: Tue, 22 Mar 2016 02:35:58 +0300
> > > Subject: [PATCH 1/2] xtensa: add port
> > >
> > > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > > ---
> > >  arch/xtensa/arch.mak                  |   1 +
> > >  arch/xtensa/atomic_arch.h             |  27 ++
> > >  arch/xtensa/bits/alltypes.h.in        |  27 ++
> > >  arch/xtensa/bits/float.h              |  16 ++
> > >  arch/xtensa/bits/ioctl.h              | 219 ++++++++++++++
> > >  arch/xtensa/bits/limits.h             |   1 +
> > >  arch/xtensa/bits/mman.h               |  68 +++++
> > >  arch/xtensa/bits/posix.h              |   2 +
> > >  arch/xtensa/bits/reg.h                |   2 +
> > >  arch/xtensa/bits/setjmp.h             |   5 +
> > >  arch/xtensa/bits/signal.h             |  88 ++++++
> > >  arch/xtensa/bits/stat.h               |  24 ++
> > >  arch/xtensa/bits/stdint.h             |  20 ++
> > >  arch/xtensa/bits/syscall.h.in         | 396 ++++++++++++++++++++++++++
> > >  arch/xtensa/bits/termios.h            | 167 +++++++++++
> > >  arch/xtensa/bits/user.h               |   4 +
> > >  arch/xtensa/bits/xtensa-config.h      |   8 +
> > >  arch/xtensa/crt_arch.h                |  48 ++++
> > >  arch/xtensa/kstat.h                   |  18 ++
> > >  arch/xtensa/pthread_arch.h            |  11 +
> > >  arch/xtensa/reloc.h                   |  38 +++
> > >  arch/xtensa/syscall_arch.h            | 104 +++++++
> > >  configure                             |   1 +
> > >  crt/xtensa/crti.S                     |  29 ++
> > >  crt/xtensa/crtn.S                     |  23 ++
> > >  include/elf.h                         |  70 +++++
> > >  src/internal/xtensa/syscall.S         |  25 ++
> > >  src/ldso/xtensa/tlsdesc.S             |  36 +++
> > >  src/process/xtensa/vfork.S            |  21 ++
> > >  src/setjmp/xtensa/longjmp.S           |  22 ++
> > >  src/setjmp/xtensa/setjmp.S            |  25 ++
> > >  src/signal/xtensa/restore.s           |  10 +
> > >  src/signal/xtensa/sigsetjmp.S         |  24 ++
> > >  src/thread/xtensa/__set_thread_area.S |  16 ++
> > >  src/thread/xtensa/__unmapself.S       |  13 +
> > >  src/thread/xtensa/clone.S             |  46 +++
> > >  src/thread/xtensa/syscall_cp.S        |  38 +++
> > >  37 files changed, 1693 insertions(+)
> > >  create mode 100644 arch/xtensa/arch.mak
> > >  create mode 100644 arch/xtensa/atomic_arch.h
> > >  create mode 100644 arch/xtensa/bits/alltypes.h.in
> > >  create mode 100644 arch/xtensa/bits/float.h
> > >  create mode 100644 arch/xtensa/bits/ioctl.h
> > >  create mode 100644 arch/xtensa/bits/limits.h
> > >  create mode 100644 arch/xtensa/bits/mman.h
> > >  create mode 100644 arch/xtensa/bits/posix.h
> > >  create mode 100644 arch/xtensa/bits/reg.h
> > >  create mode 100644 arch/xtensa/bits/setjmp.h
> > >  create mode 100644 arch/xtensa/bits/signal.h
> > >  create mode 100644 arch/xtensa/bits/stat.h
> > >  create mode 100644 arch/xtensa/bits/stdint.h
> > >  create mode 100644 arch/xtensa/bits/syscall.h.in
> > >  create mode 100644 arch/xtensa/bits/termios.h
> > >  create mode 100644 arch/xtensa/bits/user.h
> > >  create mode 100644 arch/xtensa/bits/xtensa-config.h
> > >  create mode 100644 arch/xtensa/crt_arch.h
> > >  create mode 100644 arch/xtensa/kstat.h
> > >  create mode 100644 arch/xtensa/pthread_arch.h
> > >  create mode 100644 arch/xtensa/reloc.h
> > >  create mode 100644 arch/xtensa/syscall_arch.h
> > >  create mode 100644 crt/xtensa/crti.S
> > >  create mode 100644 crt/xtensa/crtn.S
> > >  create mode 100644 src/internal/xtensa/syscall.S
> > >  create mode 100644 src/ldso/xtensa/tlsdesc.S
> > >  create mode 100644 src/process/xtensa/vfork.S
> > >  create mode 100644 src/setjmp/xtensa/longjmp.S
> > >  create mode 100644 src/setjmp/xtensa/setjmp.S
> > >  create mode 100644 src/signal/xtensa/restore.s
> > >  create mode 100644 src/signal/xtensa/sigsetjmp.S
> > >  create mode 100644 src/thread/xtensa/__set_thread_area.S
> > >  create mode 100644 src/thread/xtensa/__unmapself.S
> > >  create mode 100644 src/thread/xtensa/clone.S
> > >  create mode 100644 src/thread/xtensa/syscall_cp.S
> > >
> > > diff --git a/arch/xtensa/arch.mak b/arch/xtensa/arch.mak
> > > new file mode 100644
> > > index 00000000..aa4d05ce
> > > --- /dev/null
> > > +++ b/arch/xtensa/arch.mak
> > > @@ -0,0 +1 @@
> > > +COMPAT_SRC_DIRS = compat/time32
> > > diff --git a/arch/xtensa/atomic_arch.h b/arch/xtensa/atomic_arch.h
> > > new file mode 100644
> > > index 00000000..34bf0704
> > > --- /dev/null
> > > +++ b/arch/xtensa/atomic_arch.h
> > > @@ -0,0 +1,27 @@
> > > +#include "bits/xtensa-config.h"
> >
> > This is not what bits headers are for; they are public-installed
> > headers that provide parts of the standard public headers that vary by
> > arch, and are never directly #include'able.
> >
> > If you really need arch-internal headers like this just put them in
> > the top-level dir of the arch. But in the case of these macros, either
> > just use the compiler-provided, __-prefixed ones directly, or if
> > they're not actually variable, don't inspect them at all.
> 
> I carry this header from the time when configuration-specific header
> file was the only available configuration option. __XCHAL_* macros
> only appeared in gcc-13, but since FDPIC support will only be
> available on toolchains that have these macros I guess I'll drop the
> header and use __XCHAL_* macros directly.
> 
> > > +
> > > +#if XCHAL_HAVE_S32C1I
> > > +#define a_cas a_cas
> > > +static inline int a_cas(volatile int *p, int t, int s)
> > > +{
> > > +     __asm__ __volatile__ (
> > > +             "       wsr     %2, scompare1\n"
> > > +             "       s32c1i  %0, %1\n"
> > > +             : "+a"(s), "+m"(*p)
> > > +             : "a"(t)
> > > +             : "memory" );
> > > +        return s;
> > > +}
> > > +#endif
> >
> > For example this is not a useful test, because there's nothing you can
> > do in the case where it's not defined;
> 
> This test is in the spirit of xtensa: it covers one possible
> configuration option.
> There's at least one other hardware option (exclusive access instructions)
> available for this functionality and a fallback to syscalls. They're not listed
> here because I tried to keep this initial implementation to a bare minimum.
> 
> > it's just not supportable
> > (unless there's some kernel fallback mechanism like ARM and SH have;
> > then it would make sense to hard-code the cas instruction as above if
> > the ISA level is known to have it, and otherwise do conditional
> > runtime selection (again, like ARM and SH).
> 
> Xtensa is quite limited on the runtime selection, because opcodes belonging
> to the options not enabled for a particular core just would not assemble.

That's not really a problem. On sh, we use .word to encode the
instructions that might not be present at the ISA level targeted,
which will only be conditionally executed. On ARM, I believe we just
asm directives to suppress the ISA level tagging of the object file so
that it can use the mnemonics for the conditionally-available
instructions. Generally, we find a way to work around whatever the
tooling is doing to make things difficult here. :-)

> > > diff --git a/arch/xtensa/bits/alltypes.h.in b/arch/xtensa/bits/alltypes..h.in
> > > new file mode 100644
> > > index 00000000..21221ce5
> > > --- /dev/null
> > > +++ b/arch/xtensa/bits/alltypes.h.in
> > > @@ -0,0 +1,27 @@
> > > +#define _REDIR_TIME64 1
> > > +#define _Addr int
> > > +#define _Int64 long long
> > > +#define _Reg int
> > > +
> > > +#if __XTENSA_EB__
> > > +#define __BYTE_ORDER 4321
> > > +#elif __XTENSA_EL__
> > > +#define __BYTE_ORDER 1234
> > > +#else
> > > +#error Unknown endianness
> > > +#endif
> > > +
> > > +#define __LONG_MAX 0x7fffffffL
> > > +
> > > +#ifndef __cplusplus
> > > +#ifdef __WCHAR_TYPE__
> > > +TYPEDEF __WCHAR_TYPE__ wchar_t;
> > > +#else
> > > +TYPEDEF unsigned wchar_t;
> > > +#endif
> > > +#endif
> > > +
> > > +TYPEDEF float float_t;
> > > +TYPEDEF double double_t;
> > > +
> > > +TYPEDEF struct { long __l __attribute__((aligned(__BIGGEST_ALIGNMENT__))); } max_align_t;
> >
> > It's best to avoid macros like __BIGGEST_ALIGNMENT__ here that could
> > change out from under us in the future, thereby quietly breaking the
> > ABI. Whatever the ABI is, that's what the alignment here should be,
> > and just using a type that naturally has the alignment if possible
> > rather than attributes.
> 
> The thing is: we don't know the name of the type with the widest
> alignment, as indeed in the future someone may come up with
> a configuration with an arbitrarily named type with an alignment
> as big as allowed physically. But then its their responsibility to
> keep the __BIGGEST_ALIGNMENT__ consistent across the
> updates of that configuration that intend to stay compatible with
> each other.

max_align_t is only a C-language contract for the standard C types.
There is no need for it to cover any sort of extension types, vectors,
etc. So it should be defined in terms of the C ABI types and needs to
be stable. If long long or double is already 8-byte aligned and long
double is the same as double, it should probably just be defined as
long long or a union of long long and long double or similar.

> > > diff --git a/arch/xtensa/reloc.h b/arch/xtensa/reloc.h
> > > new file mode 100644
> > > index 00000000..03482f18
> > > --- /dev/null
> > > +++ b/arch/xtensa/reloc.h
> > > @@ -0,0 +1,38 @@
> > > +#if __BYTE_ORDER == __BIG_ENDIAN
> > > +#define ENDIAN_SUFFIX "eb"
> > > +#else
> > > +#define ENDIAN_SUFFIX ""
> > > +#endif
> > > +
> > > +#if __FDPIC__
> > > +#define ABI_SUFFIX "-fdpic"
> > > +#else
> > > +#define ABI_SUFFIX ""
> > > +#endif
> > > +
> > > +#define LDSO_ARCH "xtensa" ENDIAN_SUFFIX ABI_SUFFIX
> >
> > Any actual incompatible ABI (as opposed to ISA level) needs its own
> > LDSO name.
> 
> There's no real ISA levels in the xtensa. Every CPU core configuration
> is its own unique ISA.

What does that practically mean? I think we're using different
terminology which is making it hard to communicate.

By "ISA level" I mean a node of a tree of possible extensions to a
baseline ISA, where machine code targeted for one node in the tree can
be executed on all descendants, and can be linked with machine code
targeting a particular descendant to produce code that can be executed
on that descendant.

This applies both statically and dynamically. For example, an
application built for a higher ISA level can be executed with
libc/ldso built for the baseline ISA (note: this sometimes requires
conditional logic in a few places like setjmp; see arm for example),
and an application built for the baseline ISA can be executed using
libc/ldso built for a higher ISA level.

I'm guessing you mean which features are supported is combinatorically
selectable as part of the core configuration, not a linear progression
of levels like i386, i486, etc. That's no problem. If I'm mistaken,
plese help me try to understand what you mean.

> > So if there are alternate (call0/windowed?) ABIs you want
> > to support, different names need to be defined for them here and in
> > configure.
> 
> I understand that it means that I should drop the endianness suffix,
> as the endianness of each core is fixed, but add windowed/call0
> discriminator.

If there are both le and be variants, then they need to have separate
names. Same for any other mutually incompatible ABIs.

> > Unless there's a good reason to support more than one ABI,
> > it probably just makes sense to pick the preferred one and have
> > configure error-out if the toolchain's ABI is incompatible.
> 
> I still have hope to support windowed FDPIC and then non-FDPIC
> variants.

Is there a reason to support both windowed and non-windowed? Sorry if
this is a dumb question; I don't know the background on it. If it's
possible to just pick one as better, and it can be supported on all
cores, that's probably the better choice.

> > > diff --git a/src/ldso/xtensa/tlsdesc.S b/src/ldso/xtensa/tlsdesc.S
> > > new file mode 100644
> > > index 00000000..084d5f3e
> > > --- /dev/null
> > > +++ b/src/ldso/xtensa/tlsdesc.S
> > > @@ -0,0 +1,36 @@
> > > +.global __tlsdesc_static
> > > +.hidden __tlsdesc_static
> > > +.type __tlsdesc_static,@function
> > > +.align 4
> > > +__tlsdesc_static:
> > > +#ifdef __XTENSA_WINDOWED_ABI__
> > > +     entry   a1, 16
> > > +#endif
> > > +     rur     a3, threadptr
> > > +     add     a2, a2, a3
> > > +#if defined(__XTENSA_WINDOWED_ABI__)
> > > +     retw
> > > +#elif defined(__XTENSA_CALL0_ABI__)
> > > +     ret
> > > +#else
> > > +#error Unsupported Xtensa ABI
> > > +#endif
> >
> > Are you sure the entry step makes sense with tlsdesc? Is this honoring
> > the ABI for what the tlsdesc function is allowed to clobber?
> 
> But the tlsdesc function is just a normal function, it can do whatever
> a regular function can do AFAIU. In addition there are no relaxations
> that can replace non-call code with a variant with a call.
> 
> 'entry' is the right way to begin a regular function in windowed xtensa ABI..

OK but this is not a regular function. It's a code fragment with
special ABI constraints and is intended to execute as fast as
possible, clobbering as little as possible. I don't understand exactly
what "entry" does, but I suspect it only makes sense if you're
actually introducing a "call frame" of some sort, like if you'll be
spilling and restoring call-saved registers or something (or doing
whatever the windowed-register-file equivalent is to achieve the same
thing) which does not seem to be needed here.

> > > +.hidden __tls_get_new
> > > +
> > > +.global __tlsdesc_dynamic
> > > +.hidden __tlsdesc_dynamic
> > > +.type __tlsdesc_dynamic,@function
> > > +.align 4
> > > +__tlsdesc_dynamic:
> > > +#if defined(__XTENSA_WINDOWED_ABI__)
> > > +     entry   a1, 16
> > > +     mov     a6, a2
> > > +     call4   __tls_get_addr
> > > +     mov     a2, a6
> > > +     retw
> > > +#elif defined(__XTENSA_CALL0_ABI__)
> > > +     j       __tls_get_addr
> > > +#else
> > > +#error Unsupported Xtensa ABI
> > > +#endif
> >
> > This approach is no longer a valid implementation for
> > __tlsdesc_dynamic, because it calls C code which could clobber any
> > call-clobbered registers, not just the ones the tlsdesc function is
> > allowed to clobber.
> >
> > Instead, it needs to actually do the dtv lookup in asm. musl does not
> > use any dynamic allocation here, so the lookup is guaranteed to just
> > work using the indices found with no conditional branches. See the
> > recently added riscv code for a clean example, in commit
> > 407aea628af8c81d9e3f5a068568f2217db71bba.
> 
> Yeah, this whole xtensa TLS code should be in the WIP pile.
> I'll rewrite it in assembly.

OK, just wanted you to be aware that this is something eventually
needing rewrite.

> > > diff --git a/src/signal/xtensa/sigsetjmp.S b/src/signal/xtensa/sigsetjmp.S
> > > new file mode 100644
> > > index 00000000..e44bba88
> > > --- /dev/null
> > > +++ b/src/signal/xtensa/sigsetjmp.S
> > > @@ -0,0 +1,24 @@
> > > +.global sigsetjmp
> > > +.global __sigsetjmp
> > > +.type sigsetjmp,@function
> > > +.type __sigsetjmp,@function
> > > +.align 4
> > > +sigsetjmp:
> > > +__sigsetjmp:
> > > +#if defined(__XTENSA_CALL0_ABI__)
> > > +     bnez    a3, 1f
> > > +.hidden ___setjmp
> > > +     j       ___setjmp
> > > +1:
> > > +     mov     a3, a0
> > > +     mov     a4, a2
> > > +     call0   ___setjmp
> > > +     mov     a0, a3
> > > +     mov     a3, a2
> > > +     mov     a2, a4
> > > +
> > > +.hidden __sigsetjmp_tail
> > > +     j       __sigsetjmp_tail
> > > +#else
> > > +#error Unsupported Xtensa ABI
> > > +#endif
> >
> > I don't think this code works. It does not seem to save the return
> > address and argument address before calling ___setjmp, so they will be
> > clobbered after it returns.
> 
> It does:
> 
> +     mov     a3, a0
> +     mov     a4, a2
> 
> do exactly this and ___setjmp is hidden, so it's always going to be
> the implementation above that doesn't clobber a3 and a4.

Nope, that's not how it works. setjmp returns twice, and the second
time it returns, it's *whatever code has executed since the first
return from setjmp, which most certainly will have clobbered
everything but the call-saved registers.

The return address needs to be saved in the space past the end of the
signal mask in the sigjmp_buf, and at least one call-saved register
also needs to be saved there, so that you can store the address of the
sigjmp_buf in a call-saved register and have it available after a
subsequent return from setjmp.

> > Eventually I'd like to move most functions like this, that are
> > currently external asm but have no good reason to be (unlike setjmp,
> > vfork, etc. which *do* have a good reason to be), to minimal inline
> > asm in C. Some archs' versions of this file would even just become
> > __syscall, no further asm.
> >
> > > diff --git a/src/thread/xtensa/__unmapself.S b/src/thread/xtensa/__unmapself.S
> > > new file mode 100644
> > > index 00000000..15bf2bdf
> > > --- /dev/null
> > > +++ b/src/thread/xtensa/__unmapself.S
> > > @@ -0,0 +1,13 @@
> > > +.global __unmapself
> > > +.type   __unmapself,@function
> > > +.align 4
> > > +__unmapself:
> > > +#if defined(__XTENSA_CALL0_ABI__)
> > > +     mov     a6, a2
> > > +     movi    a2, 81 # SYS_munmap
> > > +     syscall
> > > +     movi    a2, 118 # SYS_exit
> > > +     syscall
> > > +#else
> > > +#error Unsupported Xtensa ABI
> > > +#endif
> >
> > This is unsafe on fdpic/nommu if the kernel requires the task to
> > always have a valid stack.
> 
> How do I know if the kernel requires that?

I don't think it's explicitly documented. For sh I was vaguely
familiar at the ISA level that the sh1/sh2 cpu just pushes stuff to
the (existing, userspace) stack pointer on interrupts, which would
necessitate always having a valid stack. I also read the entry.S code
(or whatever file this part was split off into) for how kernelspace is
entered from traps/interrupts on nommu hardware, and saw how it
shuffled things around to make it look to the kernel more like a
conventional mmu-ful entry to kernelspace.

Maybe xtensa is better about this and always invokes the
trap/interrupt handler with the saved state in special registers
rather than pushing it to the stack. If so, this might not be a
consideration; normally, Linux switches the stack pointer to the
kernel stack before actually doing anything with it.

> > > >From f8c5953ecdb948282cc8e573b729c25db60a95a8 Mon Sep 17 00:00:00 2001
> > > From: Max Filippov <jcmvbkbc@gmail.com>
> > > Date: Wed, 21 Feb 2024 08:27:37 -0800
> > > Subject: [PATCH 2/2] WIP
> > >
> > > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > > ---
> > >  ldso/dlstart.c         | 7 +++++++
> > >  ldso/dynlink.c         | 6 ++++--
> > >  src/internal/dynlink.h | 5 +++--
> > >  3 files changed, 14 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/ldso/dlstart.c b/ldso/dlstart.c
> > > index 259f5e18..beca953f 100644
> > > --- a/ldso/dlstart.c
> > > +++ b/ldso/dlstart.c
> > > @@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
> > >                               - segs[rel_addr[1]].p_vaddr
> > >                               + syms[R_SYM(rel[1])].st_value;
> > >                       rel_addr[1] = dyn[DT_PLTGOT];
> > > +             } else if (R_TYPE(rel[1]) == REL_RELATIVE) {
> > > +                     size_t val = *rel_addr;
> > > +                     for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > > +                     *rel_addr += segs[j].addr - segs[j].p_vaddr;
> >
> > So xtensa has a "relative" reloc type that's just adjusted by the load
> > offset of the segment the relocation lives in, rather than needing to
> > use a symbolic relocation referencing a section symbol like other
> > fdpic archs do?
> 
> I was looking at the ARM BFD code while doing that and
> my impression was that they do the same.
> Regardless, I wonder why either relocation form might be preferrable?

I think the relative type here is perfectly acceptable. At first I
thought it was weaker (less capable of representing addresses of
objects in different segments), but looking again, I don't think
that's the case.

> > If so, this looks right and looks ok.
> >
> > >               } else {
> > >                       size_t val = syms[R_SYM(rel[1])].st_value;
> > >                       for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > >                       *rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
> > >               }
> > >       }
> > > +#ifdef __xtensa__
> > > +     ((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
> > > +#endif
> >
> > Is this actually needed for anything? Generally musl doesn't use the
> > reserved GOT slots itself, and on all the other archs I'm aware of,
> > they're essentially reserved to the dynamic linker implementation so
> > the dynamic linker is just free not to use them and not to set them
> > up.
> 
> xtensa doesn't have relative register jumps and calls, so local jumps
> and calls to a far off locations need to use absolute target addresses.
> One possible solution is to have the address in the GOT entry, the
> other is to calculate the target address using the text segment load
> offset at runtime. Both have the same instruction count, see
>   http://wiki.osll.ru/doku.php/etc:users:jcmvbkbc:binutils-xtensa#local_call
> for the details, but the latter doesn't waste GOT space and that saves
> a noticeable amount of RAM.

I see. Doesn't this limit you to a single text segment, though? In
practice it might not matter, but it's more constraining than fdpic
was designed to be.

Is there a reason local, relative jumps/calls can't be done by
computing the address of a nearby label and using the offset relative
to that? That's how they (at least some types; I forget the details)
work on sh/fdpic.

> > >  #else
> > >       /* If the dynamic linker is invoked as a command, its load
> > >        * address is not available in the aux vector. Instead, compute
> > > diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> > > index ceca3c98..25563af3 100644
> > > --- a/ldso/dynlink.c
> > > +++ b/ldso/dynlink.c
> > > @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
> > >               if (!DL_FDPIC)
> > >                       do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
> > >
> > > +#if 0
> > >               if (head != &ldso && p->relro_start != p->relro_end) {
> > >                       long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
> > >                               p->relro_end-p->relro_start, PROT_READ);
> > > @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
> > >                               if (runtime) longjmp(*rtld_fail, 1);
> > >                       }
> > >               }
> > > +#endif
> >
> > Was this breaking something? Relro linking probably should have been
> > disabled on fdpic, and we ignore ENOSYS anyway for nommu.
> 
> Yeah, I do some of the testing in linux-user QEMU, it has MMU
> and mprotect calls actually succeed. Failures were coming from the
> relocation code, but my impression was that relro should be applied
> after the relocation completion and it should just work, hence the
> WIP pile.

Yes, relro is only supposed to be applied after all relocations were
done.

> 
> > >               p->relocated = 1;
> > >       }
> > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > -                     while (n--) ((void (*)(void))*--fn)();
> > > +                     while (n--) fpaddr(p, *--fn)();
> >
> > If this is fixable on the tooling side it really should be fixed
> > there. init/fini arrays should have actual language-level function
> > addresses (descriptor addresses on fdpic), not instruction addresses.
> 
> I read libgcc code at
>   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> and the way it's written suggests that this was done on purpose.
> I put it into the WIP pile to figure out later what the purpose was.
> I thought that SH might not have this issue because it just didn't
> use the .array_init/.array_fini.

I'm pretty sure we're using it -- musl-cross-make always forces it on
via the gcc configure command line -- but it's possible there's some
override disabling it for sh. I'll try some test cases and confirm
whether sh is doing it right. Maybe the arm folks will have input on
this too..?

> > If it's not fixable at this point, I guess we need the arch's reloc.h
> > to define a macro signaling this difference in behavior.
> >
> > I have no idea how this would be expected to work on static linked
> > programs...
> 
> That's true it's not working in this code version.

And I would really not like to add hackish code to the static init
code that's special-casing some archs that violate the gABI
requirement that these arrays contain function pointers. Let's see if
it can be fixed.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:30     ` Rich Felker
@ 2024-02-28 18:37       ` Rich Felker
  2024-02-28 19:34         ` Max Filippov
  2024-02-28 21:28       ` Max Filippov
                         ` (3 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: Rich Felker @ 2024-02-28 18:37 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > 
> > > >               p->relocated = 1;
> > > >       }
> > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > +                     while (n--) fpaddr(p, *--fn)();
> > >
> > > If this is fixable on the tooling side it really should be fixed
> > > there. init/fini arrays should have actual language-level function
> > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > 
> > I read libgcc code at
> >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > and the way it's written suggests that this was done on purpose.
> > I put it into the WIP pile to figure out later what the purpose was.
> > I thought that SH might not have this issue because it just didn't
> > use the .array_init/.array_fini.
> 
> I'm pretty sure we're using it -- musl-cross-make always forces it on
> via the gcc configure command line -- but it's possible there's some
> override disabling it for sh. I'll try some test cases and confirm
> whether sh is doing it right. Maybe the arm folks will have input on
> this too..?

Confirmed both that it works, and that it's working via init_array.
GCC emits:

	.section        .init_array,"aw"
	.align 2
	.long   foo@FUNCDESC

for

	__attribute__((__constructor__))
	void foo() { ... }

Also, FWIW, I believe there's something of an application-facing
contract that you can declare function pointer arrays with
__attribute__((__section__(".init_array"))) and have them work, which
would not work if instruction addresses rather than function addresses
are expected to be there.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:37       ` Rich Felker
@ 2024-02-28 19:34         ` Max Filippov
  2024-02-28 19:41           ` Max Filippov
  0 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-28 19:34 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 10:36 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > >
> > > > >               p->relocated = 1;
> > > > >       }
> > > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > > +                     while (n--) fpaddr(p, *--fn)();
> > > >
> > > > If this is fixable on the tooling side it really should be fixed
> > > > there. init/fini arrays should have actual language-level function
> > > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > >
> > > I read libgcc code at
> > >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > > and the way it's written suggests that this was done on purpose.
> > > I put it into the WIP pile to figure out later what the purpose was.
> > > I thought that SH might not have this issue because it just didn't
> > > use the .array_init/.array_fini.
> >
> > I'm pretty sure we're using it -- musl-cross-make always forces it on
> > via the gcc configure command line -- but it's possible there's some
> > override disabling it for sh. I'll try some test cases and confirm
> > whether sh is doing it right. Maybe the arm folks will have input on
> > this too..?
>
> Confirmed both that it works, and that it's working via init_array.
> GCC emits:
>
>         .section        .init_array,"aw"
>         .align 2
>         .long   foo@FUNCDESC
>
> for
>
>         __attribute__((__constructor__))
>         void foo() { ... }
>

Oh, no doubt that that C code generates a function descriptor, it
works for xtensa too. But the piece of libgcc quoted above specifically
puts a pointer to an object, not to a function into the .init_array.

> Also, FWIW, I believe there's something of an application-facing
> contract that you can declare function pointer arrays with
> __attribute__((__section__(".init_array"))) and have them work, which
> would not work if instruction addresses rather than function addresses
> are expected to be there.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 19:34         ` Max Filippov
@ 2024-02-28 19:41           ` Max Filippov
  2024-02-28 20:14             ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-28 19:41 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 11:34 AM Max Filippov <jcmvbkbc@gmail.com> wrote:
>
> On Wed, Feb 28, 2024 at 10:36 AM Rich Felker <dalias@libc.org> wrote:
> > On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> > > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > >
> > > > > >               p->relocated = 1;
> > > > > >       }
> > > > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > > > +                     while (n--) fpaddr(p, *--fn)();
> > > > >
> > > > > If this is fixable on the tooling side it really should be fixed
> > > > > there. init/fini arrays should have actual language-level function
> > > > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > > >
> > > > I read libgcc code at
> > > >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > > > and the way it's written suggests that this was done on purpose.
> > > > I put it into the WIP pile to figure out later what the purpose was.
> > > > I thought that SH might not have this issue because it just didn't
> > > > use the .array_init/.array_fini.
> > >
> > > I'm pretty sure we're using it -- musl-cross-make always forces it on
> > > via the gcc configure command line -- but it's possible there's some
> > > override disabling it for sh. I'll try some test cases and confirm
> > > whether sh is doing it right. Maybe the arm folks will have input on
> > > this too..?
> >
> > Confirmed both that it works, and that it's working via init_array.
> > GCC emits:
> >
> >         .section        .init_array,"aw"
> >         .align 2
> >         .long   foo@FUNCDESC
> >
> > for
> >
> >         __attribute__((__constructor__))
> >         void foo() { ... }
> >
>
> Oh, no doubt that that C code generates a function descriptor, it
> works for xtensa too. But the piece of libgcc quoted above specifically
> puts a pointer to an object, not to a function into the .init_array.

It was introduced to gcc by the ARM FDPIC series:
https://github.com/jcmvbkbc/gcc-xtensa/commit/11189793b6ef60645d5d1126d0bd9d0dd83e6583

This is the second change that I find made by the ARM FDPIC
series that appears to be not right for other FDPIC ports, first
being this change to the C++ unwinding code:
https://github.com/jcmvbkbc/gcc-xtensa/commit/67b0605494f32811364e25328d3522467aaf0638

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 19:41           ` Max Filippov
@ 2024-02-28 20:14             ` Rich Felker
  2024-02-28 20:26               ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Rich Felker @ 2024-02-28 20:14 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Wed, Feb 28, 2024 at 11:41:30AM -0800, Max Filippov wrote:
> On Wed, Feb 28, 2024 at 11:34 AM Max Filippov <jcmvbkbc@gmail.com> wrote:
> >
> > On Wed, Feb 28, 2024 at 10:36 AM Rich Felker <dalias@libc.org> wrote:
> > > On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> > > > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > > >
> > > > > > >               p->relocated = 1;
> > > > > > >       }
> > > > > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > > > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > > > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > > > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > > > > +                     while (n--) fpaddr(p, *--fn)();
> > > > > >
> > > > > > If this is fixable on the tooling side it really should be fixed
> > > > > > there. init/fini arrays should have actual language-level function
> > > > > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > > > >
> > > > > I read libgcc code at
> > > > >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > > > > and the way it's written suggests that this was done on purpose.
> > > > > I put it into the WIP pile to figure out later what the purpose was..
> > > > > I thought that SH might not have this issue because it just didn't
> > > > > use the .array_init/.array_fini.
> > > >
> > > > I'm pretty sure we're using it -- musl-cross-make always forces it on
> > > > via the gcc configure command line -- but it's possible there's some
> > > > override disabling it for sh. I'll try some test cases and confirm
> > > > whether sh is doing it right. Maybe the arm folks will have input on
> > > > this too..?
> > >
> > > Confirmed both that it works, and that it's working via init_array.
> > > GCC emits:
> > >
> > >         .section        .init_array,"aw"
> > >         .align 2
> > >         .long   foo@FUNCDESC
> > >
> > > for
> > >
> > >         __attribute__((__constructor__))
> > >         void foo() { ... }
> > >
> >
> > Oh, no doubt that that C code generates a function descriptor, it
> > works for xtensa too. But the piece of libgcc quoted above specifically
> > puts a pointer to an object, not to a function into the .init_array.
> 
> It was introduced to gcc by the ARM FDPIC series:
> https://github.com/jcmvbkbc/gcc-xtensa/commit/11189793b6ef60645d5d1126d0bd9d0dd83e6583
> 
> This is the second change that I find made by the ARM FDPIC
> series that appears to be not right for other FDPIC ports, first
> being this change to the C++ unwinding code:
> https://github.com/jcmvbkbc/gcc-xtensa/commit/67b0605494f32811364e25328d3522467aaf0638

OK, so the arm folks put explicitly wrong/broken code here. That needs
to be reverted, and they can work out the mess they created on glibc.

There is probably wrong arm target code too whereby gcc is generating
instruction addresses for __attribute__((__constructor__)) rather than
function addresses.

If they have compat to worry about with glibc binaries, that's going
to be a mess for them to fix, but we can just patch it out for musl
target regardless of what they do since we have no existing broken
binaries.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 20:14             ` Rich Felker
@ 2024-02-28 20:26               ` Rich Felker
  2024-02-28 20:37                 ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Rich Felker @ 2024-02-28 20:26 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Wed, Feb 28, 2024 at 03:14:12PM -0500, Rich Felker wrote:
> On Wed, Feb 28, 2024 at 11:41:30AM -0800, Max Filippov wrote:
> > On Wed, Feb 28, 2024 at 11:34 AM Max Filippov <jcmvbkbc@gmail.com> wrote:
> > >
> > > On Wed, Feb 28, 2024 at 10:36 AM Rich Felker <dalias@libc.org> wrote:
> > > > On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> > > > > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > > > >
> > > > > > > >               p->relocated = 1;
> > > > > > > >       }
> > > > > > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > > > > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > > > > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > > > > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > > > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > > > > > +                     while (n--) fpaddr(p, *--fn)();
> > > > > > >
> > > > > > > If this is fixable on the tooling side it really should be fixed
> > > > > > > there. init/fini arrays should have actual language-level function
> > > > > > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > > > > >
> > > > > > I read libgcc code at
> > > > > >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > > > > > and the way it's written suggests that this was done on purpose.
> > > > > > I put it into the WIP pile to figure out later what the purpose was..
> > > > > > I thought that SH might not have this issue because it just didn't
> > > > > > use the .array_init/.array_fini.
> > > > >
> > > > > I'm pretty sure we're using it -- musl-cross-make always forces it on
> > > > > via the gcc configure command line -- but it's possible there's some
> > > > > override disabling it for sh. I'll try some test cases and confirm
> > > > > whether sh is doing it right. Maybe the arm folks will have input on
> > > > > this too..?
> > > >
> > > > Confirmed both that it works, and that it's working via init_array.
> > > > GCC emits:
> > > >
> > > >         .section        .init_array,"aw"
> > > >         .align 2
> > > >         .long   foo@FUNCDESC
> > > >
> > > > for
> > > >
> > > >         __attribute__((__constructor__))
> > > >         void foo() { ... }
> > > >
> > >
> > > Oh, no doubt that that C code generates a function descriptor, it
> > > works for xtensa too. But the piece of libgcc quoted above specifically
> > > puts a pointer to an object, not to a function into the .init_array.
> > 
> > It was introduced to gcc by the ARM FDPIC series:
> > https://github.com/jcmvbkbc/gcc-xtensa/commit/11189793b6ef60645d5d1126d0bd9d0dd83e6583
> > 
> > This is the second change that I find made by the ARM FDPIC
> > series that appears to be not right for other FDPIC ports, first
> > being this change to the C++ unwinding code:
> > https://github.com/jcmvbkbc/gcc-xtensa/commit/67b0605494f32811364e25328d3522467aaf0638
> 
> OK, so the arm folks put explicitly wrong/broken code here. That needs
> to be reverted, and they can work out the mess they created on glibc.
> 
> There is probably wrong arm target code too whereby gcc is generating
> instruction addresses for __attribute__((__constructor__)) rather than
> function addresses.
> 
> If they have compat to worry about with glibc binaries, that's going
> to be a mess for them to fix, but we can just patch it out for musl
> target regardless of what they do since we have no existing broken
> binaries.

OK, really good news! They didn't actually botch it. This test on
godbolt shows GCC is doing the right thing:

https://godbolt.org/z/b53ExYoPf

(GCC 13.1.0 ARM, -O2 -mfdpic, following code)

#include <stdio.h>
__attribute__((__constructor__))
void foo()
{
	printf("hello ");
}

int main()
{
	printf("world\n");
}

Rather, someone just made the crtstuff gratuitously do the wrong
thing, then later hard-coded disabling initfiniarray on fdpic because
it was doing the wrong thing:

https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=9c560cf23996271ee26dfc4a1d8484b85173cd12;hp=6bcbf80c6e2bd8a60d88bbcac3d70ffb67f4888f

So all that's needed is to revert the wrong patch to crtstuff and then
xtensa and arm initfiniarray will work as they should.

Anyone want to open a GCC bug tracker item for this?

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 20:26               ` Rich Felker
@ 2024-02-28 20:37                 ` Rich Felker
  0 siblings, 0 replies; 17+ messages in thread
From: Rich Felker @ 2024-02-28 20:37 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Wed, Feb 28, 2024 at 03:26:45PM -0500, Rich Felker wrote:
> On Wed, Feb 28, 2024 at 03:14:12PM -0500, Rich Felker wrote:
> > On Wed, Feb 28, 2024 at 11:41:30AM -0800, Max Filippov wrote:
> > > On Wed, Feb 28, 2024 at 11:34 AM Max Filippov <jcmvbkbc@gmail.com> wrote:
> > > >
> > > > On Wed, Feb 28, 2024 at 10:36 AM Rich Felker <dalias@libc.org> wrote:
> > > > > On Wed, Feb 28, 2024 at 01:30:32PM -0500, Rich Felker wrote:
> > > > > > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > > > > >
> > > > > > > > >               p->relocated = 1;
> > > > > > > > >       }
> > > > > > > > > @@ -1485,7 +1487,7 @@ void __libc_exit_fini()
> > > > > > > > >               if (dyn[0] & (1<<DT_FINI_ARRAY)) {
> > > > > > > > >                       size_t n = dyn[DT_FINI_ARRAYSZ]/sizeof(size_t);
> > > > > > > > >                       size_t *fn = (size_t *)laddr(p, dyn[DT_FINI_ARRAY])+n;
> > > > > > > > > -                     while (n--) ((void (*)(void))*--fn)();
> > > > > > > > > +                     while (n--) fpaddr(p, *--fn)();
> > > > > > > >
> > > > > > > > If this is fixable on the tooling side it really should be fixed
> > > > > > > > there. init/fini arrays should have actual language-level function
> > > > > > > > addresses (descriptor addresses on fdpic), not instruction addresses.
> > > > > > >
> > > > > > > I read libgcc code at
> > > > > > >   https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/libgcc/crtstuff.c#L498-L503
> > > > > > > and the way it's written suggests that this was done on purpose.
> > > > > > > I put it into the WIP pile to figure out later what the purpose was..
> > > > > > > I thought that SH might not have this issue because it just didn't
> > > > > > > use the .array_init/.array_fini.
> > > > > >
> > > > > > I'm pretty sure we're using it -- musl-cross-make always forces it on
> > > > > > via the gcc configure command line -- but it's possible there's some
> > > > > > override disabling it for sh. I'll try some test cases and confirm
> > > > > > whether sh is doing it right. Maybe the arm folks will have input on
> > > > > > this too..?
> > > > >
> > > > > Confirmed both that it works, and that it's working via init_array.
> > > > > GCC emits:
> > > > >
> > > > >         .section        .init_array,"aw"
> > > > >         .align 2
> > > > >         .long   foo@FUNCDESC
> > > > >
> > > > > for
> > > > >
> > > > >         __attribute__((__constructor__))
> > > > >         void foo() { ... }
> > > > >
> > > >
> > > > Oh, no doubt that that C code generates a function descriptor, it
> > > > works for xtensa too. But the piece of libgcc quoted above specifically
> > > > puts a pointer to an object, not to a function into the .init_array.
> > > 
> > > It was introduced to gcc by the ARM FDPIC series:
> > > https://github.com/jcmvbkbc/gcc-xtensa/commit/11189793b6ef60645d5d1126d0bd9d0dd83e6583
> > > 
> > > This is the second change that I find made by the ARM FDPIC
> > > series that appears to be not right for other FDPIC ports, first
> > > being this change to the C++ unwinding code:
> > > https://github.com/jcmvbkbc/gcc-xtensa/commit/67b0605494f32811364e25328d3522467aaf0638
> > 
> > OK, so the arm folks put explicitly wrong/broken code here. That needs
> > to be reverted, and they can work out the mess they created on glibc.
> > 
> > There is probably wrong arm target code too whereby gcc is generating
> > instruction addresses for __attribute__((__constructor__)) rather than
> > function addresses.
> > 
> > If they have compat to worry about with glibc binaries, that's going
> > to be a mess for them to fix, but we can just patch it out for musl
> > target regardless of what they do since we have no existing broken
> > binaries.
> 
> OK, really good news! They didn't actually botch it. This test on
> godbolt shows GCC is doing the right thing:
> 
> https://godbolt.org/z/b53ExYoPf
> 
> (GCC 13.1.0 ARM, -O2 -mfdpic, following code)
> 
> #include <stdio.h>
> __attribute__((__constructor__))
> void foo()
> {
> 	printf("hello ");
> }
> 
> int main()
> {
> 	printf("world\n");
> }
> 
> Rather, someone just made the crtstuff gratuitously do the wrong
> thing, then later hard-coded disabling initfiniarray on fdpic because
> it was doing the wrong thing:
> 
> https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=9c560cf23996271ee26dfc4a1d8484b85173cd12;hp=6bcbf80c6e2bd8a60d88bbcac3d70ffb67f4888f
> 
> So all that's needed is to revert the wrong patch to crtstuff and then
> xtensa and arm initfiniarray will work as they should.
> 
> Anyone want to open a GCC bug tracker item for this?

Filed as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114158

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:30     ` Rich Felker
  2024-02-28 18:37       ` Rich Felker
@ 2024-02-28 21:28       ` Max Filippov
  2024-02-29 12:03         ` Max Filippov
  2024-02-29 16:25       ` Max Filippov
                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-28 21:28 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
>
> On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > Hi Rich,
> >
> > On Tue, Feb 27, 2024 at 4:12 PM Rich Felker <dalias@libc.org> wrote:
> > >
> > > First of all, I'm excited to see new work on this and especially the
> > > FDPIC ABI. Not only is xtensa a very relevant arch I've been hoping we
> > > could support for some time now, but having FDPIC will help getting
> > > the parts of musl's FDPIC support that are currently making
> > > sh-specific assumptions worked out so that we can get other new FDPIC
> > > ports added too (particularly, arm and riscv).
> > >
> > > I'm not up-to-date on what toolchain status/stability is or how close
> > > this is to being intended as upstreamable, but since I was pointed to
> > > it and took a look, I wanted to record my initial review thoughts here
> > > so they don't get lost and to hopefully move this closer to ready for
> > > upstream.
> >
> > Thanks for your review.
> > I expect to get toolchain components into shape for upstreaming during
> > this year.
> >
> > > Review follows inline:
> > >
> > > On Tue, Feb 27, 2024 at 06:24:31PM -0500, Rich Felker wrote:
> > > > Attached patches were pulled from
> > > > https://github.com/jcmvbkbc/musl-xtensa/commits/xtensa-1.2.4-fdpic/
> > > >
> > > > I'll reply to comment inline.
> > >
> > > > >From 868b34272f414323f10528d5b9988886541cb1c0 Mon Sep 17 00:00:00 2001
> > > > From: Max Filippov <jcmvbkbc@gmail.com>
> > > > Date: Tue, 22 Mar 2016 02:35:58 +0300
> > > > Subject: [PATCH 1/2] xtensa: add port
> > > >
> > > > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > > > ---
> > > >  arch/xtensa/arch.mak                  |   1 +
> > > >  arch/xtensa/atomic_arch.h             |  27 ++
> > > >  arch/xtensa/bits/alltypes.h.in        |  27 ++
> > > >  arch/xtensa/bits/float.h              |  16 ++
> > > >  arch/xtensa/bits/ioctl.h              | 219 ++++++++++++++
> > > >  arch/xtensa/bits/limits.h             |   1 +
> > > >  arch/xtensa/bits/mman.h               |  68 +++++
> > > >  arch/xtensa/bits/posix.h              |   2 +
> > > >  arch/xtensa/bits/reg.h                |   2 +
> > > >  arch/xtensa/bits/setjmp.h             |   5 +
> > > >  arch/xtensa/bits/signal.h             |  88 ++++++
> > > >  arch/xtensa/bits/stat.h               |  24 ++
> > > >  arch/xtensa/bits/stdint.h             |  20 ++
> > > >  arch/xtensa/bits/syscall.h.in         | 396 ++++++++++++++++++++++++++
> > > >  arch/xtensa/bits/termios.h            | 167 +++++++++++
> > > >  arch/xtensa/bits/user.h               |   4 +
> > > >  arch/xtensa/bits/xtensa-config.h      |   8 +
> > > >  arch/xtensa/crt_arch.h                |  48 ++++
> > > >  arch/xtensa/kstat.h                   |  18 ++
> > > >  arch/xtensa/pthread_arch.h            |  11 +
> > > >  arch/xtensa/reloc.h                   |  38 +++
> > > >  arch/xtensa/syscall_arch.h            | 104 +++++++
> > > >  configure                             |   1 +
> > > >  crt/xtensa/crti.S                     |  29 ++
> > > >  crt/xtensa/crtn.S                     |  23 ++
> > > >  include/elf.h                         |  70 +++++
> > > >  src/internal/xtensa/syscall.S         |  25 ++
> > > >  src/ldso/xtensa/tlsdesc.S             |  36 +++
> > > >  src/process/xtensa/vfork.S            |  21 ++
> > > >  src/setjmp/xtensa/longjmp.S           |  22 ++
> > > >  src/setjmp/xtensa/setjmp.S            |  25 ++
> > > >  src/signal/xtensa/restore.s           |  10 +
> > > >  src/signal/xtensa/sigsetjmp.S         |  24 ++
> > > >  src/thread/xtensa/__set_thread_area.S |  16 ++
> > > >  src/thread/xtensa/__unmapself.S       |  13 +
> > > >  src/thread/xtensa/clone.S             |  46 +++
> > > >  src/thread/xtensa/syscall_cp.S        |  38 +++
> > > >  37 files changed, 1693 insertions(+)
> > > >  create mode 100644 arch/xtensa/arch.mak
> > > >  create mode 100644 arch/xtensa/atomic_arch.h
> > > >  create mode 100644 arch/xtensa/bits/alltypes.h.in
> > > >  create mode 100644 arch/xtensa/bits/float.h
> > > >  create mode 100644 arch/xtensa/bits/ioctl.h
> > > >  create mode 100644 arch/xtensa/bits/limits.h
> > > >  create mode 100644 arch/xtensa/bits/mman.h
> > > >  create mode 100644 arch/xtensa/bits/posix.h
> > > >  create mode 100644 arch/xtensa/bits/reg.h
> > > >  create mode 100644 arch/xtensa/bits/setjmp.h
> > > >  create mode 100644 arch/xtensa/bits/signal.h
> > > >  create mode 100644 arch/xtensa/bits/stat.h
> > > >  create mode 100644 arch/xtensa/bits/stdint.h
> > > >  create mode 100644 arch/xtensa/bits/syscall.h.in
> > > >  create mode 100644 arch/xtensa/bits/termios.h
> > > >  create mode 100644 arch/xtensa/bits/user.h
> > > >  create mode 100644 arch/xtensa/bits/xtensa-config.h
> > > >  create mode 100644 arch/xtensa/crt_arch.h
> > > >  create mode 100644 arch/xtensa/kstat.h
> > > >  create mode 100644 arch/xtensa/pthread_arch.h
> > > >  create mode 100644 arch/xtensa/reloc.h
> > > >  create mode 100644 arch/xtensa/syscall_arch.h
> > > >  create mode 100644 crt/xtensa/crti.S
> > > >  create mode 100644 crt/xtensa/crtn.S
> > > >  create mode 100644 src/internal/xtensa/syscall.S
> > > >  create mode 100644 src/ldso/xtensa/tlsdesc.S
> > > >  create mode 100644 src/process/xtensa/vfork.S
> > > >  create mode 100644 src/setjmp/xtensa/longjmp.S
> > > >  create mode 100644 src/setjmp/xtensa/setjmp.S
> > > >  create mode 100644 src/signal/xtensa/restore.s
> > > >  create mode 100644 src/signal/xtensa/sigsetjmp.S
> > > >  create mode 100644 src/thread/xtensa/__set_thread_area.S
> > > >  create mode 100644 src/thread/xtensa/__unmapself.S
> > > >  create mode 100644 src/thread/xtensa/clone.S
> > > >  create mode 100644 src/thread/xtensa/syscall_cp.S
> > > >
> > > > diff --git a/arch/xtensa/arch.mak b/arch/xtensa/arch.mak
> > > > new file mode 100644
> > > > index 00000000..aa4d05ce
> > > > --- /dev/null
> > > > +++ b/arch/xtensa/arch.mak
> > > > @@ -0,0 +1 @@
> > > > +COMPAT_SRC_DIRS = compat/time32
> > > > diff --git a/arch/xtensa/atomic_arch.h b/arch/xtensa/atomic_arch.h
> > > > new file mode 100644
> > > > index 00000000..34bf0704
> > > > --- /dev/null
> > > > +++ b/arch/xtensa/atomic_arch.h
> > > > @@ -0,0 +1,27 @@
> > > > +#include "bits/xtensa-config.h"
> > >
> > > This is not what bits headers are for; they are public-installed
> > > headers that provide parts of the standard public headers that vary by
> > > arch, and are never directly #include'able.
> > >
> > > If you really need arch-internal headers like this just put them in
> > > the top-level dir of the arch. But in the case of these macros, either
> > > just use the compiler-provided, __-prefixed ones directly, or if
> > > they're not actually variable, don't inspect them at all.
> >
> > I carry this header from the time when configuration-specific header
> > file was the only available configuration option. __XCHAL_* macros
> > only appeared in gcc-13, but since FDPIC support will only be
> > available on toolchains that have these macros I guess I'll drop the
> > header and use __XCHAL_* macros directly.
> >
> > > > +
> > > > +#if XCHAL_HAVE_S32C1I
> > > > +#define a_cas a_cas
> > > > +static inline int a_cas(volatile int *p, int t, int s)
> > > > +{
> > > > +     __asm__ __volatile__ (
> > > > +             "       wsr     %2, scompare1\n"
> > > > +             "       s32c1i  %0, %1\n"
> > > > +             : "+a"(s), "+m"(*p)
> > > > +             : "a"(t)
> > > > +             : "memory" );
> > > > +        return s;
> > > > +}
> > > > +#endif
> > >
> > > For example this is not a useful test, because there's nothing you can
> > > do in the case where it's not defined;
> >
> > This test is in the spirit of xtensa: it covers one possible
> > configuration option.
> > There's at least one other hardware option (exclusive access instructions)
> > available for this functionality and a fallback to syscalls. They're not listed
> > here because I tried to keep this initial implementation to a bare minimum.
> >
> > > it's just not supportable
> > > (unless there's some kernel fallback mechanism like ARM and SH have;
> > > then it would make sense to hard-code the cas instruction as above if
> > > the ISA level is known to have it, and otherwise do conditional
> > > runtime selection (again, like ARM and SH).
> >
> > Xtensa is quite limited on the runtime selection, because opcodes belonging
> > to the options not enabled for a particular core just would not assemble.
>
> That's not really a problem. On sh, we use .word to encode the
> instructions that might not be present at the ISA level targeted,
> which will only be conditionally executed. On ARM, I believe we just
> asm directives to suppress the ISA level tagging of the object file so
> that it can use the mnemonics for the conditionally-available
> instructions. Generally, we find a way to work around whatever the
> tooling is doing to make things difficult here. :-)

On xtensa there's no guarantee that a particular byte sequence represents
the same instruction in different configurations.

> > > > diff --git a/arch/xtensa/bits/alltypes.h.in b/arch/xtensa/bits/alltypes..h.in
> > > > new file mode 100644
> > > > index 00000000..21221ce5
> > > > --- /dev/null
> > > > +++ b/arch/xtensa/bits/alltypes.h.in
> > > > @@ -0,0 +1,27 @@
> > > > +#define _REDIR_TIME64 1
> > > > +#define _Addr int
> > > > +#define _Int64 long long
> > > > +#define _Reg int
> > > > +
> > > > +#if __XTENSA_EB__
> > > > +#define __BYTE_ORDER 4321
> > > > +#elif __XTENSA_EL__
> > > > +#define __BYTE_ORDER 1234
> > > > +#else
> > > > +#error Unknown endianness
> > > > +#endif
> > > > +
> > > > +#define __LONG_MAX 0x7fffffffL
> > > > +
> > > > +#ifndef __cplusplus
> > > > +#ifdef __WCHAR_TYPE__
> > > > +TYPEDEF __WCHAR_TYPE__ wchar_t;
> > > > +#else
> > > > +TYPEDEF unsigned wchar_t;
> > > > +#endif
> > > > +#endif
> > > > +
> > > > +TYPEDEF float float_t;
> > > > +TYPEDEF double double_t;
> > > > +
> > > > +TYPEDEF struct { long __l __attribute__((aligned(__BIGGEST_ALIGNMENT__))); } max_align_t;
> > >
> > > It's best to avoid macros like __BIGGEST_ALIGNMENT__ here that could
> > > change out from under us in the future, thereby quietly breaking the
> > > ABI. Whatever the ABI is, that's what the alignment here should be,
> > > and just using a type that naturally has the alignment if possible
> > > rather than attributes.
> >
> > The thing is: we don't know the name of the type with the widest
> > alignment, as indeed in the future someone may come up with
> > a configuration with an arbitrarily named type with an alignment
> > as big as allowed physically. But then its their responsibility to
> > keep the __BIGGEST_ALIGNMENT__ consistent across the
> > updates of that configuration that intend to stay compatible with
> > each other.
>
> max_align_t is only a C-language contract for the standard C types.
> There is no need for it to cover any sort of extension types, vectors,
> etc. So it should be defined in terms of the C ABI types and needs to
> be stable. If long long or double is already 8-byte aligned and long
> double is the same as double, it should probably just be defined as
> long long or a union of long long and long double or similar.

Ok, somehow I missed that this name comes from the standard.

> > > > diff --git a/arch/xtensa/reloc.h b/arch/xtensa/reloc.h
> > > > new file mode 100644
> > > > index 00000000..03482f18
> > > > --- /dev/null
> > > > +++ b/arch/xtensa/reloc.h
> > > > @@ -0,0 +1,38 @@
> > > > +#if __BYTE_ORDER == __BIG_ENDIAN
> > > > +#define ENDIAN_SUFFIX "eb"
> > > > +#else
> > > > +#define ENDIAN_SUFFIX ""
> > > > +#endif
> > > > +
> > > > +#if __FDPIC__
> > > > +#define ABI_SUFFIX "-fdpic"
> > > > +#else
> > > > +#define ABI_SUFFIX ""
> > > > +#endif
> > > > +
> > > > +#define LDSO_ARCH "xtensa" ENDIAN_SUFFIX ABI_SUFFIX
> > >
> > > Any actual incompatible ABI (as opposed to ISA level) needs its own
> > > LDSO name.
> >
> > There's no real ISA levels in the xtensa. Every CPU core configuration
> > is its own unique ISA.
>
> What does that practically mean? I think we're using different
> terminology which is making it hard to communicate.
>
> By "ISA level" I mean a node of a tree of possible extensions to a
> baseline ISA, where machine code targeted for one node in the tree can
> be executed on all descendants, and can be linked with machine code
> targeting a particular descendant to produce code that can be executed
> on that descendant.
>
> This applies both statically and dynamically. For example, an
> application built for a higher ISA level can be executed with
> libc/ldso built for the baseline ISA (note: this sometimes requires
> conditional logic in a few places like setjmp; see arm for example),
> and an application built for the baseline ISA can be executed using
> libc/ldso built for a higher ISA level.
>
> I'm guessing you mean which features are supported is combinatorically
> selectable as part of the core configuration, not a linear progression
> of levels like i386, i486, etc.

That's correct. There seems to be no intention that different xtensa core
configurations were binary compatible with each other. The development
model that I know only supports compatibility between a particular
configuration created with one version of the xtensa tools and rebuilds of
that configuration with later versions of the xtensa tools. I.e. each specific
configuration is its own ISA, binary software compatibility between the
different configurations is not guaranteed. Software tools for one
configuration need not generate or understand the correct binary code
for any other configuration.

> That's no problem. If I'm mistaken,
> plese help me try to understand what you mean.
>
> > > So if there are alternate (call0/windowed?) ABIs you want
> > > to support, different names need to be defined for them here and in
> > > configure.
> >
> > I understand that it means that I should drop the endianness suffix,
> > as the endianness of each core is fixed, but add windowed/call0
> > discriminator.
>
> If there are both le and be variants, then they need to have separate
> names.

They can never exist both at the same time for any particular core
configuration. Every core has fixed endianness, there's no standard
way to generate the "other endian" code for a given core.

> Same for any other mutually incompatible ABIs.
>
> > > Unless there's a good reason to support more than one ABI,
> > > it probably just makes sense to pick the preferred one and have
> > > configure error-out if the toolchain's ABI is incompatible.
> >
> > I still have hope to support windowed FDPIC and then non-FDPIC
> > variants.
>
> Is there a reason to support both windowed and non-windowed? Sorry if
> this is a dumb question; I don't know the background on it.

Windowed is a bit more performant, and it produces more compact
code, but it may not be available if the windowed registers option was
not enabled for the particular xtensa core and it's harder to deal with
from the software point of view.

> If it's
> possible to just pick one as better, and it can be supported on all
> cores, that's probably the better choice.

That might be call0, but then all transistors spent on windowed
registers were wasted.

> > > > diff --git a/src/ldso/xtensa/tlsdesc.S b/src/ldso/xtensa/tlsdesc.S
> > > > new file mode 100644
> > > > index 00000000..084d5f3e
> > > > --- /dev/null
> > > > +++ b/src/ldso/xtensa/tlsdesc.S
> > > > @@ -0,0 +1,36 @@
> > > > +.global __tlsdesc_static
> > > > +.hidden __tlsdesc_static
> > > > +.type __tlsdesc_static,@function
> > > > +.align 4
> > > > +__tlsdesc_static:
> > > > +#ifdef __XTENSA_WINDOWED_ABI__
> > > > +     entry   a1, 16
> > > > +#endif
> > > > +     rur     a3, threadptr
> > > > +     add     a2, a2, a3
> > > > +#if defined(__XTENSA_WINDOWED_ABI__)
> > > > +     retw
> > > > +#elif defined(__XTENSA_CALL0_ABI__)
> > > > +     ret
> > > > +#else
> > > > +#error Unsupported Xtensa ABI
> > > > +#endif
> > >
> > > Are you sure the entry step makes sense with tlsdesc? Is this honoring
> > > the ABI for what the tlsdesc function is allowed to clobber?
> >
> > But the tlsdesc function is just a normal function, it can do whatever
> > a regular function can do AFAIU. In addition there are no relaxations
> > that can replace non-call code with a variant with a call.
> >
> > 'entry' is the right way to begin a regular function in windowed xtensa ABI..
>
> OK but this is not a regular function. It's a code fragment with
> special ABI constraints and is intended to execute as fast as
> possible, clobbering as little as possible.

I understand that, but I'm looking at the way it is implemented
in the xtensa gcc port and see an ordinary function call:
  https://github.com/jcmvbkbc/gcc-xtensa/blob/xtensa-14-8789-fdpic/gcc/config/xtensa/xtensa.md#L3026

> I don't understand exactly
> what "entry" does, but I suspect it only makes sense if you're
> actually introducing a "call frame" of some sort, like if you'll be
> spilling and restoring call-saved registers or something (or doing
> whatever the windowed-register-file equivalent is to achieve the same
> thing) which does not seem to be needed here.

Unfortunately windowed calls on xtensa LX can be made with three
different window sizes: 4, 8 or 12 registers, the first outgoing argument
goes in the register a6, a10 or a14 respectively. One thing that 'entry'
does is window rotation so that the first incoming argument appears
in the register a2. AFAIK there's no other way to access arguments in
the unprivileged code.

> > > > +.hidden __tls_get_new
> > > > +
> > > > +.global __tlsdesc_dynamic
> > > > +.hidden __tlsdesc_dynamic
> > > > +.type __tlsdesc_dynamic,@function
> > > > +.align 4
> > > > +__tlsdesc_dynamic:
> > > > +#if defined(__XTENSA_WINDOWED_ABI__)
> > > > +     entry   a1, 16
> > > > +     mov     a6, a2
> > > > +     call4   __tls_get_addr
> > > > +     mov     a2, a6
> > > > +     retw
> > > > +#elif defined(__XTENSA_CALL0_ABI__)
> > > > +     j       __tls_get_addr
> > > > +#else
> > > > +#error Unsupported Xtensa ABI
> > > > +#endif
> > >
> > > This approach is no longer a valid implementation for
> > > __tlsdesc_dynamic, because it calls C code which could clobber any
> > > call-clobbered registers, not just the ones the tlsdesc function is
> > > allowed to clobber.
> > >
> > > Instead, it needs to actually do the dtv lookup in asm. musl does not
> > > use any dynamic allocation here, so the lookup is guaranteed to just
> > > work using the indices found with no conditional branches. See the
> > > recently added riscv code for a clean example, in commit
> > > 407aea628af8c81d9e3f5a068568f2217db71bba.
> >
> > Yeah, this whole xtensa TLS code should be in the WIP pile.
> > I'll rewrite it in assembly.
>
> OK, just wanted you to be aware that this is something eventually
> needing rewrite.

Sure.

> > > > diff --git a/src/signal/xtensa/sigsetjmp.S b/src/signal/xtensa/sigsetjmp.S
> > > > new file mode 100644
> > > > index 00000000..e44bba88
> > > > --- /dev/null
> > > > +++ b/src/signal/xtensa/sigsetjmp.S
> > > > @@ -0,0 +1,24 @@
> > > > +.global sigsetjmp
> > > > +.global __sigsetjmp
> > > > +.type sigsetjmp,@function
> > > > +.type __sigsetjmp,@function
> > > > +.align 4
> > > > +sigsetjmp:
> > > > +__sigsetjmp:
> > > > +#if defined(__XTENSA_CALL0_ABI__)
> > > > +     bnez    a3, 1f
> > > > +.hidden ___setjmp
> > > > +     j       ___setjmp
> > > > +1:
> > > > +     mov     a3, a0
> > > > +     mov     a4, a2
> > > > +     call0   ___setjmp
> > > > +     mov     a0, a3
> > > > +     mov     a3, a2
> > > > +     mov     a2, a4
> > > > +
> > > > +.hidden __sigsetjmp_tail
> > > > +     j       __sigsetjmp_tail
> > > > +#else
> > > > +#error Unsupported Xtensa ABI
> > > > +#endif
> > >
> > > I don't think this code works. It does not seem to save the return
> > > address and argument address before calling ___setjmp, so they will be
> > > clobbered after it returns.
> >
> > It does:
> >
> > +     mov     a3, a0
> > +     mov     a4, a2
> >
> > do exactly this and ___setjmp is hidden, so it's always going to be
> > the implementation above that doesn't clobber a3 and a4.
>
> Nope, that's not how it works. setjmp returns twice, and the second
> time it returns, it's *whatever code has executed since the first
> return from setjmp, which most certainly will have clobbered
> everything but the call-saved registers.

Of course.

> The return address needs to be saved in the space past the end of the
> signal mask in the sigjmp_buf, and at least one call-saved register
> also needs to be saved there, so that you can store the address of the
> sigjmp_buf in a call-saved register and have it available after a
> subsequent return from setjmp.

I will.

> > > Eventually I'd like to move most functions like this, that are
> > > currently external asm but have no good reason to be (unlike setjmp,
> > > vfork, etc. which *do* have a good reason to be), to minimal inline
> > > asm in C. Some archs' versions of this file would even just become
> > > __syscall, no further asm.
> > >
> > > > diff --git a/src/thread/xtensa/__unmapself.S b/src/thread/xtensa/__unmapself.S
> > > > new file mode 100644
> > > > index 00000000..15bf2bdf
> > > > --- /dev/null
> > > > +++ b/src/thread/xtensa/__unmapself.S
> > > > @@ -0,0 +1,13 @@
> > > > +.global __unmapself
> > > > +.type   __unmapself,@function
> > > > +.align 4
> > > > +__unmapself:
> > > > +#if defined(__XTENSA_CALL0_ABI__)
> > > > +     mov     a6, a2
> > > > +     movi    a2, 81 # SYS_munmap
> > > > +     syscall
> > > > +     movi    a2, 118 # SYS_exit
> > > > +     syscall
> > > > +#else
> > > > +#error Unsupported Xtensa ABI
> > > > +#endif
> > >
> > > This is unsafe on fdpic/nommu if the kernel requires the task to
> > > always have a valid stack.
> >
> > How do I know if the kernel requires that?
>
> I don't think it's explicitly documented. For sh I was vaguely
> familiar at the ISA level that the sh1/sh2 cpu just pushes stuff to
> the (existing, userspace) stack pointer on interrupts, which would
> necessitate always having a valid stack.

Oh, I know that part well, xtensa doesn't have anything like that.

> I also read the entry.S code
> (or whatever file this part was split off into) for how kernelspace is
> entered from traps/interrupts on nommu hardware, and saw how it
> shuffled things around to make it look to the kernel more like a
> conventional mmu-ful entry to kernelspace.
>
> Maybe xtensa is better about this and always invokes the
> trap/interrupt handler with the saved state in special registers
> rather than pushing it to the stack. If so, this might not be a
> consideration; normally, Linux switches the stack pointer to the
> kernel stack before actually doing anything with it.

Yep, that's what we do.

> > > > >From f8c5953ecdb948282cc8e573b729c25db60a95a8 Mon Sep 17 00:00:00 2001
> > > > From: Max Filippov <jcmvbkbc@gmail.com>
> > > > Date: Wed, 21 Feb 2024 08:27:37 -0800
> > > > Subject: [PATCH 2/2] WIP
> > > >
> > > > Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
> > > > ---
> > > >  ldso/dlstart.c         | 7 +++++++
> > > >  ldso/dynlink.c         | 6 ++++--
> > > >  src/internal/dynlink.h | 5 +++--
> > > >  3 files changed, 14 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/ldso/dlstart.c b/ldso/dlstart.c
> > > > index 259f5e18..beca953f 100644
> > > > --- a/ldso/dlstart.c
> > > > +++ b/ldso/dlstart.c
> > > > @@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
> > > >                               - segs[rel_addr[1]].p_vaddr
> > > >                               + syms[R_SYM(rel[1])].st_value;
> > > >                       rel_addr[1] = dyn[DT_PLTGOT];
> > > > +             } else if (R_TYPE(rel[1]) == REL_RELATIVE) {
> > > > +                     size_t val = *rel_addr;
> > > > +                     for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > > > +                     *rel_addr += segs[j].addr - segs[j].p_vaddr;
> > >
> > > So xtensa has a "relative" reloc type that's just adjusted by the load
> > > offset of the segment the relocation lives in, rather than needing to
> > > use a symbolic relocation referencing a section symbol like other
> > > fdpic archs do?
> >
> > I was looking at the ARM BFD code while doing that and
> > my impression was that they do the same.
> > Regardless, I wonder why either relocation form might be preferrable?
>
> I think the relative type here is perfectly acceptable. At first I
> thought it was weaker (less capable of representing addresses of
> objects in different segments), but looking again, I don't think
> that's the case.

Ok.

> > > If so, this looks right and looks ok.
> > >
> > > >               } else {
> > > >                       size_t val = syms[R_SYM(rel[1])].st_value;
> > > >                       for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > > >                       *rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
> > > >               }
> > > >       }
> > > > +#ifdef __xtensa__
> > > > +     ((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
> > > > +#endif
> > >
> > > Is this actually needed for anything? Generally musl doesn't use the
> > > reserved GOT slots itself, and on all the other archs I'm aware of,
> > > they're essentially reserved to the dynamic linker implementation so
> > > the dynamic linker is just free not to use them and not to set them
> > > up.
> >
> > xtensa doesn't have relative register jumps and calls, so local jumps
> > and calls to a far off locations need to use absolute target addresses.
> > One possible solution is to have the address in the GOT entry, the
> > other is to calculate the target address using the text segment load
> > offset at runtime. Both have the same instruction count, see
> >   http://wiki.osll.ru/doku.php/etc:users:jcmvbkbc:binutils-xtensa#local_call
> > for the details, but the latter doesn't waste GOT space and that saves
> > a noticeable amount of RAM.
>
> I see. Doesn't this limit you to a single text segment, though?

It does, yes. Also there's a hardcoded 0 as a load map index for
the text segment.

> In practice it might not matter, but it's more constraining than fdpic
> was designed to be.

I guess I could add a compiler switch that would choose how it's done,
but it seems that for a noMMU linux application a single text segment
is a reasonable expectation.

> Is there a reason local, relative jumps/calls can't be done by
> computing the address of a nearby label and using the offset relative
> to that? That's how they (at least some types; I forget the details)
> work on sh/fdpic.

IIUC that would require the actual PC knowledge and there's no direct
access to the PC on xtensa.

> > > >  #else
> > > >       /* If the dynamic linker is invoked as a command, its load
> > > >        * address is not available in the aux vector. Instead, compute
> > > > diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> > > > index ceca3c98..25563af3 100644
> > > > --- a/ldso/dynlink.c
> > > > +++ b/ldso/dynlink.c
> > > > @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
> > > >               if (!DL_FDPIC)
> > > >                       do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
> > > >
> > > > +#if 0
> > > >               if (head != &ldso && p->relro_start != p->relro_end) {
> > > >                       long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
> > > >                               p->relro_end-p->relro_start, PROT_READ);
> > > > @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
> > > >                               if (runtime) longjmp(*rtld_fail, 1);
> > > >                       }
> > > >               }
> > > > +#endif
> > >
> > > Was this breaking something? Relro linking probably should have been
> > > disabled on fdpic, and we ignore ENOSYS anyway for nommu.
> >
> > Yeah, I do some of the testing in linux-user QEMU, it has MMU
> > and mprotect calls actually succeed. Failures were coming from the
> > relocation code, but my impression was that relro should be applied
> > after the relocation completion and it should just work, hence the
> > WIP pile.
>
> Yes, relro is only supposed to be applied after all relocations were
> done.

Yeah, I'll have a look.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 21:28       ` Max Filippov
@ 2024-02-29 12:03         ` Max Filippov
  2024-02-29 15:35           ` Rich Felker
  0 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-29 12:03 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 1:28 PM Max Filippov <jcmvbkbc@gmail.com> wrote:
> On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> > Is there a reason local, relative jumps/calls can't be done by
> > computing the address of a nearby label and using the offset relative
> > to that? That's how they (at least some types; I forget the details)
> > work on sh/fdpic.
>
> IIUC that would require the actual PC knowledge and there's no direct
> access to the PC on xtensa.

I thought about it for a bit and although the current PC can easily
be fetched I don't see how it can work for multiple text segments
without a GOT entry. OTOH a link-time conversion of a call without
GOT to a call with GOT in case the target and the call site are in
the different text segments seems feasible.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-29 12:03         ` Max Filippov
@ 2024-02-29 15:35           ` Rich Felker
  0 siblings, 0 replies; 17+ messages in thread
From: Rich Felker @ 2024-02-29 15:35 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Thu, Feb 29, 2024 at 04:03:02AM -0800, Max Filippov wrote:
> On Wed, Feb 28, 2024 at 1:28 PM Max Filippov <jcmvbkbc@gmail.com> wrote:
> > On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> > > Is there a reason local, relative jumps/calls can't be done by
> > > computing the address of a nearby label and using the offset relative
> > > to that? That's how they (at least some types; I forget the details)
> > > work on sh/fdpic.
> >
> > IIUC that would require the actual PC knowledge and there's no direct
> > access to the PC on xtensa.
> 
> I thought about it for a bit and although the current PC can easily
> be fetched I don't see how it can work for multiple text segments
> without a GOT entry. OTOH a link-time conversion of a call without
> GOT to a call with GOT in case the target and the call site are in
> the different text segments seems feasible.

With multiple text segments, a cross-segment call cannot be relative
to begin with. Because fdpic lets them float independently, it would
have to go thru the GOT.

Rather than having the linker convert calls *to* using GOT, I think
the safe way to do things would be to have the compiler emit calls
thru the GOT but linker relax them to avoid the GOT when they're in
the same segment. But there's no reason this has to be done initially.
I'm just thinking in terms of making sure the design doesn't preclude
or make it difficult to do multiple segments in the future if desired.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:30     ` Rich Felker
  2024-02-28 18:37       ` Rich Felker
  2024-02-28 21:28       ` Max Filippov
@ 2024-02-29 16:25       ` Max Filippov
  2024-02-29 18:16         ` Rich Felker
  2024-03-19 15:25       ` Max Filippov
  2024-03-19 16:08       ` Max Filippov
  4 siblings, 1 reply; 17+ messages in thread
From: Max Filippov @ 2024-02-29 16:25 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > > diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> > > > index ceca3c98..25563af3 100644
> > > > --- a/ldso/dynlink.c
> > > > +++ b/ldso/dynlink.c
> > > > @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
> > > >               if (!DL_FDPIC)
> > > >                       do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
> > > >
> > > > +#if 0
> > > >               if (head != &ldso && p->relro_start != p->relro_end) {
> > > >                       long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
> > > >                               p->relro_end-p->relro_start, PROT_READ);
> > > > @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
> > > >                               if (runtime) longjmp(*rtld_fail, 1);
> > > >                       }
> > > >               }
> > > > +#endif
> > >
> > > Was this breaking something? Relro linking probably should have been
> > > disabled on fdpic, and we ignore ENOSYS anyway for nommu.
> >
> > Yeah, I do some of the testing in linux-user QEMU, it has MMU
> > and mprotect calls actually succeed. Failures were coming from the
> > relocation code, but my impression was that relro should be applied
> > after the relocation completion and it should just work, hence the
> > WIP pile.
>
> Yes, relro is only supposed to be applied after all relocations were
> done.

So I've found two issues with this. First is that loadmap entries generated
by the QEMU (and by the linux kernel AFAICS) are not page-aligned,
but the relro segment addresses fetched by the loader are. Since the
laddr() doesn't check that a translation for the address it was given
was actually found the results are not always correct, mprotect fails
and it all terminates early.

With a workaround for that part in place I see that relro protection is
applied to the executable image before its __fdpic_fixup had a chance
to run, there are rofixups for the .init_array and .fini_array, but they both
are a part of the relro segment.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-29 16:25       ` Max Filippov
@ 2024-02-29 18:16         ` Rich Felker
  0 siblings, 0 replies; 17+ messages in thread
From: Rich Felker @ 2024-02-29 18:16 UTC (permalink / raw)
  To: Max Filippov; +Cc: musl

On Thu, Feb 29, 2024 at 08:25:12AM -0800, Max Filippov wrote:
> On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> > On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > > > > diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> > > > > index ceca3c98..25563af3 100644
> > > > > --- a/ldso/dynlink.c
> > > > > +++ b/ldso/dynlink.c
> > > > > @@ -1420,6 +1420,7 @@ static void reloc_all(struct dso *p)
> > > > >               if (!DL_FDPIC)
> > > > >                       do_relr_relocs(p, laddr(p, dyn[DT_RELR]), dyn[DT_RELRSZ]);
> > > > >
> > > > > +#if 0
> > > > >               if (head != &ldso && p->relro_start != p->relro_end) {
> > > > >                       long ret = __syscall(SYS_mprotect, laddr(p, p->relro_start),
> > > > >                               p->relro_end-p->relro_start, PROT_READ);
> > > > > @@ -1429,6 +1430,7 @@ static void reloc_all(struct dso *p)
> > > > >                               if (runtime) longjmp(*rtld_fail, 1);
> > > > >                       }
> > > > >               }
> > > > > +#endif
> > > >
> > > > Was this breaking something? Relro linking probably should have been
> > > > disabled on fdpic, and we ignore ENOSYS anyway for nommu.
> > >
> > > Yeah, I do some of the testing in linux-user QEMU, it has MMU
> > > and mprotect calls actually succeed. Failures were coming from the
> > > relocation code, but my impression was that relro should be applied
> > > after the relocation completion and it should just work, hence the
> > > WIP pile.
> >
> > Yes, relro is only supposed to be applied after all relocations were
> > done.
> 
> So I've found two issues with this. First is that loadmap entries generated
> by the QEMU (and by the linux kernel AFAICS) are not page-aligned,
> but the relro segment addresses fetched by the loader are. Since the
> laddr() doesn't check that a translation for the address it was given
> was actually found the results are not always correct, mprotect fails
> and it all terminates early.
> 
> With a workaround for that part in place I see that relro protection is
> applied to the executable image before its __fdpic_fixup had a chance
> to run, there are rofixups for the .init_array and .fini_array, but they both
> are a part of the relro segment.

OK, in that case, we probably should disable relro with fdpic until
it's fixed to work properly.

I think relro processing for the main executable should probably be
moved to run as the first step of __libc_start_init, so that it
happens after __fdpic_fixup has run. We also need to either make
relro_start and relro_end maintain the exact addresses and only get
expanded to full-page just before calling mprotect, or use laddr_pg to
get the containing page.

Unrelated to fdpic, there may also be an issue that relro processing
has a broken dependency on runtime-variable pagesize. In practice it's
probaby okay since the binary must have been linked for maxpagesize >=
PAGE_SIZE, but we should probably double-check if the current
alignment logic is semantically correct.

Rich

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:30     ` Rich Felker
                         ` (2 preceding siblings ...)
  2024-02-29 16:25       ` Max Filippov
@ 2024-03-19 15:25       ` Max Filippov
  2024-03-19 16:08       ` Max Filippov
  4 siblings, 0 replies; 17+ messages in thread
From: Max Filippov @ 2024-03-19 15:25 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > On Tue, Feb 27, 2024 at 4:12 PM Rich Felker <dalias@libc.org> wrote:
> > > > diff --git a/ldso/dlstart.c b/ldso/dlstart.c
> > > > index 259f5e18..beca953f 100644
> > > > --- a/ldso/dlstart.c
> > > > +++ b/ldso/dlstart.c
> > > > @@ -90,12 +90,19 @@ hidden void _dlstart_c(size_t *sp, size_t *dynv)
> > > >                               - segs[rel_addr[1]].p_vaddr
> > > >                               + syms[R_SYM(rel[1])].st_value;
> > > >                       rel_addr[1] = dyn[DT_PLTGOT];
> > > > +             } else if (R_TYPE(rel[1]) == REL_RELATIVE) {
> > > > +                     size_t val = *rel_addr;
> > > > +                     for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > > > +                     *rel_addr += segs[j].addr - segs[j].p_vaddr;
> > >
> > > So xtensa has a "relative" reloc type that's just adjusted by the load
> > > offset of the segment the relocation lives in, rather than needing to
> > > use a symbolic relocation referencing a section symbol like other
> > > fdpic archs do?
> >
> > I was looking at the ARM BFD code while doing that and
> > my impression was that they do the same.
> > Regardless, I wonder why either relocation form might be preferable?
>
> I think the relative type here is perfectly acceptable. At first I
> thought it was weaker (less capable of representing addresses of
> objects in different segments), but looking again, I don't think
> that's the case.

I found that this treatment of the REL_RELATIVE is not consistent
with the REL_RELATIVE treatment in do_relocs(), where RELA type
relocation is expected to have an addend, and only the addend
matters, not the initial value of the field being relocated.
I just realized that looking at what ARM does might not be a good
idea for the xtensa port, as ARM uses the REL type relocations and
xtensa uses RELA.

Also do_relocs() does not use the DSO load map, so the following
change is required for it to work in the FDPIC case:
               case REL_RELATIVE:
-                       *reloc_addr = (size_t)base + addend;
+                       *reloc_addr = (size_t)laddr(dso, addend);
                       break;

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [musl] Initial xtensa/fdpic port review
  2024-02-28 18:30     ` Rich Felker
                         ` (3 preceding siblings ...)
  2024-03-19 15:25       ` Max Filippov
@ 2024-03-19 16:08       ` Max Filippov
  4 siblings, 0 replies; 17+ messages in thread
From: Max Filippov @ 2024-03-19 16:08 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Wed, Feb 28, 2024 at 10:30 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Feb 28, 2024 at 09:20:33AM -0800, Max Filippov wrote:
> > On Tue, Feb 27, 2024 at 4:12 PM Rich Felker <dalias@libc.org> wrote:
> > > >               } else {
> > > >                       size_t val = syms[R_SYM(rel[1])].st_value;
> > > >                       for (j=0; val-segs[j].p_vaddr >= segs[j].p_memsz; j++);
> > > >                       *rel_addr = rel[2] + segs[j].addr - segs[j].p_vaddr + val;
> > > >               }
> > > >       }
> > > > +#ifdef __xtensa__
> > > > +     ((unsigned long *)dyn[DT_PLTGOT])[3] = segs[0].addr - segs[0].p_vaddr;
> > > > +#endif
> > >
> > > Is this actually needed for anything? Generally musl doesn't use the
> > > reserved GOT slots itself, and on all the other archs I'm aware of,
> > > they're essentially reserved to the dynamic linker implementation so
> > > the dynamic linker is just free not to use them and not to set them
> > > up.
> >
> > xtensa doesn't have relative register jumps and calls, so local jumps
> > and calls to a far off locations need to use absolute target addresses.
> > One possible solution is to have the address in the GOT entry, the
> > other is to calculate the target address using the text segment load
> > offset at runtime. Both have the same instruction count, see
> >   http://wiki.osll.ru/doku.php/etc:users:jcmvbkbc:binutils-xtensa#local_call
> > for the details, but the latter doesn't waste GOT space and that saves
> > a noticeable amount of RAM.
>
> I see. Doesn't this limit you to a single text segment, though? In
> practice it might not matter, but it's more constraining than fdpic
> was designed to be.

Instead of a fixed dedicated GOT entry there can be multiple
entries, one per independent text segment, giving the maximum of
1024 / 4 - 3 = 253 text segments addressable with this technique.
Instead of generating a fixed load from GOT + 12 there would be a
relocation against that load instruction mentioning the target symbol
and the linker would have a chance to allocate GOT space, fix up
offsets in these instructions and add dynamic relocations against
these GOT entries that would produce runtime load offsets for the
corresponding text segments.

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2024-03-19 16:08 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-27 23:24 [musl] Initial xtensa/fdpic port review Rich Felker
2024-02-28  0:13 ` Rich Felker
2024-02-28 17:20   ` Max Filippov
2024-02-28 18:30     ` Rich Felker
2024-02-28 18:37       ` Rich Felker
2024-02-28 19:34         ` Max Filippov
2024-02-28 19:41           ` Max Filippov
2024-02-28 20:14             ` Rich Felker
2024-02-28 20:26               ` Rich Felker
2024-02-28 20:37                 ` Rich Felker
2024-02-28 21:28       ` Max Filippov
2024-02-29 12:03         ` Max Filippov
2024-02-29 15:35           ` Rich Felker
2024-02-29 16:25       ` Max Filippov
2024-02-29 18:16         ` Rich Felker
2024-03-19 15:25       ` Max Filippov
2024-03-19 16:08       ` Max Filippov

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).