mailing list of musl libc
* [musl] [PATCH] mallocng: Add MTE support for Aarch64
@ 2024-06-10 12:36 Stefan Jumarea
  2024-06-10 15:59 ` Rich Felker
  2024-06-11 14:09 ` Szabolcs Nagy
  0 siblings, 2 replies; 8+ messages in thread
From: Stefan Jumarea @ 2024-06-10 12:36 UTC (permalink / raw)
  To: musl; +Cc: dalias, Stefan Jumarea

Add support for Memory Tagging Extension.

All the memory tagging code is placed within compiler guards, and is
enabled by using the `--enable-mte` configure option.
The option can only be used if compiling for Aarch64.

All the primitives for generating, storing and loading the memory tags
are placed in a new header under `arch/aarch64/`.

For now, only the actual user data is tagged. All metadata is untagged.

Signed-off-by: Stefan Jumarea <stefanjumarea02@gmail.com>
---
 arch/aarch64/mte.h                  | 44 ++++++++++++++++++++++++++++
 configure                           | 17 +++++++++++
 src/malloc/mallocng/aligned_alloc.c | 24 +++++++++------
 src/malloc/mallocng/free.c          | 28 ++++++++++++++----
 src/malloc/mallocng/malloc.c        | 18 ++++++++++++
 src/malloc/mallocng/meta.h          | 45 ++++++++++++++++++++---------
 src/malloc/mallocng/realloc.c       | 34 ++++++++++++++++++----
 7 files changed, 178 insertions(+), 32 deletions(-)
 create mode 100644 arch/aarch64/mte.h

diff --git a/arch/aarch64/mte.h b/arch/aarch64/mte.h
new file mode 100644
index 00000000..35148381
--- /dev/null
+++ b/arch/aarch64/mte.h
@@ -0,0 +1,44 @@
+#include <stdint.h>
+
+#define MTE_TAG_GRANULE		16
+#define MTE_TAG_MASK		(0xFULL << 56)
+
+static inline uint64_t mte_load_tag(uint64_t addr)
+{
+	uint64_t tag;
+
+	__asm__ __volatile__ ("ldg	%x0, [%x1]\n"
+			      : "=&r"(tag) : "r"(addr));
+
+	return tag;
+}
+
+static inline void mte_store_tag(uint64_t addr)
+{
+	__asm__ __volatile__ ("stg	%0, [%0]"
+			      : : "r"(addr) : "memory");
+}
+
+static inline void mte_store_zero_tag(uint64_t addr)
+{
+	__asm__ __volatile__ ("stzg	%x0, [%x0]"
+			      : : "r"(addr) : "memory");
+}
+
+static inline uint64_t mte_get_exclude_mask(uint64_t addr)
+{
+	uint64_t reg;
+
+	__asm__ __volatile__("gmi	%x0, %x1, xzr\n"
+			     : "=r"(reg) : "r" (addr));
+	return reg;
+}
+
+static inline uint64_t mte_insert_random_tag(uint64_t addr, uint64_t mask)
+{
+	uint64_t reg;
+
+	__asm__ __volatile__("irg	%x0, %x2, %x1\n"
+			     : "=r"(reg) : "r" (mask), "r" (addr));
+	return reg;
+}
diff --git a/configure b/configure
index bc9fbe48..edcd4911 100755
--- a/configure
+++ b/configure
@@ -34,6 +34,8 @@ Optional features:
   --enable-wrapper=...    build given musl toolchain wrapper [auto]
   --disable-shared        inhibit building shared library [enabled]
   --disable-static        inhibit building static library [enabled]
+  --enable-mte            build with MTE support [disabled]
+                          only available for aarch64 and mallocng
 
 Optional packages:
   --with-malloc=...       choose malloc implementation [mallocng]
@@ -139,6 +141,7 @@ debug=no
 warnings=yes
 shared=auto
 static=yes
+mte=no
 wrapper=auto
 gcc_wrapper=no
 clang_wrapper=no
@@ -158,6 +161,8 @@ case "$arg" in
 --disable-shared|--enable-shared=no) shared=no ;;
 --enable-static|--enable-static=yes) static=yes ;;
 --disable-static|--enable-static=no) static=no ;;
+--enable-mte|--enable-mte=yes) mte=yes ;;
+--disable-mte|--enable-mte=no) mte=no ;;
 --enable-optimize) optimize=yes ;;
 --enable-optimize=*) optimize=${arg#*=} ;;
 --disable-optimize) optimize=no ;;
@@ -790,6 +795,18 @@ if trycppif "__FAST_MATH__" \
 fail "$0: error: compiler has broken floating point; check CFLAGS"
 fi
 
+if test "$mte" = "yes" ; then
+	printf "Checking whether target architecture supports MTE... "
+	if test "$ARCH" != "aarch64"; then
+		printf "no\n"
+		fail "$0: error: mte only supported with aarch64"
+	fi
+
+	printf "yes\n"
+	CFLAGS_AUTO="$CFLAGS_AUTO -DMEMTAG -march=armv8.5-a+memtag"
+	SUBARCH=${SUBARCH}+memtag
+fi
+
 printf "creating config.mak... "
 
 cmdline=$(quote "$0")
diff --git a/src/malloc/mallocng/aligned_alloc.c b/src/malloc/mallocng/aligned_alloc.c
index e0862a83..2205f6bb 100644
--- a/src/malloc/mallocng/aligned_alloc.c
+++ b/src/malloc/mallocng/aligned_alloc.c
@@ -25,31 +25,37 @@ void *aligned_alloc(size_t align, size_t len)
 	if (!p)
 		return 0;
 
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = g->mem->storage + stride*(idx+1) - IB;
 	size_t adj = -(uintptr_t)p & (align-1);
 
 	if (!adj) {
-		set_size(p, end, len);
+		set_size(untagged, end, len);
 		return p;
 	}
 	p += adj;
+	untagged += adj;
 	uint32_t offset = (size_t)(p-g->mem->storage)/UNIT;
 	if (offset <= 0xffff) {
-		*(uint16_t *)(p-2) = offset;
-		p[-4] = 0;
+		*(uint16_t *)(untagged-2) = offset;
+		untagged[-4] = 0;
 	} else {
 		// use a 32-bit offset if 16-bit doesn't fit. for this,
 		// 16-bit field must be zero, [-4] byte nonzero.
-		*(uint16_t *)(p-2) = 0;
-		*(uint32_t *)(p-8) = offset;
-		p[-4] = 1;
+		*(uint16_t *)(untagged-2) = 0;
+		*(uint32_t *)(untagged-8) = offset;
+		untagged[-4] = 1;
 	}
-	p[-3] = idx;
-	set_size(p, end, len);
+	untagged[-3] = idx;
+	set_size(untagged, end, len);
 	// store offset to aligned enframing. this facilitates cycling
 	// offset and also iteration of heap for debugging/measurement.
 	// for extreme overalignment it won't fit but these are classless
diff --git a/src/malloc/mallocng/free.c b/src/malloc/mallocng/free.c
index 43f32aad..1a86c8eb 100644
--- a/src/malloc/mallocng/free.c
+++ b/src/malloc/mallocng/free.c
@@ -25,8 +25,13 @@ static struct mapinfo free_group(struct meta *g)
 		mi.len = g->maplen*4096UL;
 	} else {
 		void *p = g->mem;
+#ifdef MEMTAG
+		unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+		unsigned char *untagged = p;
+#endif
 		struct meta *m = get_meta(p);
-		int idx = get_slot_index(p);
+		int idx = get_slot_index(untagged);
 		g->mem->meta = 0;
 		// not checking size/reserved here; it's intentionally invalid
 		mi = nontrivial_free(m, idx);
@@ -102,17 +107,30 @@ void free(void *p)
 {
 	if (!p) return;
 
+#ifdef MEMTAG
+	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	void *untagged = p;
+#endif
+
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = start + stride - IB;
-	get_nominal_size(p, end);
+#ifdef MEMTAG
+	size_t nom_size = get_nominal_size(untagged, end);
+#endif
 	uint32_t self = 1u<<idx, all = (2u<<g->last_idx)-1;
-	((unsigned char *)p)[-3] = 255;
+	((unsigned char *)untagged)[-3] = 255;
 	// invalidate offset to group header, and cycle offset of
 	// used region within slot if current offset is zero.
-	*(uint16_t *)((char *)p-2) = 0;
+	*(uint16_t *)((char *)untagged-2) = 0;
+
+#ifdef MEMTAG
+	for (size_t i = 0; i < nom_size; i += 16)
+		mte_store_tag((uint64_t)((unsigned char *)untagged + i));
+#endif
 
 	// release any whole pages contained in the slot to be freed
 	// unless it's a single-slot group that will be unmapped.
diff --git a/src/malloc/mallocng/malloc.c b/src/malloc/mallocng/malloc.c
index d695ab8e..89294526 100644
--- a/src/malloc/mallocng/malloc.c
+++ b/src/malloc/mallocng/malloc.c
@@ -298,6 +298,8 @@ static int alloc_slot(int sc, size_t req)
 
 void *malloc(size_t n)
 {
+	n = ALIGN_UP(n, 16);
+
 	if (size_overflows(n)) return 0;
 	struct meta *g;
 	uint32_t mask, first;
@@ -310,6 +312,9 @@ void *malloc(size_t n)
 		void *p = mmap(0, needed, PROT_READ|PROT_WRITE,
 			MAP_PRIVATE|MAP_ANON, -1, 0);
 		if (p==MAP_FAILED) return 0;
+
+
+
 		wrlock();
 		step_seq();
 		g = alloc_meta();
@@ -376,7 +381,20 @@ void *malloc(size_t n)
 success:
 	ctr = ctx.mmap_counter;
 	unlock();
+
+#ifdef MEMTAG
+	void *ptr = enframe(g, idx, n, ctr);
+
+	uint64_t mask_mte = mte_get_exclude_mask((uint64_t)ptr);
+	uint64_t addr = mte_insert_random_tag((uint64_t)ptr, mask_mte);
+
+	for (size_t i = 0; i < n; i += 16)
+		mte_store_tag(addr + i);
+
+	return (void *)addr;
+#else
 	return enframe(g, idx, n, ctr);
+#endif
 }
 
 int is_allzero(void *p)
diff --git a/src/malloc/mallocng/meta.h b/src/malloc/mallocng/meta.h
index 61ec53f9..f5896fe4 100644
--- a/src/malloc/mallocng/meta.h
+++ b/src/malloc/mallocng/meta.h
@@ -4,6 +4,9 @@
 #include <stdint.h>
 #include <errno.h>
 #include <limits.h>
+#ifdef MEMTAG
+#include <mte.h>
+#endif
 #include "glue.h"
 
 __attribute__((__visibility__("hidden")))
@@ -14,6 +17,10 @@ extern const uint16_t size_classes[];
 #define UNIT 16
 #define IB 4
 
+#ifndef ALIGN_UP
+#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
+#endif
+
 struct group {
 	struct meta *meta;
 	unsigned char active_idx:5;
@@ -129,14 +136,19 @@ static inline int get_slot_index(const unsigned char *p)
 static inline struct meta *get_meta(const unsigned char *p)
 {
 	assert(!((uintptr_t)p & 15));
-	int offset = *(const uint16_t *)(p - 2);
-	int index = get_slot_index(p);
-	if (p[-4]) {
+#ifdef MEMTAG
+	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	const unsigned char *untagged = p;
+#endif
+	int offset = *(const uint16_t *)(untagged - 2);
+	int index = get_slot_index(untagged);
+	if (untagged[-4]) {
 		assert(!offset);
-		offset = *(uint32_t *)(p - 8);
+		offset = *(uint32_t *)(untagged - 8);
 		assert(offset > 0xffff);
 	}
-	const struct group *base = (const void *)(p - UNIT*offset - UNIT);
+	const struct group *base = (const void *)(untagged - UNIT*offset - UNIT);
 	const struct meta *meta = base->meta;
 	assert(meta->mem == base);
 	assert(index <= meta->last_idx);
@@ -199,10 +211,15 @@ static inline void *enframe(struct meta *g, int idx, size_t n, int ctr)
 	size_t slack = (stride-IB-n)/UNIT;
 	unsigned char *p = g->mem->storage + stride*idx;
 	unsigned char *end = p+stride-IB;
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	// cycle offset within slot to increase interval to address
 	// reuse, facilitate trapping double-free.
-	int off = (p[-3] ? *(uint16_t *)(p-2) + 1 : ctr) & 255;
-	assert(!p[-4]);
+	int off = (untagged[-3] ? *(uint16_t *)(untagged-2) + 1 : ctr) & 255;
+	assert(!untagged[-4]);
 	if (off > slack) {
 		size_t m = slack;
 		m |= m>>1; m |= m>>2; m |= m>>4;
@@ -213,16 +230,18 @@ static inline void *enframe(struct meta *g, int idx, size_t n, int ctr)
 	if (off) {
 		// store offset in unused header at offset zero
 		// if enframing at non-zero offset.
-		*(uint16_t *)(p-2) = off;
-		p[-3] = 7<<5;
+		*(uint16_t *)(untagged-2) = off;
+		untagged[-3] = 7<<5;
 		p += UNIT*off;
+		untagged += UNIT*off;
 		// for nonzero offset there is no permanent check
 		// byte, so make one.
-		p[-4] = 0;
+		untagged[-4] = 0;
 	}
-	*(uint16_t *)(p-2) = (size_t)(p-g->mem->storage)/UNIT;
-	p[-3] = idx;
-	set_size(p, end, n);
+	*(uint16_t *)(untagged-2) = (size_t)(untagged-g->mem->storage)/UNIT;
+	untagged[-3] = idx;
+	set_size(untagged, end, n);
+
 	return p;
 }
 
diff --git a/src/malloc/mallocng/realloc.c b/src/malloc/mallocng/realloc.c
index 18769f42..0fab0df7 100644
--- a/src/malloc/mallocng/realloc.c
+++ b/src/malloc/mallocng/realloc.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <stdint.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <string.h>
@@ -6,23 +7,46 @@
 
 void *realloc(void *p, size_t n)
 {
+	n = ALIGN_UP(n, 16);
 	if (!p) return malloc(n);
 	if (size_overflows(n)) return 0;
 
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = start + stride - IB;
-	size_t old_size = get_nominal_size(p, end);
-	size_t avail_size = end-(unsigned char *)p;
+	size_t old_size = get_nominal_size(untagged, end);
+	size_t avail_size = end-(unsigned char *)untagged;
 	void *new;
 
 	// only resize in-place if size class matches
 	if (n <= avail_size && n<MMAP_THRESHOLD
 	    && size_to_class(n)+1 >= g->sizeclass) {
-		set_size(p, end, n);
-		return p;
+
+		uint64_t addr;
+
+#ifdef MEMTAG
+		for (size_t i = 0; i < old_size; i += 16)
+			mte_store_tag((uint64_t)(untagged + i));
+
+		uint64_t mask_mte = mte_get_exclude_mask((uint64_t)p);
+		addr = mte_insert_random_tag((uint64_t)p, mask_mte);
+
+		for (size_t i = 0; i < n; i += 16)
+			mte_store_tag(addr + i);
+#else
+		addr = (uint64_t)p;
+#endif
+
+		set_size(untagged, end, n);
+
+		return (void *)addr;
 	}
 
 	// use mremap if old and new size are both mmap-worthy
-- 
2.43.0



* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-10 12:36 [musl] [PATCH] mallocng: Add MTE support for Aarch64 Stefan Jumarea
@ 2024-06-10 15:59 ` Rich Felker
  2024-06-11 14:09 ` Szabolcs Nagy
  1 sibling, 0 replies; 8+ messages in thread
From: Rich Felker @ 2024-06-10 15:59 UTC (permalink / raw)
  To: Stefan Jumarea; +Cc: musl

On Mon, Jun 10, 2024 at 03:36:25PM +0300, Stefan Jumarea wrote:
> Add support for Memory Tagging Extension.
> 
> All the memory tagging code is placed within compiler guards, and is
> enabled by using the `--enable-mte` configure option.
> The option can only be used if compiling for Aarch64.
> 
> All the primitives for generating, storing and loading the memory tags
> are placed in a new header under `arch/aarch64/`.
> 
> For now, only the actual user data is tagged. All metadata is untagged.

It's exciting to see a (presumably working) implementation of this --
mallocng was designed with the intent to support things like MTE. I
haven't read it in detail but I think to be acceptable upstream it
will need some adjustments. One very minor thing is that all the
uint64_t should be uintptr_t. But it looks like there's a lot of
repetition of the pattern of getting the untagged pointer, where in
many cases, it could be done in the function the pointer is passed to
(things like get_meta).

I'll follow up with more detailed review later.

Rich


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-10 12:36 [musl] [PATCH] mallocng: Add MTE support for Aarch64 Stefan Jumarea
  2024-06-10 15:59 ` Rich Felker
@ 2024-06-11 14:09 ` Szabolcs Nagy
  2024-06-11 14:46   ` Rich Felker
  1 sibling, 1 reply; 8+ messages in thread
From: Szabolcs Nagy @ 2024-06-11 14:09 UTC (permalink / raw)
  To: Stefan Jumarea; +Cc: musl, dalias

* Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> @@ -102,17 +107,30 @@ void free(void *p)
>  {
>  	if (!p) return;
>  
> +#ifdef MEMTAG
> +	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
> +#else
> +	void *untagged = p;
> +#endif
> +
>  	struct meta *g = get_meta(p);
...
>  static inline struct meta *get_meta(const unsigned char *p)
>  {
>  	assert(!((uintptr_t)p & 15));
> -	int offset = *(const uint16_t *)(p - 2);
> -	int index = get_slot_index(p);
> -	if (p[-4]) {
> +#ifdef MEMTAG
> +	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
> +#else
> +	const unsigned char *untagged = p;
> +#endif

if free just drops the tag, then an incorrectly tagged pointer
will not be detected. musl does some use-after-free checks,
so i don't know how important this is, but i'd check that the
passed pointer matches the memory tag (unless 0 sized) and
that the tag is non-0, otherwise a forged pointer may cause
corruption.
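
something like the following is what i have in mind (untested
sketch; the helper name is made up and the trap would really be
whatever crash path mallocng already uses, not __builtin_trap):

static inline void mte_check_tag(const void *p)
{
	uintptr_t a = (uintptr_t)p;
	uintptr_t t = a;
	/* ldg merges the allocation tag of the granule into bits
	   56..59 of the destination register, so seed it with the
	   pointer itself and compare against what the caller passed */
	__asm__ ("ldg	%x0, [%x0]" : "+r"(t));
	if (t != a) __builtin_trap();
}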

i don't see where you enable tagged pointer abi and checks
(prctl) or where you add PROT_MTE to the mmapped memory.
https://www.kernel.org/doc/html/latest/arch/arm64/memory-tagging-extension.html
https://www.kernel.org/doc/html/latest/arch/arm64/tagged-address-abi.html
(i think we want the synchronous tag check option.)
note there is software that assumes page granularity for
memory protection e.g. does read access at p&-4096, and
there may be software that assumes the top bits of a valid
pointer are 0, so unconditionally enabling tagging and tag
checks can be an issue.
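
roughly what i'd expect (untested sketch; the PR_* values are the
ones from linux/prctl.h and the include mask allows tags 1..15 for
irg, as in the kernel docs; PROT_MTE is arm64-only):

#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20
#endif
#ifndef PR_MTE_TCF_SYNC
#define PR_MTE_TCF_SYNC  (1UL << 1)
#define PR_MTE_TAG_SHIFT 3
#endif

/* once at startup: opt in to the tagged-address abi with
   synchronous tag check faults */
static int mte_enable(void)
{
	return prctl(PR_SET_TAGGED_ADDR_CTRL,
	             PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
	             (0xfffeUL << PR_MTE_TAG_SHIFT), 0, 0, 0);
}

/* and the heap mappings need PROT_MTE for the checks to apply */
static void *mte_mmap(size_t len)
{
	return mmap(0, len, PROT_READ|PROT_WRITE|PROT_MTE,
	            MAP_PRIVATE|MAP_ANON, -1, 0);
}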

another potential issue is the early ALIGN_UP of the malloc
size: this overflows malloc(-1) and i think changes
malloc_usable_size(malloc(1)).
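
concretely (standalone illustration of the wraparound, using the
patch's ALIGN_UP definition):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))

int main(void)
{
	size_t n = (size_t)-1;            /* what malloc(-1) receives */
	printf("%zu\n", ALIGN_UP(n, 16)); /* prints 0: the size wraps, so size_overflows() never rejects it */
	return 0;
}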

iirc i changed IB when i tried out mte with mallocng.

i would avoid excessive ifdefs in the code, e.g. by using
'p = untag(p);' and define untag appropriately in a header.
(this might as well do the tag checks when mte is enabled,
but might need special-casing 0 sized allocations.)
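
i.e. something along these lines in a header (sketch):

#ifdef MEMTAG
static inline void *untag(void *p)
{
	/* drop the tag bits; this is also the natural place for the
	   ldg-based tag check when mte is enabled */
	return (void *)((uintptr_t)p & ~MTE_TAG_MASK);
}
#else
static inline void *untag(void *p)
{
	return p;
}
#endif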

tagging large areas can be slow, just like 'dc zva,x0' in
aarch64 memset, there is 'dc gva,x0' that tags an entire
cacheline, alternatively it may be better to just rely on
page protection for large allocations to avoid tagging
overheads (don't even mmap them as PROT_MTE). i've never
actually benchmarked what makes sense in practice, i'd at
least put the hand written loop behind an abstraction.
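
e.g. something like this behind one interface (sketch; plain stg
loop only, so an st2g or dc gva fast path, or a "no PROT_MTE for
large allocations" policy, can be swapped in later without touching
callers):

static inline void mte_tag_region(void *p, size_t len)
{
	/* p must carry the desired tag and be granule-aligned;
	   stg writes that tag to each 16-byte granule */
	uintptr_t a = (uintptr_t)p;
	for (size_t i = 0; i < len; i += MTE_TAG_GRANULE)
		__asm__ __volatile__("stg	%x0, [%x0]"
				     : : "r"(a + i) : "memory");
}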


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-11 14:09 ` Szabolcs Nagy
@ 2024-06-11 14:46   ` Rich Felker
  2024-06-11 15:37     ` Stefan Jumarea
  0 siblings, 1 reply; 8+ messages in thread
From: Rich Felker @ 2024-06-11 14:46 UTC (permalink / raw)
  To: Stefan Jumarea, musl

On Tue, Jun 11, 2024 at 04:09:22PM +0200, Szabolcs Nagy wrote:
> * Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> > @@ -102,17 +107,30 @@ void free(void *p)
> >  {
> >  	if (!p) return;
> >  
> > +#ifdef MEMTAG
> > +	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
> > +#else
> > +	void *untagged = p;
> > +#endif
> > +
> >  	struct meta *g = get_meta(p);
> ....
> >  static inline struct meta *get_meta(const unsigned char *p)
> >  {
> >  	assert(!((uintptr_t)p & 15));
> > -	int offset = *(const uint16_t *)(p - 2);
> > -	int index = get_slot_index(p);
> > -	if (p[-4]) {
> > +#ifdef MEMTAG
> > +	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
> > +#else
> > +	const unsigned char *untagged = p;
> > +#endif
> 
> if free just drops the tag, then incorrectly tagged pointer
> will not be detected. musl does some use-after-free checks,
> so i don't know how important this is, but i'd check that the
> passed pointer matches the memory tag (unless 0 sized and
> that the tag is non-0) otherwise a forged pointer may cause
> corruption.

Yes. Would just accessing a byte at the start fix this?

> i don't see where you enable tagged pointer abi and checks
> (prctl) or where you add PROT_MTE to the mmapped memory.
> https://www.kernel.org/doc/html/latest/arch/arm64/memory-tagging-extension.html
> https://www.kernel.org/doc/html/latest/arch/arm64/tagged-address-abi.html
> (i think we want the synchronous tag check option.)
> note there is software that assumes page granularity for
> memory protection e.g. does read access at p&-4096, and
> there may be software that assumes the top bits of a valid
> pointer are 0, so unconditionally enabling tagging and tag
> checks can be an issue.

I don't think that's a problem. malloc has no contract to allow those
things.

> another potential issue is the early ALIGN_UP of the malloc
> size: this overflows malloc(-1) and i think changes
> malloc_usable_size(malloc(1)).

Yes, I saw that was incorrect and don't understand the motivation. If
it's to avoid tag mismatch at the final 12b, that could be done where
the size class is selected instead. But it would be better if we could
make it work with the tag mismatch (not sure how hard that is) so this
space is still usable (ignoring tag when accessing the header).

> iirc i changed IB when i tried out mte with mallocng.
> 
> i would avoid excessive ifdefs in the code, e.g. by using
> 'p = untag(p);' and define untag appropriately in a header.
> (this might as well do the tag checks when mte is enabled,

Yes.

> but might need special-casing 0 sized allocations.)

Zero-sized allocations could maybe be implemented as a wrong tag? But
then we'd need a way for free to let them pass untrapped.

> tagging large areas can be slow, just like 'dc zva,x0' in
> aarch64 memset, there is 'dc gva,x0' that tags an entire
> cacheline, alternatively it may be better to just rely on
> page protection for large allocations to avoid tagging
> overheads (don't even mmap them as PROT_MTE). i've never
> actually benchmarked what makes sense in practice, i'd at
> least put the hand written loop behind an abstraction.

That loses a lot of the benefit of MTE. Page protections don't work
because there would be too many VMAs if we put guard pages after each
mmap-serviced allocation. However, one thing you can do to protect
against overflow in mmap-serviced allocations without tagging the
whole thing with an O(n) operation is just tagging the tail up to the
end of the page. This would make accesses past the end trap.
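
Something like this (sketch; the helper name and arguments are made
up, and it assumes the mapping is PROT_MTE with the user region left
at tag zero):

/* Give the slack between the end of the user region and the end of
 * the mapping a non-zero tag, so a zero-tagged pointer running off
 * the end faults, without an O(n) pass over the allocation itself. */
static void mte_poison_tail(unsigned char *end, unsigned char *map_end)
{
	uintptr_t a = ((uintptr_t)end + 15) & ~(uintptr_t)15;
	for (; a < (uintptr_t)map_end; a += 16) {
		uintptr_t tagged = a | ((uintptr_t)1 << 56); /* tag 1 */
		__asm__ __volatile__("stg	%x0, [%x0]"
				     : : "r"(tagged) : "memory");
	}
}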

Rich


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-11 14:46   ` Rich Felker
@ 2024-06-11 15:37     ` Stefan Jumarea
  2024-06-11 16:42       ` Rich Felker
  0 siblings, 1 reply; 8+ messages in thread
From: Stefan Jumarea @ 2024-06-11 15:37 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Tue, Jun 11, 2024 at 10:46:25AM -0400, Rich Felker wrote:
> On Tue, Jun 11, 2024 at 04:09:22PM +0200, Szabolcs Nagy wrote:
> > * Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> > > @@ -102,17 +107,30 @@ void free(void *p)
> > >  {
> > >  	if (!p) return;
> > >  
> > > +#ifdef MEMTAG
> > > +	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
> > > +#else
> > > +	void *untagged = p;
> > > +#endif
> > > +
> > >  	struct meta *g = get_meta(p);
> > ....
> > >  static inline struct meta *get_meta(const unsigned char *p)
> > >  {
> > >  	assert(!((uintptr_t)p & 15));
> > > -	int offset = *(const uint16_t *)(p - 2);
> > > -	int index = get_slot_index(p);
> > > -	if (p[-4]) {
> > > +#ifdef MEMTAG
> > > +	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
> > > +#else
> > > +	const unsigned char *untagged = p;
> > > +#endif
> > 
> > if free just drops the tag, then incorrectly tagged pointer
> > will not be detected. musl does some use-after-free checks,
> > so i don't know how important this is, but i'd check that the
> > passed pointer matches the memory tag (unless 0 sized and
> > that the tag is non-0) otherwise a forged pointer may cause
> > corruption.
> 
> Yes. Would just accessing a byte at the start fix this?

Yes, a byte access can solve this. Will add.

> 
> > i don't see where you enable tagged pointer abi and checks
> > (prctl) or where you add PROT_MTE to the mmapped memory.
> > https://www.kernel.org/doc/html/latest/arch/arm64/memory-tagging-extension.html
> > https://www.kernel.org/doc/html/latest/arch/arm64/tagged-address-abi.html
> > (i think we want the synchronous tag check option.)
> > note there is software that assumes page granularity for
> > memory protection e.g. does read access at p&-4096, and
> > there may be software that assumes the top bits of a valid
> > pointer are 0, so unconditionally enabling tagging and tag
> > checks can be an issue.
> 
> I don't think that's a problem. malloc has no contract to allow those
> things.
> 
> > another potential issue is the early ALIGN_UP of the malloc
> > size: this overflows malloc(-1) and i think changes
> > malloc_usable_size(malloc(1)).
> 
> Yes, I saw that was incorrect and don't understand the motivation. If
> it's to avoid tag mismatch at the final 12b, that could be done where
> the size class is selected instead. But it would be better if we could
> make it work with the tag mismatch (not sure how hard that is) so this
> space is still usable (ignoring tag when accessing the header).
> 

This was done since MTE has a 16 byte granule for tagging.
Makes more sense to do this where the class is selected, yes. I'm not
sure about making it work for smaller allocations, I'll try to think of
a way to cover that.

> > iirc i changed IB when i tried out mte with mallocng.
> > 
> > i would avoid excessive ifdefs in the code, e.g. by using
> > 'p = untag(p);' and define untag appropriately in a header.
> > (this might as well do the tag checks when mte is enabled,

Agree, will do.

> 
> Yes.
> 
> > but might need special-casing 0 sized allocations.)
> 
> Zero-sized allocations could maybe be implemented as a wrong tag? But
> then we'd need a way for free to let them pass untrapped.
> 

Hm, a wrong tag seems like a nice idea, but I don't see an easy way to
let the free pass untrapped. Can we do a special case and return NULL on
zero-size allocations?

Stefan


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-11 15:37     ` Stefan Jumarea
@ 2024-06-11 16:42       ` Rich Felker
  2024-06-11 17:13         ` Stefan Jumarea
  0 siblings, 1 reply; 8+ messages in thread
From: Rich Felker @ 2024-06-11 16:42 UTC (permalink / raw)
  To: Stefan Jumarea; +Cc: musl

On Tue, Jun 11, 2024 at 06:37:11PM +0300, Stefan Jumarea wrote:
> On Tue, Jun 11, 2024 at 10:46:25AM -0400, Rich Felker wrote:
> > On Tue, Jun 11, 2024 at 04:09:22PM +0200, Szabolcs Nagy wrote:
> > > * Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> > > > @@ -102,17 +107,30 @@ void free(void *p)
> > > >  {
> > > >  	if (!p) return;
> > > >  
> > > > +#ifdef MEMTAG
> > > > +	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
> > > > +#else
> > > > +	void *untagged = p;
> > > > +#endif
> > > > +
> > > >  	struct meta *g = get_meta(p);
> > > ....
> > > >  static inline struct meta *get_meta(const unsigned char *p)
> > > >  {
> > > >  	assert(!((uintptr_t)p & 15));
> > > > -	int offset = *(const uint16_t *)(p - 2);
> > > > -	int index = get_slot_index(p);
> > > > -	if (p[-4]) {
> > > > +#ifdef MEMTAG
> > > > +	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
> > > > +#else
> > > > +	const unsigned char *untagged = p;
> > > > +#endif
> > > 
> > > if free just drops the tag, then incorrectly tagged pointer
> > > will not be detected. musl does some use-after-free checks,
> > > so i don't know how important this is, but i'd check that the
> > > passed pointer matches the memory tag (unless 0 sized and
> > > that the tag is non-0) otherwise a forged pointer may cause
> > > corruption.
> > 
> > Yes. Would just accessing a byte at the start fix this?
> 
> Yes, a byte access can solve this. Will add.
> 
> > 
> > > i don't see where you enable tagged pointer abi and checks
> > > (prctl) or where you add PROT_MTE to the mmapped memory.
> > > https://www.kernel.org/doc/html/latest/arch/arm64/memory-tagging-extension.html
> > > https://www.kernel.org/doc/html/latest/arch/arm64/tagged-address-abi.html
> > > (i think we want the synchronous tag check option.)
> > > note there is software that assumes page granularity for
> > > memory protection e.g. does read access at p&-4096, and
> > > there may be software that assumes the top bits of a valid
> > > pointer are 0, so unconditionally enabling tagging and tag
> > > checks can be an issue.
> > 
> > I don't think that's a problem. malloc has no contract to allow those
> > things.
> > 
> > > another potential issue is the early ALIGN_UP of the malloc
> > > size: this overflows malloc(-1) and i think changes
> > > malloc_usable_size(malloc(1)).
> > 
> > Yes, I saw that was incorrect and don't understand the motivation. If
> > it's to avoid tag mismatch at the final 12b, that could be done where
> > the size class is selected instead. But it would be better if we could
> > make it work with the tag mismatch (not sure how hard that is) so this
> > space is still usable (ignoring tag when accessing the header).
> > 
> 
> This was done since MTE has a 16 byte granule for tagging.
> Makes more sense to do this where the class is selected, yes. I'm not
> sure about making it work for smaller allocations, I'll try to think of
> a way to cover that.

Is there any way to perform a "load, ignoring tag mismatch" operation?

> > > iirc i changed IB when i tried out mte with mallocng.
> > > 
> > > i would avoid excessive ifdefs in the code, e.g. by using
> > > 'p = untag(p);' and define untag appropriately in a header.
> > > (this might as well do the tag checks when mte is enabled,
> 
> Agree, will do.
> 
> > 
> > Yes.
> > 
> > > but might need special-casing 0 sized allocations.)
> > 
> > Zero-sized allocations could maybe be implemented as a wrong tag? But
> > then we'd need a way for free to let them pass untrapped.
> > 
> 
> Hm, a wrong tag seems like a nice idea, but I don't see an easy way to
> let the free pass untrapped.

You'd need to be able to peek at the metadata and see that it's
zero-sized.

> Can we do a special case and return NULL on
> zero-size allocations?

Nope, that's not an option. While the standard allows it, it's awful
behavior and fundamentally inconsistent with how realloc works (null
return is ambiguous between failure to realloc, in which case old
object would still exist, and successful realloc to size zero, in which
case old object does not exist).

Rich


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-11 16:42       ` Rich Felker
@ 2024-06-11 17:13         ` Stefan Jumarea
  2024-06-11 19:39           ` Szabolcs Nagy
  0 siblings, 1 reply; 8+ messages in thread
From: Stefan Jumarea @ 2024-06-11 17:13 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

On Tue, Jun 11, 2024 at 12:42:22PM -0400, Rich Felker wrote:
> On Tue, Jun 11, 2024 at 06:37:11PM +0300, Stefan Jumarea wrote:
> > On Tue, Jun 11, 2024 at 10:46:25AM -0400, Rich Felker wrote:
> > > On Tue, Jun 11, 2024 at 04:09:22PM +0200, Szabolcs Nagy wrote:
> > > > * Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> > 
> > This was done since MTE has a 16 byte granule for tagging.
> > Makes more sense to do this where the class is selected, yes. I'm not
> > sure about making it work for smaller allocations, I'll try to think of
> > a way to cover that.
> 
> Is there any way to perform a "load, ignoring tag mismatch" operation?
> 

Yes, there is the `ldg` instruction that can load the tag of a given
address, so we can use that.

> > > > iirc i changed IB when i tried out mte with mallocng.
> > > > 
> > > > i would avoid excessive ifdefs in the code, e.g. by using
> > > > 'p = untag(p);' and define untag appropriately in a header.
> > > > (this might as well do the tag checks when mte is enabled,
> > 
> > Agree, will do.
> > 
> > > 
> > > Yes.
> > > 
> > > > but might need special-casing 0 sized allocations.)
> > > 
> > > Zero-sized allocations could maybe be implemented as a wrong tag? But
> > > then we'd need a way for free to let them pass untrapped.
> > > 
> > 
> > Hm, a wrong tag seems like a nice idea, but I don't see an easy way to
> > let the free pass untrapped.
> 
> You'd need to be able to peek at the metadata and see that it's
> zero-sized.
> 
> > Can we do a special case and return NULL on
> > zero-size allocations?
> 
> Nope, that's not an option. While the standard allows it, it's awful
> behavior and fundamentally inconsistent with how realloc works (null
> return is ambiguous between failure to realloc, in which case old
> object would still exist, and successful realloc to size zero, in which
> case old object does not exist).

I see, makes sense.

Stefan


* Re: [musl] [PATCH] mallocng: Add MTE support for Aarch64
  2024-06-11 17:13         ` Stefan Jumarea
@ 2024-06-11 19:39           ` Szabolcs Nagy
  0 siblings, 0 replies; 8+ messages in thread
From: Szabolcs Nagy @ 2024-06-11 19:39 UTC (permalink / raw)
  To: Stefan Jumarea; +Cc: Rich Felker, musl

* Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-11 20:13:28 +0300]:

> On Tue, Jun 11, 2024 at 12:42:22PM -0400, Rich Felker wrote:
> > On Tue, Jun 11, 2024 at 06:37:11PM +0300, Stefan Jumarea wrote:
> > > On Tue, Jun 11, 2024 at 10:46:25AM -0400, Rich Felker wrote:
> > > > On Tue, Jun 11, 2024 at 04:09:22PM +0200, Szabolcs Nagy wrote:
> > > > > * Stefan Jumarea <stefanjumarea02@gmail.com> [2024-06-10 15:36:25 +0300]:
> > > 
> > > This was done since MTE has a 16 byte granule for tagging.
> > > Makes more sense to do this where the class is selected, yes. I'm not
> > > sure about making it work for smaller allocations, I'll try to think of
> > > a way to cover that.
> > 
> > Is there any way to perform a "load, ignoring tag mismatch" operation?
> > 
> 
> Yes, there is the `ldg` instruction that can load the tag of a given
> address, so we can use that.
> 

there is no atomic load with tag ignored. ldg+ld works if you know the
memory tag cannot change asynchronously.

writing to the tco register can disable (and then enable) tag checks
(see the linux docs).
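
e.g. (sketch; the msr forms need the +memtag target feature the
patch already enables):

static inline uint16_t load16_no_tag_check(const uint16_t *p)
{
	uint16_t v;
	/* PSTATE.TCO=1 suppresses tag check faults for the access */
	__asm__ __volatile__("msr	tco, #1" ::: "memory");
	v = *p;
	__asm__ __volatile__("msr	tco, #0" ::: "memory");
	return v;
}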

i think it's a bad idea to share the same 16-byte granule between user
allocation and in-band malloc meta data for the next allocation:
the two can be freed independently and thus the tag of the granule
can change asynchronously when accessed. and using tco to access the
in-band data might have overheads.

moving the in-band meta data to another granule is imho the right
solution but it costs more space than the compact mallocng design.



