Currently, wmemcpy is implemented as a hand-written loop, which the compiler aggressively optimizes.  In terms of the generated binary, it's probably cleaner to implement it as a call to memcpy.

diff --git a/src/string/wmemcpy.c b/src/string/wmemcpy.c
index 52e6e6e..272f37a 100644
--- a/src/string/wmemcpy.c
+++ b/src/string/wmemcpy.c
@@ -1,8 +1,7 @@
+#include <string.h>
 #include <wchar.h>
 
 wchar_t *wmemcpy(wchar_t *restrict d, const wchar_t *restrict s, size_t n)
 {
- wchar_t *a = d;
- while (n--) *d++ = *s++;
- return a;
+ return memcpy(d, s, n * sizeof(wchar_t));
 }