mailing list of musl libc
 help / color / mirror / code / Atom feed
* [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
@ 2021-07-13 12:02 Stefan Kanthak
  2021-07-13 14:22 ` Érico Nogueira
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Stefan Kanthak @ 2021-07-13 12:02 UTC (permalink / raw)
  To: musl

<https://git.musl-libc.org/cgit/musl/plain/src/string/strcspn.c>

 #include <string.h>
 
 #define BITOP(a,b,op) \
  ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
 
-size_t strcspn(const char *s, const char *c)
+size_t strcspn(const char *restrict s, const char *c)
 {
-        const char *a = s;
+        const char *a;
-        size_t byteset[32/sizeof(size_t)];
+        size_t byteset[32/sizeof(size_t)] = { 0 };
 
-        if (!c[0] || !c[1]) return __strchrnul(s, *c)-a;
+        if (!c[0] || !c[1]) return __strchrnul(a=s, *c)-a;
 
-        memset(byteset, 0, sizeof byteset);
         for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
-        for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
-        return s-a;
+        for (a=s; *a && !BITOP(byteset, *(unsigned char *)a, &); a++);
+        return a-s;
 }

After this change, musl's favorite compiler (GCC) will generate code
like the following (here for x86-64), just like the code it already
generates for strspn(), where the initialization of byteset[] is NOT
done via memset():

strcspn:
...
        xor    %eax, %eax
        movq   %rax, byteset(%rsp)
        movq   %rax, byteset+8(%rsp)
        movq   %rax, byteset+16(%rsp)
        movq   %rax, byteset+24(%rsp)
...

<https://git.musl-libc.org/cgit/musl/plain/src/string/strspn.c>

 #include <string.h>
 
 #define BITOP(a,b,op) \
  ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
 
-size_t strspn(const char *s, const char *c)
+size_t strspn(const char *restrict s, const char *c)
 {
-        const char *a = s;
+        const char *a;
         size_t byteset[32/sizeof(size_t)] = { 0 };
 
         if (!c[0]) return 0;
         if (!c[1]) {
-                for (; *s == *c; s++);
-                return s-a;
+                for (a=s; *a == *c; a++);
+                return a-s;
         }
 
         for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
-        for (; *s && BITOP(byteset, *(unsigned char *)s, &); s++);
-        return s-a;
+        for (a=s; *a && BITOP(byteset, *(unsigned char *)a, &); a++);
+        return a-s;
 }

<https://git.musl-libc.org/cgit/musl/plain/src/string/strtok.c>

 #include <string.h>
 
 char *strtok(char *restrict s, const char *restrict sep)
 {
         static char *p;
+        return strtok_r(s, sep, &p);
-        if (!s && !(s = p)) return NULL;
-        s += strspn(s, sep);
-        if (!*s) return p = 0;
-        p = s + strcspn(s, sep);
-        if (*p) *p++ = 0;
-        else p = 0;
-        return s;
 }

<https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>

 #include <string.h>
 
 char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
 {
         if (!s && !(s = *p)) return NULL;
         s += strspn(s, sep);
-        if (!*s) return *p = 0;
+        if (!*s) return *p = NULL;
         *p = s + strcspn(s, sep);
         if (**p) *(*p)++ = 0;
-        else *p = 0;
+        else *p = NULL;
         return s;
 }

If you want to go a step further, avoid to build the same byteset twice:

<https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>

 #include <string.h>
 
 char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
 {
+        size_t byteset[32/sizeof(size_t)] = { 0 };
+
         if (!s && !(s = *p)) return NULL;
+        if (!*s) return *p = NULL;
+        if (!*sep) return *p = NULL, *s;
-        s += strspn(s, sep);
+        for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
+        for (; *s && BITOP(byteset, *(unsigned char *)s, &); s++);
-        if (!*s) return *p = 0;
+        if (!*s) return *p = NULL;
-        *p = s + strcspn(s, sep);
-        if (**p) *(*p)++ = 0;
-        else *p = 0;
-        return s;
+        sep = s;
+        for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
+        if (*s) *s++ = 0;
+        else *s = NULL;
+        *p = s;
+        return sep;
 }

Stefan

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
  2021-07-13 12:02 [musl] Changes for strcspn(), strspn(), strtok() and strtok_r() Stefan Kanthak
@ 2021-07-13 14:22 ` Érico Nogueira
  2021-07-13 14:49 ` Érico Nogueira
  2021-07-13 15:45 ` Rich Felker
  2 siblings, 0 replies; 6+ messages in thread
From: Érico Nogueira @ 2021-07-13 14:22 UTC (permalink / raw)
  To: musl

On Tue Jul 13, 2021 at 9:02 AM -03, Stefan Kanthak wrote:
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strcspn.c>
>
> #include <string.h>
>  
> #define BITOP(a,b,op) \
> ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof
> *(a))))
>  
> -size_t strcspn(const char *s, const char *c)
> +size_t strcspn(const char *restrict s, const char *c)
> {
> - const char *a = s;
> + const char *a;
> - size_t byteset[32/sizeof(size_t)];
> + size_t byteset[32/sizeof(size_t)] = { 0 };
>  
> - if (!c[0] || !c[1]) return __strchrnul(s, *c)-a;
> + if (!c[0] || !c[1]) return __strchrnul(a=s, *c)-a;
>  
> - memset(byteset, 0, sizeof byteset);
> for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> - for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
> - return s-a;
> + for (a=s; *a && !BITOP(byteset, *(unsigned char *)a, &); a++);
> + return a-s;
> }

I don't know if it makes sense to give such a hint to the compiler, but
doing something like:

[...]
- size_t byteset[32/sizeof(size_t)];

- if (!c[0] || !c[1]) return __strchrnul(s, *c)-a;
+ if (!c[0] || !c[1]) return __strchrnul(a=s, *c)-a;

- memset(byteset, 0, sizeof byteset);
+ size_t byteset[32/sizeof(size_t)] = { 0 };
[...]

would possibly leave the idea about initialization order clearer, at the
expense of not following the code style.

>
> After this change, musl's favorite compiler (GCC) will generate code
> like the following (here for x86-64), just like the code it already
> generates for strspn(), where the initialization of byteset[] is NOT
> done via memset():
>
> strcspn:
> ...
> xor %eax, %eax
> movq %rax, byteset(%rsp)
> movq %rax, byteset+8(%rsp)
> movq %rax, byteset+16(%rsp)
> movq %rax, byteset+24(%rsp)
> ...
>
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strspn.c>
>
> #include <string.h>
>  
> #define BITOP(a,b,op) \
> ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof
> *(a))))
>  
> -size_t strspn(const char *s, const char *c)
> +size_t strspn(const char *restrict s, const char *c)
> {
> - const char *a = s;
> + const char *a;
> size_t byteset[32/sizeof(size_t)] = { 0 };
>  
> if (!c[0]) return 0;
> if (!c[1]) {
> - for (; *s == *c; s++);
> - return s-a;
> + for (a=s; *a == *c; a++);
> + return a-s;
> }
>  
> for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> - for (; *s && BITOP(byteset, *(unsigned char *)s, &); s++);
> - return s-a;
> + for (a=s; *a && BITOP(byteset, *(unsigned char *)a, &); a++);
> + return a-s;
> }

Does this improve register utilization? It would be nice to explain why
moving the assignment and changing the loop variable is an improvement.
(Sending full git patches helps with that ;)

This question applies to the previous diff as well.

>
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strtok.c>
>
> #include <string.h>
>  
> char *strtok(char *restrict s, const char *restrict sep)
> {
> static char *p;
> + return strtok_r(s, sep, &p);
> - if (!s && !(s = p)) return NULL;
> - s += strspn(s, sep);
> - if (!*s) return p = 0;
> - p = s + strcspn(s, sep);
> - if (*p) *p++ = 0;
> - else p = 0;
> - return s;
> }

It's been this way since the initial check-in commit, I wonder if it was
intentional. Your way should save some bytes, though :)

>
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>
>
> #include <string.h>
>  
> char *strtok_r(char *restrict s, const char *restrict sep, char
> **restrict p)
> {
> if (!s && !(s = *p)) return NULL;
> s += strspn(s, sep);
> - if (!*s) return *p = 0;
> + if (!*s) return *p = NULL;
> *p = s + strcspn(s, sep);
> if (**p) *(*p)++ = 0;
> - else *p = 0;
> + else *p = NULL;
> return s;
> }

AFAIK musl style doesn't like using NULL, so 0 would always be
correct... Unless using NULL here is intentional to not confuse with the
string's null terminator?

>
> If you want to go a step further, avoid to build the same byteset twice:
>
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>
>
> #include <string.h>
>  
> char *strtok_r(char *restrict s, const char *restrict sep, char
> **restrict p)
> {
> + size_t byteset[32/sizeof(size_t)] = { 0 };
> +
> if (!s && !(s = *p)) return NULL;
> + if (!*s) return *p = NULL;
> + if (!*sep) return *p = NULL, *s;
> - s += strspn(s, sep);
> + for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> + for (; *s && BITOP(byteset, *(unsigned char *)s, &); s++);
> - if (!*s) return *p = 0;
> + if (!*s) return *p = NULL;
> - *p = s + strcspn(s, sep);
> - if (**p) *(*p)++ = 0;
> - else *p = 0;
> - return s;
> + sep = s;
> + for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
> + if (*s) *s++ = 0;
> + else *s = NULL;
> + *p = s;
> + return sep;
> }
>
> Stefan


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
  2021-07-13 12:02 [musl] Changes for strcspn(), strspn(), strtok() and strtok_r() Stefan Kanthak
  2021-07-13 14:22 ` Érico Nogueira
@ 2021-07-13 14:49 ` Érico Nogueira
  2021-07-13 15:45 ` Rich Felker
  2 siblings, 0 replies; 6+ messages in thread
From: Érico Nogueira @ 2021-07-13 14:49 UTC (permalink / raw)
  To: musl

On Tue Jul 13, 2021 at 9:02 AM -03, Stefan Kanthak wrote:
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strcspn.c>
>
> #include <string.h>
>  
> #define BITOP(a,b,op) \
> ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof
> *(a))))
>  
> -size_t strcspn(const char *s, const char *c)
> +size_t strcspn(const char *restrict s, const char *c)
> {
> - const char *a = s;
> + const char *a;
> - size_t byteset[32/sizeof(size_t)];
> + size_t byteset[32/sizeof(size_t)] = { 0 };
>  
> - if (!c[0] || !c[1]) return __strchrnul(s, *c)-a;
> + if (!c[0] || !c[1]) return __strchrnul(a=s, *c)-a;

This is unspecified behavior and could lead to UB (reading from
uninitialized a). musl enables -Wsequence-point, so building with
--enable-warnings is important to catch such issues.

>  
> - memset(byteset, 0, sizeof byteset);
> for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> - for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
> - return s-a;
> + for (a=s; *a && !BITOP(byteset, *(unsigned char *)a, &); a++);
> + return a-s;
> }

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
  2021-07-13 12:02 [musl] Changes for strcspn(), strspn(), strtok() and strtok_r() Stefan Kanthak
  2021-07-13 14:22 ` Érico Nogueira
  2021-07-13 14:49 ` Érico Nogueira
@ 2021-07-13 15:45 ` Rich Felker
  2021-07-13 16:31   ` Stefan Kanthak
  2 siblings, 1 reply; 6+ messages in thread
From: Rich Felker @ 2021-07-13 15:45 UTC (permalink / raw)
  To: Stefan Kanthak; +Cc: musl

On Tue, Jul 13, 2021 at 02:02:06PM +0200, Stefan Kanthak wrote:
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strcspn.c>
> 
>  #include <string.h>
>  
>  #define BITOP(a,b,op) \
>   ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
>  
> -size_t strcspn(const char *s, const char *c)
> +size_t strcspn(const char *restrict s, const char *c)
>  {
> -        const char *a = s;
> +        const char *a;
> -        size_t byteset[32/sizeof(size_t)];
> +        size_t byteset[32/sizeof(size_t)] = { 0 };
>  
> -        if (!c[0] || !c[1]) return __strchrnul(s, *c)-a;
> +        if (!c[0] || !c[1]) return __strchrnul(a=s, *c)-a;
>  
> -        memset(byteset, 0, sizeof byteset);
>          for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> -        for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
> -        return s-a;
> +        for (a=s; *a && !BITOP(byteset, *(unsigned char *)a, &); a++);
> +        return a-s;
>  }
> 
> After this change, musl's favorite compiler (GCC) will generate code
> like the following (here for x86-64), just like the code it already
> generates for strspn(), where the initialization of byteset[] is NOT
> done via memset():

That's probably a good change, but it's a 2-line change. The above
diff has at least 4 other gratuitous changes that have nothing to do
with getting rid of the memset call. Even if they were desirable, they
can't be done as part of the same change; that destroys the ability to
review/audit the history.

Note that I'd like to restore the ability of the compiler to transform
calls to memset into a builtin, but it's nontrivial because we have to
have -fno-builtin (via -ffreestanding) as the baseline to prevent
circular definitions.

>  #include <string.h>
>  
>  char *strtok(char *restrict s, const char *restrict sep)
>  {
>          static char *p;
> +        return strtok_r(s, sep, &p);
> -        if (!s && !(s = p)) return NULL;
> -        s += strspn(s, sep);
> -        if (!*s) return p = 0;
> -        p = s + strcspn(s, sep);
> -        if (*p) *p++ = 0;
> -        else p = 0;
> -        return s;
>  }
> 
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>

This cannot be done due to namespace. It may be desirable to make a
namespace-safe alias __strtok_r that could be called to implement
strtok, though.

>  #include <string.h>
>  
>  char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
>  {
>          if (!s && !(s = *p)) return NULL;
>          s += strspn(s, sep);
> -        if (!*s) return *p = 0;
> +        if (!*s) return *p = NULL;
>          *p = s + strcspn(s, sep);
>          if (**p) *(*p)++ = 0;
> -        else *p = 0;
> +        else *p = NULL;
>          return s;
>  }

This is a gratuitous style change in the opposite direction of what's
preferred in musl.

> If you want to go a step further, avoid to build the same byteset twice:
> 
> <https://git.musl-libc.org/cgit/musl/plain/src/string/strtok_r.c>
> 
>  #include <string.h>
>  
>  char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
>  {
> +        size_t byteset[32/sizeof(size_t)] = { 0 };
> +
>          if (!s && !(s = *p)) return NULL;
> +        if (!*s) return *p = NULL;
> +        if (!*sep) return *p = NULL, *s;
> -        s += strspn(s, sep);
> +        for (; *c && BITOP(byteset, *(unsigned char *)c, |=); c++);
> +        for (; *s && BITOP(byteset, *(unsigned char *)s, &); s++);
> -        if (!*s) return *p = 0;
> +        if (!*s) return *p = NULL;
> -        *p = s + strcspn(s, sep);
> -        if (**p) *(*p)++ = 0;
> -        else *p = 0;
> -        return s;
> +        sep = s;
> +        for (; *s && !BITOP(byteset, *(unsigned char *)s, &); s++);
> +        if (*s) *s++ = 0;
> +        else *s = NULL;
> +        *p = s;
> +        return sep;
>  }

Rewriting code that works and does not have any problems (note: UB or
platform assumptions that could bite someone later would count as a
problem) is not improvement. It consumes maintainer time *and* time of
anyone consuming musl who wants to review all changes for possible
security bugs/backdoors. If there are any worthwhile changes to this
type of legacy function, they should be made as isolated and simple as
possible to ensure that they can be reviewed easily by anyone who
wants to.

Rich

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
  2021-07-13 15:45 ` Rich Felker
@ 2021-07-13 16:31   ` Stefan Kanthak
  2021-07-13 17:40     ` Rich Felker
  0 siblings, 1 reply; 6+ messages in thread
From: Stefan Kanthak @ 2021-07-13 16:31 UTC (permalink / raw)
  To: Rich Felker; +Cc: musl

Rich Felker wrote:

> On Tue, Jul 13, 2021 at 02:02:06PM +0200, Stefan Kanthak wrote:

>>  #include <string.h>
>>  
>>  char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
>>  {
>>          if (!s && !(s = *p)) return NULL;
>>          s += strspn(s, sep);
>> -        if (!*s) return *p = 0;
>> +        if (!*s) return *p = NULL;
>>          *p = s + strcspn(s, sep);
>>          if (**p) *(*p)++ = 0;
>> -        else *p = 0;
>> +        else *p = NULL;
>>          return s;
>>  }
> 
> This is a gratuitous style change in the opposite direction of what's
> preferred in musl.

Do you REALLY prefer using NULL and 0 in parallel for the null pointer?

Stefan

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [musl] Changes for strcspn(), strspn(), strtok() and strtok_r()
  2021-07-13 16:31   ` Stefan Kanthak
@ 2021-07-13 17:40     ` Rich Felker
  0 siblings, 0 replies; 6+ messages in thread
From: Rich Felker @ 2021-07-13 17:40 UTC (permalink / raw)
  To: musl

On Tue, Jul 13, 2021 at 06:31:12PM +0200, Stefan Kanthak wrote:
> Rich Felker wrote:
> 
> > On Tue, Jul 13, 2021 at 02:02:06PM +0200, Stefan Kanthak wrote:
> 
> >>  #include <string.h>
> >>  
> >>  char *strtok_r(char *restrict s, const char *restrict sep, char **restrict p)
> >>  {
> >>          if (!s && !(s = *p)) return NULL;
> >>          s += strspn(s, sep);
> >> -        if (!*s) return *p = 0;
> >> +        if (!*s) return *p = NULL;
> >>          *p = s + strcspn(s, sep);
> >>          if (**p) *(*p)++ = 0;
> >> -        else *p = 0;
> >> +        else *p = NULL;
> >>          return s;
> >>  }
> > 
> > This is a gratuitous style change in the opposite direction of what's
> > preferred in musl.
> 
> Do you REALLY prefer using NULL and 0 in parallel for the null pointer?

No, just 0. I didn't see that NULL also appeared above too; most
places it appears are just leftovers from really old code.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-07-13 17:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13 12:02 [musl] Changes for strcspn(), strspn(), strtok() and strtok_r() Stefan Kanthak
2021-07-13 14:22 ` Érico Nogueira
2021-07-13 14:49 ` Érico Nogueira
2021-07-13 15:45 ` Rich Felker
2021-07-13 16:31   ` Stefan Kanthak
2021-07-13 17:40     ` Rich Felker

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).