Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS

mailing list of musl libc
 help / color / mirror / code / Atom feed

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
@ 2021-07-10  5:31 jason
  2021-07-10 17:11 ` Rich Felker
  2021-07-10 17:29 ` Samuel Holland
  0 siblings, 2 replies; 12+ messages in thread
From: jason @ 2021-07-10  5:31 UTC (permalink / raw)
  To: musl, vincent.donnefort; +Cc: jason

Here's a simpler bit of code to do the same thing.  The differences:
- Doesn't bother using stdio.  /proc and /sys reads like this don't sleep
  and so don't return short or EINTR or any of that hair.

- Check for the terminating newline, and use it in the parsing loop
  to trigger completing a range, just like ','.

  The previous code never actually ensured that the buffer was '\0'
  terminated. :-(

- Use one "current number being parsed" variable rather than two.
  Just copy n to prev_n on finding a '-'.  (Feel free to bikeshed the
  variable names to "range_start" or something if you prefer.)

If you do want to use a FILE *, musl should probably have an internal
API for reading into the stdio buffer and parsing straight from there,
avoiding the separate stack buffer.  (Or even allocating the FILE on
the stack and avoiding the malloc.)

This is probably far from the last small config file (/proc or /sys or
perhaps something like /etc/hostname) that someone will want to access.

(You could even get ambitious and make it reallocate the buffer as
required to get the whole file into memory  Probably something other
code would be interested in, like timezone parsing.)


#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

long get_nrprocessors_conf1(void)
{
	int fd = open("/sys/devices/system/cpu/possible",
			O_RDONLY | O_NOCTTY | O_CLOEXEC);
	if (fd < 0)
		goto failure;

	char buffer[128];	/* Some reasonable size. */
	ssize_t ss = read(fd, buffer, sizeof buffer);
	/*
	 * Short reads, EINTR, etc. can't happen with /sys files.
	 * We could in theory preserve the read's errno in case the close
	 * overwrites it, but that seems excessive.
	 */
	if (close(fd) < 0 || ss < 0)
		goto failure;
	if (ss < 2 || buffer[ss-1] != '\n')
		goto parse_error;
	/*
	 * The possible CPUs are given as a comma-separated list of ranges,
	 * with each range being either a single decimal integer ("1") or
	 * a hyphen-separated range ("1-10").
	 *
	 * Q: How careful do we want the parsing to be?  For now, assume
	 * the kernel can be trusted not to give us "1-2-3" or ",," or
	 * other strangeness.
	 */
	char const *p = buffer;
	unsigned prev_n = -1u;
	unsigned n = 0;
	unsigned total = 0;
	for (;;) {
		char c = *p++;
		switch (c) {
		  case '0': case '1': case '2': case '3': case '4':
		  case '5': case '6': case '7': case '8': case '9':
			if (n > (-1u - 9)/10)
				goto parse_error;
			n = 10 * n + c - '0';
			break;
		  case '-':
			prev_n = n;
			n = 0;
			break;
		  case ',': case '\n':
			if (n > prev_n)
				total += n - prev_n;
			total++;
			if (c == '\n')
				return (long)total;	/* Success! */
			prev_n = -1u;
			n = 0;
			break;
		  default:
			goto parse_error;
		}
	}

parse_error:
	errno = ENOTSUP;	/* What's a good error to return? */
failure:
	return -1;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-10  5:31 [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support jason
@ 2021-07-10 17:11 ` Rich Felker
  2021-07-10 17:29 ` Samuel Holland
  1 sibling, 0 replies; 12+ messages in thread
From: Rich Felker @ 2021-07-10 17:11 UTC (permalink / raw)
  To: musl

On Sat, Jul 10, 2021 at 07:31:57AM +0200, jason wrote:
> Here's a simpler bit of code to do the same thing.  The differences:
> - Doesn't bother using stdio.  /proc and /sys reads like this don't sleep
>   and so don't return short or EINTR or any of that hair.

Non-use of stdio may be better but also requires care. The code as
you've done it is not safe against cancellation (which it needs to
block).

> - Check for the terminating newline, and use it in the parsing loop
>   to trigger completing a range, just like ','.
> 
>   The previous code never actually ensured that the buffer was '\0'
>   terminated. :-(
> 
> - Use one "current number being parsed" variable rather than two.
>   Just copy n to prev_n on finding a '-'.  (Feel free to bikeshed the
>   variable names to "range_start" or something if you prefer.)
> 
> If you do want to use a FILE *, musl should probably have an internal
> API for reading into the stdio buffer and parsing straight from there,
> avoiding the separate stack buffer.  (Or even allocating the FILE on
> the stack and avoiding the malloc.)

It does. __fopen_rb_ca. This is probably the preferred way to do it.
You can then just consume bytes with getc_unlocked (which expands as a
macro within libc).

> This is probably far from the last small config file (/proc or /sys or
> perhaps something like /etc/hostname) that someone will want to access.
> 
> (You could even get ambitious and make it reallocate the buffer as
> required to get the whole file into memory

That's not a good thing.

> Probably something other
> code would be interested in, like timezone parsing.)

Time zones are mmapped and used in-place.

> #include <unistd.h>
> #include <fcntl.h>
> #include <errno.h>
> 
> long get_nrprocessors_conf1(void)
> {
> 	int fd = open("/sys/devices/system/cpu/possible",
> 			O_RDONLY | O_NOCTTY | O_CLOEXEC);
> 	if (fd < 0)
> 		goto failure;
> 
> 	char buffer[128];	/* Some reasonable size. */
> 	ssize_t ss = read(fd, buffer, sizeof buffer);
> 	/*
> 	 * Short reads, EINTR, etc. can't happen with /sys files.
> 	 * We could in theory preserve the read's errno in case the close
> 	 * overwrites it, but that seems excessive.
> 	 */
> 	if (close(fd) < 0 || ss < 0)
> 		goto failure;
> 	if (ss < 2 || buffer[ss-1] != '\n')
> 		goto parse_error;
> 	/*
> 	 * The possible CPUs are given as a comma-separated list of ranges,
> 	 * with each range being either a single decimal integer ("1") or
> 	 * a hyphen-separated range ("1-10").
> 	 *
> 	 * Q: How careful do we want the parsing to be?  For now, assume
> 	 * the kernel can be trusted not to give us "1-2-3" or ",," or
> 	 * other strangeness.
> 	 */
> 	char const *p = buffer;
> 	unsigned prev_n = -1u;
> 	unsigned n = 0;
> 	unsigned total = 0;
> 	for (;;) {
> 		char c = *p++;
> 		switch (c) {
> 		  case '0': case '1': case '2': case '3': case '4':
> 		  case '5': case '6': case '7': case '8': case '9':
> 			if (n > (-1u - 9)/10)
> 				goto parse_error;
> 			n = 10 * n + c - '0';
> 			break;
> 		  case '-':
> 			prev_n = n;
> 			n = 0;
> 			break;
> 		  case ',': case '\n':
> 			if (n > prev_n)
> 				total += n - prev_n;
> 			total++;
> 			if (c == '\n')
> 				return (long)total;	/* Success! */
> 			prev_n = -1u;
> 			n = 0;
> 			break;
> 		  default:
> 			goto parse_error;
> 		}
> 	}
> 
> parse_error:
> 	errno = ENOTSUP;	/* What's a good error to return? */

There is no need to account for data errors at all. If you can't trust
the kernel not to give you something invalid, you're running in a
compromised context where you can't do anything.

If this is really the right interface to get this data from, I would
hope there's some simpler way to do it, simplifying it down to assume
the data is well-formed (don't invoke UB if it's not, but returning an
utterly bogus value if it's not would be fine).

A couple minor style nits: no spaces for indentation or extra
indentation of case labels, and no need to cast in any place where
there's an implicit conversion that does the same (unsigned to long).

Rich

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-10  5:31 [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support jason
  2021-07-10 17:11 ` Rich Felker
@ 2021-07-10 17:29 ` Samuel Holland
  2021-07-10 19:44   ` Rich Felker
  1 sibling, 1 reply; 12+ messages in thread
From: Samuel Holland @ 2021-07-10 17:29 UTC (permalink / raw)
  To: musl; +Cc: jason, vincent.donnefort

Hi all,

While this topic is being discussed, I'd like to bring up another possible
method for consideration. It turns out that some files in procfs have contents
depending on the number of {possible,online} CPUs; we can parse them to get the
count we need.

The benefits are:
 - procfs is more likely to be mounted than sysfs, and it is already
   documented as being required by musl.
 - Some of the procfs interfaces have been around longer than the
   sysfs interface.
 - The parsing logic is somewhat simpler than for the files in
   /sys/devices/system/cpu, because we are just counting the number
   of occurrences of some string.

The downsides are:
 - It depends on the stability of the procfs file formats.
 - The kernel uses more CPU to generate the contents of these files.

I looked through the procfs code for uses of for_each_possible_cpu and
for_each_online_cpu, and the best candidates I found are:
 - /proc/softirqs for _SC_NPROCESSORS_CONF. It was added in 2009
   (commit d3d64df21d3d, v2.6.31). Its first line contains a column
   for_each_possible_cpu.
 - /proc/stat for _SC_NPROCESSORS_ONLN. It has been around since
   before git. It contains a line for_each_online_cpu.

Here's an example implementation:

diff --git a/src/conf/sysconf.c b/src/conf/sysconf.c
index 3baaed32..8f787b81 100644
--- a/src/conf/sysconf.c
+++ b/src/conf/sysconf.c
@@ -193,10 +193,27 @@ long sysconf(int name)
 		return SEM_VALUE_MAX;
 	case JT_DELAYTIMER_MAX & 255:
 		return DELAYTIMER_MAX;
-	case JT_NPROCESSORS_CONF & 255:
-	case JT_NPROCESSORS_ONLN & 255: ;
+	case JT_NPROCESSORS_CONF & 255: ;
+		FILE *f = fopen("/proc/softirqs", "rbe");
+		int i, cnt = 0;
+		if (f) {
+			while (fscanf(f, " CPU%u", &i) > 0) ++cnt;
+			fclose(f);
+		}
+		if (cnt)
+			return cnt;
+		/* fallthrough */
+	case JT_NPROCESSORS_ONLN & 255:
+		f = fopen("/proc/stat", "rbe");
+		cnt = 0;
+		if (f) {
+			fscanf(f, "cpu %*[^c]");
+			while (fscanf(f, "cpu%u %*[^c]", &i) > 0) ++cnt;
+			fclose(f);
+		}
+		if (cnt)
+			return cnt;
 		unsigned char set[128] = {1};
-		int i, cnt;
 		__syscall(SYS_sched_getaffinity, 0, sizeof set, set);
 		for (i=cnt=0; i<sizeof set; i++)
 			for (; set[i]; set[i]&=set[i]-1, cnt++);

(Apologies if this gets mangled by my MUA,)

Regards,
Samuel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-10 17:29 ` Samuel Holland
@ 2021-07-10 19:44   ` Rich Felker
  2021-07-12 13:08     ` Vincent Donnefort
  0 siblings, 1 reply; 12+ messages in thread
From: Rich Felker @ 2021-07-10 19:44 UTC (permalink / raw)
  To: Samuel Holland; +Cc: musl, jason, vincent.donnefort

On Sat, Jul 10, 2021 at 12:29:34PM -0500, Samuel Holland wrote:
> Hi all,
> 
> While this topic is being discussed, I'd like to bring up another possible
> method for consideration. It turns out that some files in procfs have contents
> depending on the number of {possible,online} CPUs; we can parse them to get the
> count we need.
> 
> The benefits are:
>  - procfs is more likely to be mounted than sysfs, and it is already
>    documented as being required by musl.
>  - Some of the procfs interfaces have been around longer than the
>    sysfs interface.
>  - The parsing logic is somewhat simpler than for the files in
>    /sys/devices/system/cpu, because we are just counting the number
>    of occurrences of some string.
> 
> The downsides are:
>  - It depends on the stability of the procfs file formats.
>  - The kernel uses more CPU to generate the contents of these files.

My understanding is that the procfs files have stronger stability
mandate than sysfs if anything (originally, only procfs was stable and
sysfs was not). So I think it's perfectly acceptable to use and prefer
procfs. And indeed it's more likely to be mounted; some
container/sandbox setups I use bind-mount (or mount a new) procfs but
do not expose any sysfs.

> I looked through the procfs code for uses of for_each_possible_cpu and
> for_each_online_cpu, and the best candidates I found are:
>  - /proc/softirqs for _SC_NPROCESSORS_CONF. It was added in 2009
>    (commit d3d64df21d3d, v2.6.31). Its first line contains a column
>    for_each_possible_cpu.

This should be okay; nonstandard functionality like this does not need
to work back to ancient kernels. Maybe there's a way we could use the
kernel cpuset size (is that observable?) as a fallback -- or, if
kernels that old didn't have variable cpuset size, just default to
whatever the fixed size (1024?) was back then.

>  - /proc/stat for _SC_NPROCESSORS_ONLN. It has been around since
>    before git. It contains a line for_each_online_cpu.
> 
> Here's an example implementation:
> 
> diff --git a/src/conf/sysconf.c b/src/conf/sysconf.c
> index 3baaed32..8f787b81 100644
> --- a/src/conf/sysconf.c
> +++ b/src/conf/sysconf.c
> @@ -193,10 +193,27 @@ long sysconf(int name)
>  		return SEM_VALUE_MAX;
>  	case JT_DELAYTIMER_MAX & 255:
>  		return DELAYTIMER_MAX;
> -	case JT_NPROCESSORS_CONF & 255:
> -	case JT_NPROCESSORS_ONLN & 255: ;
> +	case JT_NPROCESSORS_CONF & 255: ;
> +		FILE *f = fopen("/proc/softirqs", "rbe");
> +		int i, cnt = 0;
> +		if (f) {
> +			while (fscanf(f, " CPU%u", &i) > 0) ++cnt;
> +			fclose(f);
> +		}
> +		if (cnt)
> +			return cnt;
> +		/* fallthrough */

This is nice and simple, but the fscanf machinery is rather large and
undesirable to pull in as a linking consequence of sysconf. Likewise,
it should not be pulling in malloc/free. __fopen_rb_ca could be used
if we want to use stdio though, along with a trivial state machine to
count occurrances of "CPU" in first line.

> +	case JT_NPROCESSORS_ONLN & 255:
> +		f = fopen("/proc/stat", "rbe");
> +		cnt = 0;
> +		if (f) {
> +			fscanf(f, "cpu %*[^c]");
> +			while (fscanf(f, "cpu%u %*[^c]", &i) > 0) ++cnt;
> +			fclose(f);
> +		}
> +		if (cnt)
> +			return cnt;
>  		unsigned char set[128] = {1};
> -		int i, cnt;
>  		__syscall(SYS_sched_getaffinity, 0, sizeof set, set);
>  		for (i=cnt=0; i<sizeof set; i++)
>  			for (; set[i]; set[i]&=set[i]-1, cnt++);

Same principles here. fgets is a lot more suitable than fscanf here
anyway.

Rich

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-10 19:44   ` Rich Felker
@ 2021-07-12 13:08     ` Vincent Donnefort
  2021-07-12 15:41       ` Rich Felker
  0 siblings, 1 reply; 12+ messages in thread
From: Vincent Donnefort @ 2021-07-12 13:08 UTC (permalink / raw)
  To: Rich Felker; +Cc: Samuel Holland, musl, jason

On Sat, Jul 10, 2021 at 03:44:50PM -0400, Rich Felker wrote:
> On Sat, Jul 10, 2021 at 12:29:34PM -0500, Samuel Holland wrote:
> > Hi all,
> > 
> > While this topic is being discussed, I'd like to bring up another possible
> > method for consideration. It turns out that some files in procfs have contents
> > depending on the number of {possible,online} CPUs; we can parse them to get the
> > count we need.
> > 
> > The benefits are:
> >  - procfs is more likely to be mounted than sysfs, and it is already
> >    documented as being required by musl.
> >  - Some of the procfs interfaces have been around longer than the
> >    sysfs interface.
> >  - The parsing logic is somewhat simpler than for the files in
> >    /sys/devices/system/cpu, because we are just counting the number
> >    of occurrences of some string.
> > 
> > The downsides are:
> >  - It depends on the stability of the procfs file formats.
> >  - The kernel uses more CPU to generate the contents of these files.
> 
> My understanding is that the procfs files have stronger stability
> mandate than sysfs if anything (originally, only procfs was stable and
> sysfs was not). So I think it's perfectly acceptable to use and prefer
> procfs. And indeed it's more likely to be mounted; some
> container/sandbox setups I use bind-mount (or mount a new) procfs but
> do not expose any sysfs.

Only to cover the case where a user needs _CONF and doesn't have the sysfs
mounted, which is very unlikely, we are losing a lot. The masks, found in the
CPU subsystem are well documented, widely available, describe precisely the CPU
topology (which is what we want) and are "easy" to decode without fscanf.
Moreover, in the case of a non-available sysfs, the fallback to sched_getaffinity
would help and at least align _CONF with _ONL (which is the current behavior).

Reading /proc/stat and /proc/softirq looks like a hack. They just happen to
align on the _ONL/_CONF and doesn't produce a machine-readable output. While
the CPU sysfs subsys is really meant to describe CPUs.

-- 
Vincent

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 13:08     ` Vincent Donnefort
@ 2021-07-12 15:41       ` Rich Felker
  2021-07-12 16:32         ` Vincent Donnefort
  0 siblings, 1 reply; 12+ messages in thread
From: Rich Felker @ 2021-07-12 15:41 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: Samuel Holland, musl, jason

On Mon, Jul 12, 2021 at 02:08:42PM +0100, Vincent Donnefort wrote:
> On Sat, Jul 10, 2021 at 03:44:50PM -0400, Rich Felker wrote:
> > On Sat, Jul 10, 2021 at 12:29:34PM -0500, Samuel Holland wrote:
> > > Hi all,
> > > 
> > > While this topic is being discussed, I'd like to bring up another possible
> > > method for consideration. It turns out that some files in procfs have contents
> > > depending on the number of {possible,online} CPUs; we can parse them to get the
> > > count we need.
> > > 
> > > The benefits are:
> > >  - procfs is more likely to be mounted than sysfs, and it is already
> > >    documented as being required by musl.
> > >  - Some of the procfs interfaces have been around longer than the
> > >    sysfs interface.
> > >  - The parsing logic is somewhat simpler than for the files in
> > >    /sys/devices/system/cpu, because we are just counting the number
> > >    of occurrences of some string.
> > > 
> > > The downsides are:
> > >  - It depends on the stability of the procfs file formats.
> > >  - The kernel uses more CPU to generate the contents of these files.
> > 
> > My understanding is that the procfs files have stronger stability
> > mandate than sysfs if anything (originally, only procfs was stable and
> > sysfs was not). So I think it's perfectly acceptable to use and prefer
> > procfs. And indeed it's more likely to be mounted; some
> > container/sandbox setups I use bind-mount (or mount a new) procfs but
> > do not expose any sysfs.
> 
> Only to cover the case where a user needs _CONF and doesn't have the sysfs
> mounted, which is very unlikely, we are losing a lot.

Can you quantify what we're "losing"? As I understood the proposal,
nothing at all is lost.

> The masks, found in the
> CPU subsystem are well documented, widely available, describe precisely the CPU
> topology (which is what we want) and are "easy" to decode without fscanf.

fscanf is not available here. The same can be done without it, if
needed, though.

> Moreover, in the case of a non-available sysfs, the fallback to sched_getaffinity
> would help and at least align _CONF with _ONL (which is the current behavior).
> 
> Reading /proc/stat and /proc/softirq looks like a hack. They just happen to
> align on the _ONL/_CONF and doesn't produce a machine-readable output. While
> the CPU sysfs subsys is really meant to describe CPUs.

If they're mandated stable interfaces that "happen" to have the exact
data we need, I don't see what the objection to using them is. A
better question would be whether access to them is restricted in any
hardening profiles. At least they don't seem to be hidden by the
hidepid mount option.

Rich

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 15:41       ` Rich Felker
@ 2021-07-12 16:32         ` Vincent Donnefort
  2021-07-12 17:14           ` Rich Felker
  2021-08-07 16:29           ` Leah Neukirchen
  0 siblings, 2 replies; 12+ messages in thread
From: Vincent Donnefort @ 2021-07-12 16:32 UTC (permalink / raw)
  To: Rich Felker; +Cc: Samuel Holland, musl, jason

On Mon, Jul 12, 2021 at 11:41:35AM -0400, Rich Felker wrote:
> On Mon, Jul 12, 2021 at 02:08:42PM +0100, Vincent Donnefort wrote:
> > On Sat, Jul 10, 2021 at 03:44:50PM -0400, Rich Felker wrote:
> > > On Sat, Jul 10, 2021 at 12:29:34PM -0500, Samuel Holland wrote:
> > > > Hi all,
> > > > 
> > > > While this topic is being discussed, I'd like to bring up another possible
> > > > method for consideration. It turns out that some files in procfs have contents
> > > > depending on the number of {possible,online} CPUs; we can parse them to get the
> > > > count we need.
> > > > 
> > > > The benefits are:
> > > >  - procfs is more likely to be mounted than sysfs, and it is already
> > > >    documented as being required by musl.
> > > >  - Some of the procfs interfaces have been around longer than the
> > > >    sysfs interface.
> > > >  - The parsing logic is somewhat simpler than for the files in
> > > >    /sys/devices/system/cpu, because we are just counting the number
> > > >    of occurrences of some string.
> > > > 
> > > > The downsides are:
> > > >  - It depends on the stability of the procfs file formats.
> > > >  - The kernel uses more CPU to generate the contents of these files.
> > > 
> > > My understanding is that the procfs files have stronger stability
> > > mandate than sysfs if anything (originally, only procfs was stable and
> > > sysfs was not). So I think it's perfectly acceptable to use and prefer
> > > procfs. And indeed it's more likely to be mounted; some
> > > container/sandbox setups I use bind-mount (or mount a new) procfs but
> > > do not expose any sysfs.
> > 
> > Only to cover the case where a user needs _CONF and doesn't have the sysfs
> > mounted, which is very unlikely, we are losing a lot.
> 
> Can you quantify what we're "losing"? As I understood the proposal,
> nothing at all is lost.

Sorry, I wasn't clear. No functional differences. What we're losing is a
simple interface: a CPU mask.

> 
> > The masks, found in the
> > CPU subsystem are well documented, widely available, describe precisely the CPU
> > topology (which is what we want) and are "easy" to decode without fscanf.
> 
> fscanf is not available here. The same can be done without it, if
> needed, though.
> 
> > Moreover, in the case of a non-available sysfs, the fallback to sched_getaffinity
> > would help and at least align _CONF with _ONL (which is the current behavior).
> > 
> > Reading /proc/stat and /proc/softirq looks like a hack. They just happen to
> > align on the _ONL/_CONF and doesn't produce a machine-readable output. While
> > the CPU sysfs subsys is really meant to describe CPUs.
> 
> If they're mandated stable interfaces that "happen" to have the exact
> data we need, I don't see what the objection to using them is. A
> better question would be whether access to them is restricted in any
> hardening profiles. At least they don't seem to be hidden by the
> hidepid mount option.
> 
> Rich

Indeed the function hasn't changed for 10y, which is I guess somewhat stable
and it is currently walking through all the possible CPUs (which is what we want
to count). Also, this file is currently always present whenever procfs is
mounted.

Nonetheless, as this is human-readable, nothing mandates the format. And as an
example, on my desktop machine, with 448 allocated CPUS, the first line of
/proc/softirqs (the line that contains all the CPUs) is 4949 long. The
"possible" mask describes same information with only 6 characters.

-- 
Vincent.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 16:32         ` Vincent Donnefort
@ 2021-07-12 17:14           ` Rich Felker
  2021-07-12 18:12             ` Vincent Donnefort
  2021-08-07 14:52             ` James Y Knight
  2021-08-07 16:29           ` Leah Neukirchen
  1 sibling, 2 replies; 12+ messages in thread
From: Rich Felker @ 2021-07-12 17:14 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: Samuel Holland, musl, jason

On Mon, Jul 12, 2021 at 05:32:46PM +0100, Vincent Donnefort wrote:
> On Mon, Jul 12, 2021 at 11:41:35AM -0400, Rich Felker wrote:
> > On Mon, Jul 12, 2021 at 02:08:42PM +0100, Vincent Donnefort wrote:
> > > On Sat, Jul 10, 2021 at 03:44:50PM -0400, Rich Felker wrote:
> > > > On Sat, Jul 10, 2021 at 12:29:34PM -0500, Samuel Holland wrote:
> > > > > Hi all,
> > > > > 
> > > > > While this topic is being discussed, I'd like to bring up another possible
> > > > > method for consideration. It turns out that some files in procfs have contents
> > > > > depending on the number of {possible,online} CPUs; we can parse them to get the
> > > > > count we need.
> > > > > 
> > > > > The benefits are:
> > > > >  - procfs is more likely to be mounted than sysfs, and it is already
> > > > >    documented as being required by musl.
> > > > >  - Some of the procfs interfaces have been around longer than the
> > > > >    sysfs interface.
> > > > >  - The parsing logic is somewhat simpler than for the files in
> > > > >    /sys/devices/system/cpu, because we are just counting the number
> > > > >    of occurrences of some string.
> > > > > 
> > > > > The downsides are:
> > > > >  - It depends on the stability of the procfs file formats.
> > > > >  - The kernel uses more CPU to generate the contents of these files.
> > > > 
> > > > My understanding is that the procfs files have stronger stability
> > > > mandate than sysfs if anything (originally, only procfs was stable and
> > > > sysfs was not). So I think it's perfectly acceptable to use and prefer
> > > > procfs. And indeed it's more likely to be mounted; some
> > > > container/sandbox setups I use bind-mount (or mount a new) procfs but
> > > > do not expose any sysfs.
> > > 
> > > Only to cover the case where a user needs _CONF and doesn't have the sysfs
> > > mounted, which is very unlikely, we are losing a lot.
> > 
> > Can you quantify what we're "losing"? As I understood the proposal,
> > nothing at all is lost.
> 
> Sorry, I wasn't clear. No functional differences. What we're losing is a
> simple interface: a CPU mask.
> 
> > 
> > > The masks, found in the
> > > CPU subsystem are well documented, widely available, describe precisely the CPU
> > > topology (which is what we want) and are "easy" to decode without fscanf.
> > 
> > fscanf is not available here. The same can be done without it, if
> > needed, though.
> > 
> > > Moreover, in the case of a non-available sysfs, the fallback to sched_getaffinity
> > > would help and at least align _CONF with _ONL (which is the current behavior).
> > > 
> > > Reading /proc/stat and /proc/softirq looks like a hack. They just happen to
> > > align on the _ONL/_CONF and doesn't produce a machine-readable output. While
> > > the CPU sysfs subsys is really meant to describe CPUs.
> > 
> > If they're mandated stable interfaces that "happen" to have the exact
> > data we need, I don't see what the objection to using them is. A
> > better question would be whether access to them is restricted in any
> > hardening profiles. At least they don't seem to be hidden by the
> > hidepid mount option.
> 
> Indeed the function hasn't changed for 10y, which is I guess somewhat stable
> and it is currently walking through all the possible CPUs (which is what we want
> to count). Also, this file is currently always present whenever procfs is
> mounted.
> 
> Nonetheless, as this is human-readable, nothing mandates the format. And as an
> example, on my desktop machine, with 448 allocated CPUS, the first line of
> /proc/softirqs (the line that contains all the CPUs) is 4949 long. The
> "possible" mask describes same information with only 6 characters.

It's not just a matter of age and de facto stability. Kernel policy is
that the procfs files are stable interfaces. Despite being "human
readable", they're also intended to be readable by, and actually read
by, utilities such as the procps suite of tools, "top"-type programs,
etc.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 17:14           ` Rich Felker
@ 2021-07-12 18:12             ` Vincent Donnefort
  2021-08-07 14:52             ` James Y Knight
  1 sibling, 0 replies; 12+ messages in thread
From: Vincent Donnefort @ 2021-07-12 18:12 UTC (permalink / raw)
  To: Rich Felker; +Cc: Samuel Holland, musl, jason

[...]

> > > If they're mandated stable interfaces that "happen" to have the exact
> > > data we need, I don't see what the objection to using them is. A
> > > better question would be whether access to them is restricted in any
> > > hardening profiles. At least they don't seem to be hidden by the
> > > hidepid mount option.
> > 
> > Indeed the function hasn't changed for 10y, which is I guess somewhat stable
> > and it is currently walking through all the possible CPUs (which is what we want
> > to count). Also, this file is currently always present whenever procfs is
> > mounted.
> > 
> > Nonetheless, as this is human-readable, nothing mandates the format. And as an
> > example, on my desktop machine, with 448 allocated CPUS, the first line of
> > /proc/softirqs (the line that contains all the CPUs) is 4949 long. The
> > "possible" mask describes same information with only 6 characters.
> 
> It's not just a matter of age and de facto stability. Kernel policy is
> that the procfs files are stable interfaces. Despite being "human
> readable", they're also intended to be readable by, and actually read
> by, utilities such as the procps suite of tools, "top"-type programs,
> etc.
>

What I wanted to emphasis is the cost of reading that interface vs the CPU mask
in the sysfs. The only gain is for a system with a _CONF user, where some CPUs
have been hotunplug'ed, and where the sysfs isn't mounted while the procfs is.
This can surely happen, but compared with the cost of parsing a long string
(see the ~x1000 in the example above) it doesn't seem a good trade-off to me.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 17:14           ` Rich Felker
  2021-07-12 18:12             ` Vincent Donnefort
@ 2021-08-07 14:52             ` James Y Knight
  1 sibling, 0 replies; 12+ messages in thread
From: James Y Knight @ 2021-08-07 14:52 UTC (permalink / raw)
  To: musl; +Cc: Vincent Donnefort, Samuel Holland, jason

[-- Attachment #1: Type: text/plain, Size: 1898 bytes --]

On Mon, Jul 12, 2021 at 1:15 PM Rich Felker <dalias@libc.org> wrote:

> It's not just a matter of age and de facto stability. Kernel policy is
> that the procfs files are stable interfaces. Despite being "human
> readable", they're also intended to be readable by, and actually read
> by, utilities such as the procps suite of tools, "top"-type programs,
> etc.
>

I agree the format isn't going to change.  However, I'd be extremely wary
of using the emergent property that /proc/softirqs contains an entry for
every "possible" CPU on the system. Yes, linux has a no regressions policy,
but that only applies if someone notices.

So, when someone's making some future refactoring in the kernel, maybe
that happens to modify /proc/softirqs to only emit data for online CPUs.
The /proc/interrupts file already reports only online-cpus, and it seems a
bit of a weird inconsistency that softirqs iterates the possible-cpus set.
Since this file is a _very_ strange source from which to determine the set
of possible CPU ids, I wouldn't be surprised if code review did not notice
that the change would cause a userspace regression. So it probably makes it
into a stable kernel release.

And, sure, eventually someone may indeed notice that the change subtly
broke musl's sysconf implementation, and report that. At which point the
change would definitely be reverted. (And a series of annoyed "WTF did you
people use _that_ to figure out the possible CPUs??? Userspace developers
are crazy." emails would ensue). But, that's just a lot of pain to go
through, for, it seems to me, not a whole lot of benefit.

I think it'll just be better to use the kernel interface which was actually
designed to report this data. And the proposed patch falls back to using
sched_getaffinity if sysfs doesn't exist, which -- while incorrect -- is
likely to suffice in practice for those users that fail to mount sysfs.

[-- Attachment #2: Type: text/html, Size: 2370 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
  2021-07-12 16:32         ` Vincent Donnefort
  2021-07-12 17:14           ` Rich Felker
@ 2021-08-07 16:29           ` Leah Neukirchen
  1 sibling, 0 replies; 12+ messages in thread
From: Leah Neukirchen @ 2021-08-07 16:29 UTC (permalink / raw)
  To: Vincent Donnefort; +Cc: Rich Felker, musl, Samuel Holland, jason

Vincent Donnefort <vincent.donnefort@arm.com> writes:

>> > Moreover, in the case of a non-available sysfs, the fallback to sched_getaffinity
>> > would help and at least align _CONF with _ONL (which is the current behavior).
>> > 
>> > Reading /proc/stat and /proc/softirq looks like a hack. They just happen to
>> > align on the _ONL/_CONF and doesn't produce a machine-readable output. While
>> > the CPU sysfs subsys is really meant to describe CPUs.
>> 
>> If they're mandated stable interfaces that "happen" to have the exact
>> data we need, I don't see what the objection to using them is. A
>> better question would be whether access to them is restricted in any
>> hardening profiles. At least they don't seem to be hidden by the
>> hidepid mount option.
>> 
> Indeed the function hasn't changed for 10y, which is I guess somewhat stable
> and it is currently walking through all the possible CPUs (which is what we want
> to count). Also, this file is currently always present whenever procfs is
> mounted.
>
> Nonetheless, as this is human-readable, nothing mandates the format. And as an
> example, on my desktop machine, with 448 allocated CPUS, the first line of
> /proc/softirqs (the line that contains all the CPUs) is 4949 long. The
> "possible" mask describes same information with only 6 characters.

For the record, on my single-cpu mainboard with a "AMD Ryzen 7 3700X
8-Core Processor", there are 16 processor entries in /proc/cpuinfo,
but /proc/softirq goes to CPU31.  /sys/devices/system/cpu has entries
up to cpu15, and glibc getconf _NPROCESSORS_CONF also says 16.

-- 
Leah Neukirchen  <leah@vuxu.org>  https://leahneukirchen.org/

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support
@ 2021-07-06  9:55 Vincent Donnefort
  0 siblings, 0 replies; 12+ messages in thread
From: Vincent Donnefort @ 2021-07-06  9:55 UTC (permalink / raw)
  To: musl; +Cc: jyknight, Vincent Donnefort

Currently, _SC_NPROCESSORS_CONF is always equal to _SC_NPROCESSORS_ONLN.
However, it is expected from the first one to give the total number of
CPUs that the system can contain, while the later must return only the
number of CPUs which are currently online. This distinction is important
for a software such as trace-cmd. Trace-cmd is a front-end for the kernel
tracing tool ftrace. When recording traces, trace-cmd needs to get the
otal number of CPUs available in the system (_SC_NPROCESSORS_CONF) and not
only the online ones otherwise if a CPU goes offline some data might be
missing.

Hence, add a specific method to get _SC_NPROCESSORS_CONF, based on the
sysfs file /sys/devices/system/cpu/possible. This static file will not
change during the lifespan of the Linux kernel and contains a CPU mask
with all the CPUs that can be brought online.

diff --git a/src/conf/sysconf.c b/src/conf/sysconf.c
index 3baaed32..b4b842f5 100644
--- a/src/conf/sysconf.c
+++ b/src/conf/sysconf.c
@@ -1,8 +1,10 @@
+#include <ctype.h>
 #include <unistd.h>
 #include <limits.h>
 #include <errno.h>
 #include <sys/resource.h>
 #include <signal.h>
+#include <stdbool.h>
 #include <sys/sysinfo.h>
 #include "syscall.h"
 #include "libc.h"
@@ -22,6 +24,73 @@
 
 #define RLIM(x) (-32768|(RLIMIT_ ## x))
 
+#define POSSIBLE_MAX_READ 128
+#define char_to_int(prev, num) (10 * prev + (num - '0'))
+
+static inline int get_nrprocessors_conf(void)
+{
+	FILE *f;
+	char buffer[POSSIBLE_MAX_READ];
+	int start_chunk = 0, end_chunk = -1;
+	unsigned int cnt = 0, i = 0;
+	size_t ret;
+
+	f = fopen("/sys/devices/system/cpu/possible", "r");
+	if (!f)
+		return 0;
+
+	ret = fread(buffer, sizeof(*buffer),
+		    sizeof(buffer) / sizeof(*buffer), f);
+	if (!feof(f) || ferror(f) || ret < 2)
+		goto end;
+
+	/*
+	 * Count the number of CPUs in the CPU mask. A CPU Mask is described by
+	 * chunks. Chunks have the following format: "<start>-<end>" and are
+	 * separated by ",". A chunk can be composed of a single CPU.
+	 *
+	 * e.g. "0-1,4" -> 3 CPUs
+	 */
+        while (i < POSSIBLE_MAX_READ) {
+                if (buffer[i] == ',' || buffer[i] == '\0') {
+			if (end_chunk > -1)
+				cnt += (end_chunk - start_chunk) + 1;
+			else
+				cnt++;
+                        start_chunk = 0;
+			end_chunk = -1;
+			if (buffer[i] == '\0')
+				break;
+                } else if (buffer[i] == '-') {
+			end_chunk = 0;
+                } else if (isdigit(buffer[i])) {
+                        if (end_chunk == -1)
+                                start_chunk =
+					char_to_int(start_chunk, buffer[i]);
+                        else
+                                end_chunk = char_to_int(end_chunk, buffer[i]);
+                }
+                i++;
+        }
+
+end:
+	fclose(f);
+
+	return cnt;
+}
+
+static inline int get_nrprocessors_onln(void)
+{
+	unsigned char set[128] = {1};
+	int i, cnt;
+
+	__syscall(SYS_sched_getaffinity, 0, sizeof set, set);
+	for (i=cnt=0; i<sizeof set; i++)
+		for (; set[i]; set[i]&=set[i]-1, cnt++);
+
+	return cnt;
+}
+
 long sysconf(int name)
 {
 	static const short values[] = {
@@ -193,14 +262,13 @@ long sysconf(int name)
 		return SEM_VALUE_MAX;
 	case JT_DELAYTIMER_MAX & 255:
 		return DELAYTIMER_MAX;
-	case JT_NPROCESSORS_CONF & 255:
+	case JT_NPROCESSORS_CONF & 255: ;
+		int cnt = get_nrprocessors_conf();
+		if (cnt > 0)
+			return cnt;
+		return get_nrprocessors_onln();
 	case JT_NPROCESSORS_ONLN & 255: ;
-		unsigned char set[128] = {1};
-		int i, cnt;
-		__syscall(SYS_sched_getaffinity, 0, sizeof set, set);
-		for (i=cnt=0; i<sizeof set; i++)
-			for (; set[i]; set[i]&=set[i]-1, cnt++);
-		return cnt;
+		return get_nrprocessors_onln();
 	case JT_PHYS_PAGES & 255:
 	case JT_AVPHYS_PAGES & 255: ;
 		unsigned long long mem;
-- 
2.27.0


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-08-07 16:31 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-10  5:31 [musl] [PATCH v3] sysconf: add _SC_NPROCESSORS_CONF support jason
2021-07-10 17:11 ` Rich Felker
2021-07-10 17:29 ` Samuel Holland
2021-07-10 19:44   ` Rich Felker
2021-07-12 13:08     ` Vincent Donnefort
2021-07-12 15:41       ` Rich Felker
2021-07-12 16:32         ` Vincent Donnefort
2021-07-12 17:14           ` Rich Felker
2021-07-12 18:12             ` Vincent Donnefort
2021-08-07 14:52             ` James Y Knight
2021-08-07 16:29           ` Leah Neukirchen
  -- strict thread matches above, loose matches on Subject: below --
2021-07-06  9:55 Vincent Donnefort

Code repositories for project(s) associated with this public inbox

	https://git.vuxu.org/mirror/musl/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).