cpumask: Add optimized helpers when NR_CPUS fits in a long

When NR_CPUS fits in a long, it's possible to use compiler built-ins to
produce much faster code when operating on cpumasks compared to just using
the generic bitops APIs.

Therefore, add optimized helpers using compiler built-ins when NR_CPUS fits
in a long. This also turns nr_cpu_ids into a compile-time constant for
further optimization potential.

Note that compared to the upstream cpumask rewrite with this feature, these
optimized helpers perfectly preserve the semantics of the helpers they
replace. And this change is much smaller than the upstream version.

Change-Id: I1ac6058a19bd3b22a491176eef9d661cca78e521
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: Richard Raya <rdxzv.dev@gmail.com>
This commit is contained in:
Sultan Alsawaf 2023-07-03 17:22:38 -07:00 committed by Richard Raya
parent 2c65e6655c
commit 0fa3521bf8
3 changed files with 116 additions and 0 deletions

View File

@ -34,6 +34,8 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
/*
 * Select how nr_cpu_ids is provided:
 *  - NR_CPUS == 1:              the constant 1U
 *  - NR_CPUS <= BITS_PER_LONG:  the constant NR_CPUS, cast to unsigned int
 *  - otherwise:                 a variable declared here and defined elsewhere
 */
#if NR_CPUS == 1
#define nr_cpu_ids 1U
#elif NR_CPUS <= BITS_PER_LONG
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif
@ -189,6 +191,95 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
#define for_each_cpu_and(cpu, mask, and) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
#else
#if NR_CPUS <= BITS_PER_LONG
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
unsigned int nr;
nr = __builtin_ffsl(*cpumask_bits(srcp)) - 1;
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
unsigned long bits = *cpumask_bits(srcp);
unsigned int nr;
if (unlikely(!bits))
return nr_cpumask_bits;
nr = BITS_PER_LONG - 1 - __builtin_clzl(bits);
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
unsigned int nr, shift;
unsigned long bits;
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
shift = n + 1;
if (unlikely(shift >= nr_cpumask_bits))
return nr_cpumask_bits;
bits = *cpumask_bits(srcp);
nr = __builtin_ffsl((bits >> shift) << shift) - 1;
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
unsigned int nr, shift;
unsigned long bits;
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
shift = n + 1;
if (unlikely(shift >= nr_cpumask_bits))
return nr_cpumask_bits;
bits = ~*cpumask_bits(srcp);
nr = __builtin_ffsl((bits >> shift) << shift) - 1;
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
static inline int cpumask_next_and(int n, const struct cpumask *srcp,
const struct cpumask *andp)
{
unsigned int nr, shift;
unsigned long bits;
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
shift = n + 1;
if (unlikely(shift >= nr_cpumask_bits))
return nr_cpumask_bits;
bits = *cpumask_bits(srcp) & *cpumask_bits(andp);
nr = __builtin_ffsl((bits >> shift) << shift) - 1;
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
static inline int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
unsigned long bits = *cpumask_bits(mask);
unsigned int nr;
cpumask_check(cpu);
if (likely(cpu < nr_cpumask_bits))
bits &= ~BIT(cpu);
nr = __builtin_ffsl(bits) - 1;
return nr > nr_cpumask_bits ? nr_cpumask_bits : nr;
}
#else /* NR_CPUS > BITS_PER_LONG */
/**
* cpumask_first - get the first cpu in a cpumask
* @srcp: the cpumask pointer
@ -219,6 +310,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
#endif /* NR_CPUS <= BITS_PER_LONG */
unsigned int cpumask_local_spread(unsigned int i, int node);
/**
@ -369,7 +462,12 @@ static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
*/
static inline void cpumask_setall(struct cpumask *dstp)
{
/* bitmap_fill() isn't optimized for compile-time constants */
#if NR_CPUS <= BITS_PER_LONG
*cpumask_bits(dstp) = BIT(NR_CPUS) - 1;
#else
bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
#endif
}
/**
@ -378,7 +476,12 @@ static inline void cpumask_setall(struct cpumask *dstp)
*/
static inline void cpumask_clear(struct cpumask *dstp)
{
/* bitmap_zero() isn't optimized for compile-time constants */
#if NR_CPUS <= BITS_PER_LONG
*cpumask_bits(dstp) = 0;
#else
bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
#endif
}
/**
@ -551,7 +654,12 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
static inline void cpumask_copy(struct cpumask *dstp,
const struct cpumask *srcp)
{
/* bitmap_copy() isn't optimized for compile-time constants */
#if NR_CPUS <= BITS_PER_LONG
*cpumask_bits(dstp) = *cpumask_bits(srcp);
#else
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
#endif
}
/**

View File

@ -567,6 +567,7 @@ static int __init nosmp(char *str)
early_param("nosmp", nosmp);
#if NR_CPUS > BITS_PER_LONG
/* this is hard limit */
static int __init nrcpus(char *str)
{
@ -580,6 +581,7 @@ static int __init nrcpus(char *str)
}
early_param("nr_cpus", nrcpus);
#endif
static int __init maxcpus(char *str)
{
@ -605,14 +607,18 @@ static int __init boot_cpus(char *str)
early_param("boot_cpus", boot_cpus);
/*
 * The nr_cpu_ids variable is only defined and exported when NR_CPUS exceeds
 * BITS_PER_LONG; it starts out as NR_CPUS.
 * NOTE(review): for smaller NR_CPUS the name is presumably supplied as a
 * compile-time constant by a header - confirm.
 */
#if NR_CPUS > BITS_PER_LONG
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
#if NR_CPUS > BITS_PER_LONG
	const unsigned long *possible = cpumask_bits(cpu_possible_mask);

	/* One past the highest bit set in the possible-CPU mask. */
	nr_cpu_ids = find_last_bit(possible, NR_CPUS) + 1;
#endif
	/* Otherwise there is nothing to compute and this is a no-op. */
}
static inline bool boot_cpu(int cpu)

View File

@ -6,6 +6,7 @@
#include <linux/export.h>
#include <linux/bootmem.h>
#if NR_CPUS > BITS_PER_LONG
/**
* cpumask_next - get the next cpu in a cpumask
* @n: the cpu prior to the place to search (ie. return will be > @n)
@ -60,6 +61,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
return i;
}
EXPORT_SYMBOL(cpumask_any_but);
#endif /* NR_CPUS > BITS_PER_LONG */
/**
* cpumask_next_wrap - helper to implement for_each_cpu_wrap