
[userspace,v2] libsepol: cache ebitmap cardinality value

Message ID: 20200213133959.14217-1-omosnace@redhat.com (mailing list archive)
State: Accepted
Series: [userspace,v2] libsepol: cache ebitmap cardinality value

Commit Message

Ondrej Mosnacek Feb. 13, 2020, 1:39 p.m. UTC
According to profiling of semodule -BN, ebitmap_cardinality() is called
quite often and contributes a lot to the total runtime. Cache its result
in the ebitmap struct to reduce this overhead. The cached value is
invalidated on most modifying operations, but ebitmap_cardinality() is
usually called once the ebitmap doesn't change any more.

After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
decreased from ~14.6s to ~12.4s (2.2s saved).

Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
---

v2: corrected time values in commit message

 libsepol/include/sepol/policydb/ebitmap.h |  1 +
 libsepol/src/ebitmap.c                    | 10 ++++++++++
 2 files changed, 11 insertions(+)

Comments

Stephen Smalley Feb. 14, 2020, 5:38 p.m. UTC | #1
On 2/13/20 8:39 AM, Ondrej Mosnacek wrote:
> According to profiling of semodule -BN, ebitmap_cardinality() is called
> quite often and contributes a lot to the total runtime. Cache its result
> in the ebitmap struct to reduce this overhead. The cached value is
> invalidated on most modifying operations, but ebitmap_cardinality() is
> usually called once the ebitmap doesn't change any more.
> 
> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> decreased from ~14.6s to ~12.4s (2.2s saved).
> 
> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>

This seems fine but I was wondering how many of the callers of 
ebitmap_cardinality() actually need anything more than ebitmap_length()?

> ---
> 
> v2: corrected time values in commit message
> 
>   libsepol/include/sepol/policydb/ebitmap.h |  1 +
>   libsepol/src/ebitmap.c                    | 10 ++++++++++
>   2 files changed, 11 insertions(+)
> 
> diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h
> index e62df01c..53fafdaa 100644
> --- a/libsepol/include/sepol/policydb/ebitmap.h
> +++ b/libsepol/include/sepol/policydb/ebitmap.h
> @@ -37,6 +37,7 @@ typedef struct ebitmap_node {
>   typedef struct ebitmap {
>   	ebitmap_node_t *node;	/* first node in the bitmap */
>   	uint32_t highbit;	/* highest position in the total bitmap */
> +	unsigned int cardinality;	/* cached value of cardinality */
>   } ebitmap_t;
>   
>   #define ebitmap_length(e) ((e)->highbit)
> diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c
> index 6c9951b7..d23444ce 100644
> --- a/libsepol/src/ebitmap.c
> +++ b/libsepol/src/ebitmap.c
> @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1)
>   	ebitmap_destroy(dst);
>   	dst->node = tmp.node;
>   	dst->highbit = tmp.highbit;
> +	dst->cardinality = 0;
>   
>   	return 0;
>   }
> @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma
>   unsigned int ebitmap_cardinality(ebitmap_t *e1)
>   {
>   	unsigned int i, count = 0;
> +
> +	if (e1->cardinality || e1->highbit == 0)
> +		return e1->cardinality;
> +
>   	for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
>   		if (ebitmap_get_bit(e1, i))
>   			count++;
> +	e1->cardinality = count;
>   	return count;
>   }
>   
> @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src)
>   	}
>   
>   	dst->highbit = src->highbit;
> +	dst->cardinality = src->cardinality;
>   	return 0;
>   }
>   
> @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
>   					free(n);
>   				}
>   			}
> +			e->cardinality = 0; /* invalidate cached cardinality */
>   			return 0;
>   		}
>   		prev = n;
> @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
>   		e->node = new;
>   	}
>   
> +	e->cardinality = 0; /* invalidate cached cardinality */
>   	return 0;
>   }
>   
> @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e)
>   
>   	e->highbit = 0;
>   	e->node = 0;
> +	e->cardinality = 0;
>   	return;
>   }
>   
>
Stephen Smalley Feb. 14, 2020, 6:20 p.m. UTC | #2
On 2/14/20 12:38 PM, Stephen Smalley wrote:
> On 2/13/20 8:39 AM, Ondrej Mosnacek wrote:
>> According to profiling of semodule -BN, ebitmap_cardinality() is called
>> quite often and contributes a lot to the total runtime. Cache its result
>> in the ebitmap struct to reduce this overhead. The cached value is
>> invalidated on most modifying operations, but ebitmap_cardinality() is
>> usually called once the ebitmap doesn't change any more.
>>
>> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
>> decreased from ~14.6s to ~12.4s (2.2s saved).
>>
>> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
> 
> This seems fine but I was wondering how many of the callers of 
> ebitmap_cardinality() actually need anything more than ebitmap_length()?

Regardless,

Acked-by: Stephen Smalley <sds@tycho.nsa.gov>

> 
>> ---
>>
>> v2: corrected time values in commit message
>>
>>   libsepol/include/sepol/policydb/ebitmap.h |  1 +
>>   libsepol/src/ebitmap.c                    | 10 ++++++++++
>>   2 files changed, 11 insertions(+)
>>
>> diff --git a/libsepol/include/sepol/policydb/ebitmap.h 
>> b/libsepol/include/sepol/policydb/ebitmap.h
>> index e62df01c..53fafdaa 100644
>> --- a/libsepol/include/sepol/policydb/ebitmap.h
>> +++ b/libsepol/include/sepol/policydb/ebitmap.h
>> @@ -37,6 +37,7 @@ typedef struct ebitmap_node {
>>   typedef struct ebitmap {
>>       ebitmap_node_t *node;    /* first node in the bitmap */
>>       uint32_t highbit;    /* highest position in the total bitmap */
>> +    unsigned int cardinality;    /* cached value of cardinality */
>>   } ebitmap_t;
>>   #define ebitmap_length(e) ((e)->highbit)
>> diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c
>> index 6c9951b7..d23444ce 100644
>> --- a/libsepol/src/ebitmap.c
>> +++ b/libsepol/src/ebitmap.c
>> @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * 
>> e1)
>>       ebitmap_destroy(dst);
>>       dst->node = tmp.node;
>>       dst->highbit = tmp.highbit;
>> +    dst->cardinality = 0;
>>       return 0;
>>   }
>> @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, 
>> ebitmap_t *e2, unsigned int ma
>>   unsigned int ebitmap_cardinality(ebitmap_t *e1)
>>   {
>>       unsigned int i, count = 0;
>> +
>> +    if (e1->cardinality || e1->highbit == 0)
>> +        return e1->cardinality;
>> +
>>       for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
>>           if (ebitmap_get_bit(e1, i))
>>               count++;
>> +    e1->cardinality = count;
>>       return count;
>>   }
>> @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * 
>> src)
>>       }
>>       dst->highbit = src->highbit;
>> +    dst->cardinality = src->cardinality;
>>       return 0;
>>   }
>> @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int 
>> bit, int value)
>>                       free(n);
>>                   }
>>               }
>> +            e->cardinality = 0; /* invalidate cached cardinality */
>>               return 0;
>>           }
>>           prev = n;
>> @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int 
>> bit, int value)
>>           e->node = new;
>>       }
>> +    e->cardinality = 0; /* invalidate cached cardinality */
>>       return 0;
>>   }
>> @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e)
>>       e->highbit = 0;
>>       e->node = 0;
>> +    e->cardinality = 0;
>>       return;
>>   }
>>
>
Ondrej Mosnacek Feb. 14, 2020, 7:51 p.m. UTC | #3
On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> On 2/13/20 8:39 AM, Ondrej Mosnacek wrote:
> > According to profiling of semodule -BN, ebitmap_cardinality() is called
> > quite often and contributes a lot to the total runtime. Cache its result
> > in the ebitmap struct to reduce this overhead. The cached value is
> > invalidated on most modifying operations, but ebitmap_cardinality() is
> > usually called once the ebitmap doesn't change any more.
> >
> > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > decreased from ~14.6s to ~12.4s (2.2s saved).
> >
> > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
>
> This seems fine but I was wondering how many of the callers of
> ebitmap_cardinality() actually need anything more than ebitmap_length()?

The caller that calls it the most (>99%) during a 'semodule -B' is
__cil_should_expand_attribute(), which logically needs the actual
cardinality. It might be possible to cache the decision directly in
'struct cil_typeattribute', but I don't know the CIL code well enough
to attempt that...
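
Purely for illustration, the kind of caching I mean might look roughly
like the sketch below (the struct fields and helper name are hypothetical,
not the actual CIL data structures):

struct cil_typeattribute_example {
	ebitmap_t *types;	/* bitmap of types assigned to the attribute */
	int expand_decided;	/* hypothetical: has the decision been computed? */
	int should_expand;	/* hypothetical: cached result of the check */
};

static int should_expand_cached(struct cil_typeattribute_example *attr,
				unsigned int threshold)
{
	if (!attr->expand_decided) {
		/* compute once, reuse on every later query */
		attr->should_expand =
			ebitmap_cardinality(attr->types) < threshold;
		attr->expand_decided = 1;
	}
	return attr->should_expand;
}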
Stephen Smalley Feb. 14, 2020, 7:57 p.m. UTC | #4
On 2/14/20 2:51 PM, Ondrej Mosnacek wrote:
> On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
>> On 2/13/20 8:39 AM, Ondrej Mosnacek wrote:
>>> According to profiling of semodule -BN, ebitmap_cardinality() is called
>>> quite often and contributes a lot to the total runtime. Cache its result
>>> in the ebitmap struct to reduce this overhead. The cached value is
>>> invalidated on most modifying operations, but ebitmap_cardinality() is
>>> usually called once the ebitmap doesn't change any more.
>>>
>>> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
>>> decreased from ~14.6s to ~12.4s (2.2s saved).
>>>
>>> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
>>
>> This seems fine but I was wondering how many of the callers of
>> ebitmap_cardinality() actually need anything more than ebitmap_length()?
> 
> The caller that calls it the most (>99%) during a 'semodule -B' is
> __cil_should_expand_attribute(), which logically needs the actual
> cardinality. It might be possible to cache the decision directly in
> 'struct cil_typeattribute', but I don't know the CIL code well enough
> to attempt that...

That's fine - I'm ok with the patch itself.  I just happened to notice 
that there appear to be quite a few callers elsewhere in libsepol that 
only seem to care whether the result is zero or non-zero, which 
seemingly would be just as happy using ebitmap_length().
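
For illustration, the kind of substitution I have in mind (just a sketch,
not pointing at any specific caller):

	/* before: walks every bit just to learn whether anything is set */
	if (ebitmap_cardinality(e) == 0)
		return 0;

	/* after: an empty bitmap keeps highbit == 0, so the length check is
	 * an equivalent, O(1) emptiness test */
	if (ebitmap_length(e) == 0)
		return 0;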
Ondrej Mosnacek Feb. 14, 2020, 8:19 p.m. UTC | #5
On Fri, Feb 14, 2020 at 8:51 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > On 2/13/20 8:39 AM, Ondrej Mosnacek wrote:
> > > According to profiling of semodule -BN, ebitmap_cardinality() is called
> > > quite often and contributes a lot to the total runtime. Cache its result
> > > in the ebitmap struct to reduce this overhead. The cached value is
> > > invalidated on most modifying operations, but ebitmap_cardinality() is
> > > usually called once the ebitmap doesn't change any more.
> > >
> > > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > > decreased from ~14.6s to ~12.4s (2.2s saved).
> > >
> > > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
> >
> > This seems fine but I was wondering how many of the callers of
> > ebitmap_cardinality() actually need anything more than ebitmap_length()?
>
> The caller that calls it the most (>99%) during a 'semodule -B' is
> __cil_should_expand_attribute(), which logically needs the actual
> cardinality. It might be possible to cache the decision directly in
> 'struct cil_typeattribute', but I don't know the CIL code well enough
> to attempt that...

BTW, in case anyone is wondering how I'm getting these numbers/facts -
I use Callgrind [1] to profile a program's run and then analyze it
with KCachegrind [2]. It is a surprisingly nice and easy-to-use GUI
for analyzing where the program spends most of its time.

Collecting the profile data is as simple as:

LD_BIND_NOW=1 valgrind --tool=callgrind <your_command> <args>...

(The LD_BIND_NOW=1 is to prevent the dynamic linker's lazy binding
from messing with the results.)

Then you can just open the generated "callgrind.out.<pid>" file with
KCachegrind and click around... Note that to see the function names,
you need to compile the program with -g (but you should keep -O2 et
al. to get the same optimized code as an actual build). Alternatively,
Callgrind will also auto-detect and use debug symbols provided by
distributions in their -debug/-debuginfo packages.

Maybe this is common knowledge for most, but perhaps someone here will
be one of today's lucky 10000 :) [3]

[1] https://valgrind.org/docs/manual/cl-manual.html
[2] https://kcachegrind.github.io/html/Home.html
[3] https://xkcd.com/1053/
Ondrej Mosnacek Feb. 18, 2020, 3:22 p.m. UTC | #6
On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> According to profiling of semodule -BN, ebitmap_cardinality() is called
> quite often and contributes a lot to the total runtime. Cache its result
> in the ebitmap struct to reduce this overhead. The cached value is
> invalidated on most modifying operations, but ebitmap_cardinality() is
> usually called once the ebitmap doesn't change any more.
>
> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> decreased from ~14.6s to ~12.4s (2.2s saved).

I have no idea why, but I'm now getting completely different times
(10.9s vs. 8.9s) with the same builds on the same setup... I can no
longer reproduce the slower times anywhere (F31/locally/...) so I have
to assume it was some kind of glitch. Since the numbers show a similar
magnitude of speed-up (and they depend on a bunch of HW/SW factors
anyway), I'm not going to do another respin. The applying person (most
likely Stephen) is free to fix the numbers when applying if they wish
to do so.

>
> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
> ---
>
> v2: corrected time values in commit message
>
>  libsepol/include/sepol/policydb/ebitmap.h |  1 +
>  libsepol/src/ebitmap.c                    | 10 ++++++++++
>  2 files changed, 11 insertions(+)
>
> diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h
> index e62df01c..53fafdaa 100644
> --- a/libsepol/include/sepol/policydb/ebitmap.h
> +++ b/libsepol/include/sepol/policydb/ebitmap.h
> @@ -37,6 +37,7 @@ typedef struct ebitmap_node {
>  typedef struct ebitmap {
>         ebitmap_node_t *node;   /* first node in the bitmap */
>         uint32_t highbit;       /* highest position in the total bitmap */
> +       unsigned int cardinality;       /* cached value of cardinality */
>  } ebitmap_t;
>
>  #define ebitmap_length(e) ((e)->highbit)
> diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c
> index 6c9951b7..d23444ce 100644
> --- a/libsepol/src/ebitmap.c
> +++ b/libsepol/src/ebitmap.c
> @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1)
>         ebitmap_destroy(dst);
>         dst->node = tmp.node;
>         dst->highbit = tmp.highbit;
> +       dst->cardinality = 0;
>
>         return 0;
>  }
> @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma
>  unsigned int ebitmap_cardinality(ebitmap_t *e1)
>  {
>         unsigned int i, count = 0;
> +
> +       if (e1->cardinality || e1->highbit == 0)
> +               return e1->cardinality;
> +
>         for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
>                 if (ebitmap_get_bit(e1, i))
>                         count++;
> +       e1->cardinality = count;
>         return count;
>  }
>
> @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src)
>         }
>
>         dst->highbit = src->highbit;
> +       dst->cardinality = src->cardinality;
>         return 0;
>  }
>
> @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
>                                         free(n);
>                                 }
>                         }
> +                       e->cardinality = 0; /* invalidate cached cardinality */
>                         return 0;
>                 }
>                 prev = n;
> @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
>                 e->node = new;
>         }
>
> +       e->cardinality = 0; /* invalidate cached cardinality */
>         return 0;
>  }
>
> @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e)
>
>         e->highbit = 0;
>         e->node = 0;
> +       e->cardinality = 0;
>         return;
>  }
>
> --
> 2.24.1
>


--
Ondrej Mosnacek <omosnace at redhat dot com>
Software Engineer, Security Technologies
Red Hat, Inc.
Stephen Smalley Feb. 18, 2020, 3:41 p.m. UTC | #7
On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
>> According to profiling of semodule -BN, ebitmap_cardinality() is called
>> quite often and contributes a lot to the total runtime. Cache its result
>> in the ebitmap struct to reduce this overhead. The cached value is
>> invalidated on most modifying operations, but ebitmap_cardinality() is
>> usually called once the ebitmap doesn't change any more.
>>
>> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
>> decreased from ~14.6s to ~12.4s (2.2s saved).
> 
> I have no idea why, but I'm now getting completely different times
> (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> longer reproduce the slower times anywhere (F31/locally/...) so I have
> to assume it was some kind of glitch. Since the numbers show a similar
> magnitude of speed-up (and they depend on a bunch of HW/SW factors
> anyway), I'm not going to do another respin. The applying person (most
> likely Stephen) is free to fix the numbers when applying if they wish
> to do so.

Thanks, applied with fixed times (although I don't really think it 
matters very much).  Maybe you're also picking up the difference from 
the "libsepol/cil: remove unnecessary hash tables" change.
Ondrej Mosnacek Feb. 18, 2020, 4:01 p.m. UTC | #8
On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> >> According to profiling of semodule -BN, ebitmap_cardinality() is called
> >> quite often and contributes a lot to the total runtime. Cache its result
> >> in the ebitmap struct to reduce this overhead. The cached value is
> >> invalidated on most modifying operations, but ebitmap_cardinality() is
> >> usually called once the ebitmap doesn't change any more.
> >>
> >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> >> decreased from ~14.6s to ~12.4s (2.2s saved).
> >
> > I have no idea why, but I'm now getting completely different times
> > (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> > longer reproduce the slower times anywhere (F31/locally/...) so I have
> > to assume it was some kind of glitch. Since the numbers show a similar
> > magnitude of speed-up (and they depend on a bunch of HW/SW factors
> > anyway), I'm not going to do another respin. The applying person (most
> > likely Stephen) is free to fix the numbers when applying if they wish
> > to do so.
>
> Thanks, applied with fixed times (although I don't really think it
> matters very much).  Maybe you're also picking up the difference from
> the "libsepol/cil: remove unnecessary hash tables" change.

No, that was actually the reason for the first correction.

--
Ondrej Mosnacek <omosnace at redhat dot com>
Software Engineer, Security Technologies
Red Hat, Inc.
Nicolas Iooss Feb. 25, 2020, 9:24 p.m. UTC | #9
On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
>
> On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > >> According to profiling of semodule -BN, ebitmap_cardinality() is called
> > >> quite often and contributes a lot to the total runtime. Cache its result
> > >> in the ebitmap struct to reduce this overhead. The cached value is
> > >> invalidated on most modifying operations, but ebitmap_cardinality() is
> > >> usually called once the ebitmap doesn't change any more.
> > >>
> > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > >> decreased from ~14.6s to ~12.4s (2.2s saved).
> > >
> > > I have no idea why, but I'm now getting completely different times
> > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> > > longer reproduce the slower times anywhere (F31/locally/...) so I have
> > > to assume it was some kind of glitch. Since the numbers show a similar
> > > magnitude of speed-up (and they depend on a bunch of HW/SW factors
> > > anyway), I'm not going to do another respin. The applying person (most
> > > likely Stephen) is free to fix the numbers when applying if they wish
> > > to do so.
> >
> > Thanks, applied with fixed times (although I don't really think it
> > matters very much).  Maybe you're also picking up the difference from
> > the "libsepol/cil: remove unnecessary hash tables" change.
>
> No, that was actually the reason for the first correction.

Hello,
About performance issues, the current implementation of
ebitmap_cardinality() is quadratic:

for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
    if (ebitmap_get_bit(e1, i))
        count++;

... because ebitmap_get_bit() browses the bitmap:

while (n && (n->startbit <= bit)) {
   if ((n->startbit + MAPSIZE) > bit) {
      /*... */

A few years ago, I tried modifying this function to make it linear in
the bitmap size:

unsigned int ebitmap_cardinality(ebitmap_t *e1)
{
    unsigned int count = 0;
    ebitmap_node_t *n;

   for (n = e1->node; n; n = n->next) {
        count += __builtin_popcountll(n->map);
    }
    return count;
}

... but never actually sent a patch for this, because I wanted to
assess how __builtin_popcountll() was supported by several compilers
beforehand. Would this be helpful to gain even more performance?

Cheers,
Nicolas
William Roberts Feb. 25, 2020, 9:56 p.m. UTC | #10
On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote:
>
> On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> >
> > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called
> > > >> quite often and contributes a lot to the total runtime. Cache its result
> > > >> in the ebitmap struct to reduce this overhead. The cached value is
> > > >> invalidated on most modifying operations, but ebitmap_cardinality() is
> > > >> usually called once the ebitmap doesn't change any more.
> > > >>
> > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > > >> decreased from ~14.6s to ~12.4s (2.2s saved).
> > > >
> > > > I have no idea why, but I'm now getting completely different times
> > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> > > > longer reproduce the slower times anywhere (F31/locally/...) so I have
> > > > to assume it was some kind of glitch. Since the numbers show a similar
> > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors
> > > > anyway), I'm not going to do another respin. The applying person (most
> > > > likely Stephen) is free to fix the numbers when applying if they wish
> > > > to do so.
> > >
> > > Thanks, applied with fixed times (although I don't really think it
> > > matters very much).  Maybe you're also picking up the difference from
> > > the "libsepol/cil: remove unnecessary hash tables" change.
> >
> > No, that was actually the reason for the first correction.
>
> Hello,
> About performance issues, the current implementation of
> ebitmap_cardinality() is quadratic:
>
> for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
>     if (ebitmap_get_bit(e1, i))
>         count++;
>
> ... because ebitmap_get_bit() browses the bitmap:
>
> while (n && (n->startbit <= bit)) {
>    if ((n->startbit + MAPSIZE) > bit) {
>       /*... */
>
> A few years ago, I tried modifying this function to make it linear in
> the bitmap size:
>
> unsigned int ebitmap_cardinality(ebitmap_t *e1)
> {
>     unsigned int count = 0;
>     ebitmap_node_t *n;
>
>    for (n = e1->node; n; n = n->next) {
>         count += __builtin_popcountll(n->map);
>     }
>     return count;
> }
>
> ... but never actually sent a patch for this, because I wanted to
> assess how __builtin_popcountll() was supported by several compilers
> beforehand. Would this be helpful to gain even more performance?

Every architecture I've used has an instruction it boils down to:
x86 - POPCNT
ARM (neon): vcnt

For others, (do they even matter at this point) I would imagine GCC
does something relatively sane.

>
> Cheers,
> Nicolas
>
Ondrej Mosnacek Feb. 26, 2020, 1:23 p.m. UTC | #11
On Tue, Feb 25, 2020 at 10:57 PM William Roberts
<bill.c.roberts@gmail.com> wrote:
> On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote:
> >
> > On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > >
> > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> > > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called
> > > > >> quite often and contributes a lot to the total runtime. Cache its result
> > > > >> in the ebitmap struct to reduce this overhead. The cached value is
> > > > >> invalidated on most modifying operations, but ebitmap_cardinality() is
> > > > >> usually called once the ebitmap doesn't change any more.
> > > > >>
> > > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > > > >> decreased from ~14.6s to ~12.4s (2.2s saved).
> > > > >
> > > > > I have no idea why, but I'm now getting completely different times
> > > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> > > > > longer reproduce the slower times anywhere (F31/locally/...) so I have
> > > > > to assume it was some kind of glitch. Since the numbers show a similar
> > > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors
> > > > > anyway), I'm not going to do another respin. The applying person (most
> > > > > likely Stephen) is free to fix the numbers when applying if they wish
> > > > > to do so.
> > > >
> > > > Thanks, applied with fixed times (although I don't really think it
> > > > matters very much).  Maybe you're also picking up the difference from
> > > > the "libsepol/cil: remove unnecessary hash tables" change.
> > >
> > > No, that was actually the reason for the first correction.
> >
> > Hello,
> > About performance issues, the current implementation of
> > ebitmap_cardinality() is quadratic:
> >
> > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
> >     if (ebitmap_get_bit(e1, i))
> >         count++;
> >
> > ... because ebitmap_get_bit() browses the bitmap:
> >
> > while (n && (n->startbit <= bit)) {
> >    if ((n->startbit + MAPSIZE) > bit) {
> >       /*... */

Hm... I didn't realize that the function is actually quadratic.

> >
> > A few years ago, I tried modifying this function to make it linear in
> > the bitmap size:
> >
> > unsigned int ebitmap_cardinality(ebitmap_t *e1)
> > {
> >     unsigned int count = 0;
> >     ebitmap_node_t *n;
> >
> >    for (n = e1->node; n; n = n->next) {
> >         count += __builtin_popcountll(n->map);
> >     }
> >     return count;
> > }
> >
> > ... but never actually sent a patch for this, because I wanted to
> > assess how __builtin_popcountll() was supported by several compilers
> > beforehand. Would this be helpful to gain even more performance?
>
> Every architecture I've used has an instruction it boils down to:
> x86 - POPCNT
> ARM (neon): vcnt

Note that the compiler will only emit these instructions if you
compile with the right target platform (-mpopcnt or something that
includes it on x86_64). Portable generic builds will usually not use
it. Still, even without the special instruction __builtin_popcountll()
should generate more optimal code than the naive
add-each-bit-one-by-one approach. For example, I came up with this
pure C implementation of 64-bit popcount [1] that both GCC and Clang
can compile down to ~36 instructions. The generic version of
__builtin_popcountll() likely does something similar. (Actually, here
is what Clang seems to use [2], which is pretty close.)
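
For the curious, the general shape of such a pure C popcount is the
well-known SWAR folding trick -- roughly like the sketch below (not
necessarily identical to what I linked in [1]):

#include <stdint.h>

static inline unsigned int popcount64_generic(uint64_t x)
{
	/* fold counts of set bits in pairs, then nibbles, then bytes */
	x -= (x >> 1) & 0x5555555555555555ULL;
	x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
	/* sum the per-byte counts into the top byte and shift it down */
	return (unsigned int)((x * 0x0101010101010101ULL) >> 56);
}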

FWIW, I tested the __builtin_popcountll() version with the caching
patch reverted (built without popcnt support) and it actually
performed even better than the old code + caching (it went down to
~0.11% of semodule -B running time). A naive popcount implementation
without caching didn't perform as well (it was slower than the old code +
caching).

So... we could just open-code some good generic C implementation
(cleanly written and properly commented, of course) and then we
wouldn't have to rely on the compiler builtin. OTOH, the SELinux
userspace already uses non-standard compiler extensions
(__attribute__(...)), so maybe sticking to pure C is not worth it...
Either way I think we should revert the caching patch along with
switching to an optimized implementation (it would no longer be worth
the added complexity IMO).

[1] https://gcc.godbolt.org/z/39W7qa
[2] https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/popcountdi2.c
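
To make the last point concrete, one way the fallback could be wired up
(just a sketch, not actual libsepol code; the generic helper is the kind
of open-coded popcount mentioned above):

#if defined(__GNUC__) || defined(__clang__)
#define EBITMAP_POPCOUNT(x) __builtin_popcountll(x)
#else
#define EBITMAP_POPCOUNT(x) popcount64_generic(x)	/* open-coded fallback */
#endif

unsigned int ebitmap_cardinality(ebitmap_t *e1)
{
	unsigned int count = 0;
	ebitmap_node_t *n;

	/* one pass over the node list, counting a whole map word at a time */
	for (n = e1->node; n; n = n->next)
		count += EBITMAP_POPCOUNT(n->map);
	return count;
}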

>
> For others, (do they even matter at this point) I would imagine GCC
> does something relatively sane.
>
> >
> > Cheers,
> > Nicolas
> >
>
--
Ondrej Mosnacek <omosnace at redhat dot com>
Software Engineer, Security Technologies
Red Hat, Inc.
William Roberts Feb. 26, 2020, 3:39 p.m. UTC | #12
On Wed, Feb 26, 2020 at 7:23 AM Ondrej Mosnacek <omosnace@redhat.com> wrote:
>
> On Tue, Feb 25, 2020 at 10:57 PM William Roberts
> <bill.c.roberts@gmail.com> wrote:
> > On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote:
> > >
> > > On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > > >
> > > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > > > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote:
> > > > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote:
> > > > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called
> > > > > >> quite often and contributes a lot to the total runtime. Cache its result
> > > > > >> in the ebitmap struct to reduce this overhead. The cached value is
> > > > > >> invalidated on most modifying operations, but ebitmap_cardinality() is
> > > > > >> usually called once the ebitmap doesn't change any more.
> > > > > >>
> > > > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has
> > > > > >> decreased from ~14.6s to ~12.4s (2.2s saved).
> > > > > >
> > > > > > I have no idea why, but I'm now getting completely different times
> > > > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no
> > > > > > longer reproduce the slower times anywhere (F31/locally/...) so I have
> > > > > > to assume it was some kind of glitch. Since the numbers show a similar
> > > > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors
> > > > > > anyway), I'm not going to do another respin. The applying person (most
> > > > > > likely Stephen) is free to fix the numbers when applying if they wish
> > > > > > to do so.
> > > > >
> > > > > Thanks, applied with fixed times (although I don't really think it
> > > > > matters very much).  Maybe you're also picking up the difference from
> > > > > the "libsepol/cil: remove unnecessary hash tables" change.
> > > >
> > > > No, that was actually the reason for the first correction.
> > >
> > > Hello,
> > > About performance issues, the current implementation of
> > > ebitmap_cardinality() is quadratic:
> > >
> > > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
> > >     if (ebitmap_get_bit(e1, i))
> > >         count++;
> > >
> > > ... because ebitmap_get_bit() browses the bitmap:
> > >
> > > while (n && (n->startbit <= bit)) {
> > >    if ((n->startbit + MAPSIZE) > bit) {
> > >       /*... */
>
> Hm... I didn't realize that the function is actually quadratic.
>
> > >
> > > A few years ago, I tried modifying this function to make it linear in
> > > the bitmap size:
> > >
> > > unsigned int ebitmap_cardinality(ebitmap_t *e1)
> > > {
> > >     unsigned int count = 0;
> > >     ebitmap_node_t *n;
> > >
> > >    for (n = e1->node; n; n = n->next) {
> > >         count += __builtin_popcountll(n->map);
> > >     }
> > >     return count;
> > > }
> > >
> > > ... but never actually sent a patch for this, because I wanted to
> > > assess how __builtin_popcountll() was supported by several compilers
> > > beforehand. Would this be helpful to gain even more performance?
> >
> > Every architecture I've used has an instruction it boils down to:
> > x86 - POPCNT
> > ARM (neon): vcnt
>
> Note that the compiler will only emit these instructions if you
> compile with the right target platform (-mpopcnt or something that
> includes it on x86_64). Portable generic builds will usually not use
> it. Still, even without the special instruction __builtin_popcountll()
> should generate more optimal code than the naive
> add-each-bit-one-by-one approach. For example, I came up with this
> pure C implementation of 64-bit popcount [1] that both GCC and Clang
> can compile down to ~36 instructions. The generic version of
> __builtin_popcountll() likely does something similar. (Actually, here
> is what Clang seems to use [2], which is pretty close.)
>
> FWIW, I tested the __builtin_popcountll() version with the caching
> patch reverted (built without popcnt support) and it actually
> performed even better than the old code + caching (it went down to
> ~0.11% of semodule -B running time). A naive popcount implementation
> without caching didn't perform as well (it was slower than the old code +
> caching).
>
> So... we could just open-code some good generic C implementation
> (cleanly written and properly commented, of course) and then we
> wouldn't have to rely on the compiler builtin. OTOH, the SELinux
> userspace already uses non-standard compiler extensions
> (__attribute__(...)), so maybe sticking to pure C is not worth it...
> Either way I think we should revert the caching patch along with
> switching to an optimized implementation (it would no longer be worth
> the added complexity IMO).

+2 using the compiler intrinsic. In all reality, for libselinux
targets, there are two compilers, gcc and clang, and they have full
support for this.

>
> [1] https://gcc.godbolt.org/z/39W7qa
> [2] https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/popcountdi2.c
>
> >
> > For others, (do they even matter at this point) I would imagine GCC
> > does something relatively sane.
> >
> > >
> > > Cheers,
> > > Nicolas
> > >
> >
> --
> Ondrej Mosnacek <omosnace at redhat dot com>
> Software Engineer, Security Technologies
> Red Hat, Inc.
>

Patch

diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h
index e62df01c..53fafdaa 100644
--- a/libsepol/include/sepol/policydb/ebitmap.h
+++ b/libsepol/include/sepol/policydb/ebitmap.h
@@ -37,6 +37,7 @@  typedef struct ebitmap_node {
 typedef struct ebitmap {
 	ebitmap_node_t *node;	/* first node in the bitmap */
 	uint32_t highbit;	/* highest position in the total bitmap */
+	unsigned int cardinality;	/* cached value of cardinality */
 } ebitmap_t;
 
 #define ebitmap_length(e) ((e)->highbit)
diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c
index 6c9951b7..d23444ce 100644
--- a/libsepol/src/ebitmap.c
+++ b/libsepol/src/ebitmap.c
@@ -67,6 +67,7 @@  int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1)
 	ebitmap_destroy(dst);
 	dst->node = tmp.node;
 	dst->highbit = tmp.highbit;
+	dst->cardinality = 0;
 
 	return 0;
 }
@@ -128,9 +129,14 @@  int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma
 unsigned int ebitmap_cardinality(ebitmap_t *e1)
 {
 	unsigned int i, count = 0;
+
+	if (e1->cardinality || e1->highbit == 0)
+		return e1->cardinality;
+
 	for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++)
 		if (ebitmap_get_bit(e1, i))
 			count++;
+	e1->cardinality = count;
 	return count;
 }
 
@@ -194,6 +200,7 @@  int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src)
 	}
 
 	dst->highbit = src->highbit;
+	dst->cardinality = src->cardinality;
 	return 0;
 }
 
@@ -309,6 +316,7 @@  int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
 					free(n);
 				}
 			}
+			e->cardinality = 0; /* invalidate cached cardinality */
 			return 0;
 		}
 		prev = n;
@@ -339,6 +347,7 @@  int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value)
 		e->node = new;
 	}
 
+	e->cardinality = 0; /* invalidate cached cardinality */
 	return 0;
 }
 
@@ -358,6 +367,7 @@  void ebitmap_destroy(ebitmap_t * e)
 
 	e->highbit = 0;
 	e->node = 0;
+	e->cardinality = 0;
 	return;
 }