Message ID | 20200213133959.14217-1-omosnace@redhat.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | [userspace,v2] libsepol: cache ebitmap cardinality value | expand |
On 2/13/20 8:39 AM, Ondrej Mosnacek wrote: > According to profiling of semodule -BN, ebitmap_cardinality() is called > quite often and contributes a lot to the total runtime. Cache its result > in the ebitmap struct to reduce this overhead. The cached value is > invalidated on most modifying operations, but ebitmap_cardinality() is > usually called once the ebitmap doesn't change any more. > > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > decreased from ~14.6s to ~12.4s (2.2s saved). > > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> This seems fine but I was wondering how many of the callers of ebitmap_cardinality() actually need anything more than ebitmap_length()? > --- > > v2: corrected time values in commit message > > libsepol/include/sepol/policydb/ebitmap.h | 1 + > libsepol/src/ebitmap.c | 10 ++++++++++ > 2 files changed, 11 insertions(+) > > diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h > index e62df01c..53fafdaa 100644 > --- a/libsepol/include/sepol/policydb/ebitmap.h > +++ b/libsepol/include/sepol/policydb/ebitmap.h > @@ -37,6 +37,7 @@ typedef struct ebitmap_node { > typedef struct ebitmap { > ebitmap_node_t *node; /* first node in the bitmap */ > uint32_t highbit; /* highest position in the total bitmap */ > + unsigned int cardinality; /* cached value of cardinality */ > } ebitmap_t; > > #define ebitmap_length(e) ((e)->highbit) > diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c > index 6c9951b7..d23444ce 100644 > --- a/libsepol/src/ebitmap.c > +++ b/libsepol/src/ebitmap.c > @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1) > ebitmap_destroy(dst); > dst->node = tmp.node; > dst->highbit = tmp.highbit; > + dst->cardinality = 0; > > return 0; > } > @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma > unsigned int ebitmap_cardinality(ebitmap_t *e1) > { > unsigned int i, count = 0; > + > + if (e1->cardinality || e1->highbit == 0) > + return e1->cardinality; > + > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) > if (ebitmap_get_bit(e1, i)) > count++; > + e1->cardinality = count; > return count; > } > > @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src) > } > > dst->highbit = src->highbit; > + dst->cardinality = src->cardinality; > return 0; > } > > @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) > free(n); > } > } > + e->cardinality = 0; /* invalidate cached cardinality */ > return 0; > } > prev = n; > @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) > e->node = new; > } > > + e->cardinality = 0; /* invalidate cached cardinality */ > return 0; > } > > @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e) > > e->highbit = 0; > e->node = 0; > + e->cardinality = 0; > return; > } > >
On 2/14/20 12:38 PM, Stephen Smalley wrote: > On 2/13/20 8:39 AM, Ondrej Mosnacek wrote: >> According to profiling of semodule -BN, ebitmap_cardinality() is called >> quite often and contributes a lot to the total runtime. Cache its result >> in the ebitmap struct to reduce this overhead. The cached value is >> invalidated on most modifying operations, but ebitmap_cardinality() is >> usually called once the ebitmap doesn't change any more. >> >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has >> decreased from ~14.6s to ~12.4s (2.2s saved). >> >> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> > > This seems fine but I was wondering how many of the callers of > ebitmap_cardinality() actually need anything more than ebitmap_length()? Regardless, Acked-by: Stephen Smalley <sds@tycho.nsa.gov> > >> --- >> >> v2: corrected time values in commit message >> >> libsepol/include/sepol/policydb/ebitmap.h | 1 + >> libsepol/src/ebitmap.c | 10 ++++++++++ >> 2 files changed, 11 insertions(+) >> >> diff --git a/libsepol/include/sepol/policydb/ebitmap.h >> b/libsepol/include/sepol/policydb/ebitmap.h >> index e62df01c..53fafdaa 100644 >> --- a/libsepol/include/sepol/policydb/ebitmap.h >> +++ b/libsepol/include/sepol/policydb/ebitmap.h >> @@ -37,6 +37,7 @@ typedef struct ebitmap_node { >> typedef struct ebitmap { >> ebitmap_node_t *node; /* first node in the bitmap */ >> uint32_t highbit; /* highest position in the total bitmap */ >> + unsigned int cardinality; /* cached value of cardinality */ >> } ebitmap_t; >> #define ebitmap_length(e) ((e)->highbit) >> diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c >> index 6c9951b7..d23444ce 100644 >> --- a/libsepol/src/ebitmap.c >> +++ b/libsepol/src/ebitmap.c >> @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * >> e1) >> ebitmap_destroy(dst); >> dst->node = tmp.node; >> dst->highbit = tmp.highbit; >> + dst->cardinality = 0; >> return 0; >> } >> @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, >> ebitmap_t *e2, unsigned int ma >> unsigned int ebitmap_cardinality(ebitmap_t *e1) >> { >> unsigned int i, count = 0; >> + >> + if (e1->cardinality || e1->highbit == 0) >> + return e1->cardinality; >> + >> for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) >> if (ebitmap_get_bit(e1, i)) >> count++; >> + e1->cardinality = count; >> return count; >> } >> @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * >> src) >> } >> dst->highbit = src->highbit; >> + dst->cardinality = src->cardinality; >> return 0; >> } >> @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int >> bit, int value) >> free(n); >> } >> } >> + e->cardinality = 0; /* invalidate cached cardinality */ >> return 0; >> } >> prev = n; >> @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int >> bit, int value) >> e->node = new; >> } >> + e->cardinality = 0; /* invalidate cached cardinality */ >> return 0; >> } >> @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e) >> e->highbit = 0; >> e->node = 0; >> + e->cardinality = 0; >> return; >> } >> >
On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > On 2/13/20 8:39 AM, Ondrej Mosnacek wrote: > > According to profiling of semodule -BN, ebitmap_cardinality() is called > > quite often and contributes a lot to the total runtime. Cache its result > > in the ebitmap struct to reduce this overhead. The cached value is > > invalidated on most modifying operations, but ebitmap_cardinality() is > > usually called once the ebitmap doesn't change any more. > > > > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > decreased from ~14.6s to ~12.4s (2.2s saved). > > > > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> > > This seems fine but I was wondering how many of the callers of > ebitmap_cardinality() actually need anything more than ebitmap_length()? The caller that calls it the most (>99%) during a 'semodule -B' is __cil_should_expand_attribute(), which logically needs the actual cardinality. It might be possible to cache the decision directly in 'struct cil_typeattribute', but I don't know the CIL code well enough to attempt that...
On 2/14/20 2:51 PM, Ondrej Mosnacek wrote: > On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: >> On 2/13/20 8:39 AM, Ondrej Mosnacek wrote: >>> According to profiling of semodule -BN, ebitmap_cardinality() is called >>> quite often and contributes a lot to the total runtime. Cache its result >>> in the ebitmap struct to reduce this overhead. The cached value is >>> invalidated on most modifying operations, but ebitmap_cardinality() is >>> usually called once the ebitmap doesn't change any more. >>> >>> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has >>> decreased from ~14.6s to ~12.4s (2.2s saved). >>> >>> Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> >> >> This seems fine but I was wondering how many of the callers of >> ebitmap_cardinality() actually need anything more than ebitmap_length()? > > The caller that calls it the most (>99%) during a 'semodule -B' is > __cil_should_expand_attribute(), which logically needs the actual > cardinality. It might be possible to cache the decision directly in > 'struct cil_typeattribute', but I don't know the CIL code well enough > to attempt that... That's fine - I'm ok with the patch itself. I just happened to notice that there appear to be quite a few callers elsewhere in libsepol that only seem to care whether the result is zero or non-zero, which seemingly would be just as happy using ebitmap_length().
On Fri, Feb 14, 2020 at 8:51 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > On Fri, Feb 14, 2020 at 6:37 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > > On 2/13/20 8:39 AM, Ondrej Mosnacek wrote: > > > According to profiling of semodule -BN, ebitmap_cardinality() is called > > > quite often and contributes a lot to the total runtime. Cache its result > > > in the ebitmap struct to reduce this overhead. The cached value is > > > invalidated on most modifying operations, but ebitmap_cardinality() is > > > usually called once the ebitmap doesn't change any more. > > > > > > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > > decreased from ~14.6s to ~12.4s (2.2s saved). > > > > > > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> > > > > This seems fine but I was wondering how many of the callers of > > ebitmap_cardinality() actually need anything more than ebitmap_length()? > > The caller that calls it the most (>99%) during a 'semodule -B' is > __cil_should_expand_attribute(), which logically needs the actual > cardinality. It might be possible to cache the decision directly in > 'struct cil_typeattribute', but I don't know the CIL code well enough > to attempt that... BTW, in case anyone is wondering how I'm getting these numbers/facts - I use Callgrind [1] to profile a program's run and then analyze it with KCachegrind [2]. It is a surprisingly nice and easy to use GUI for analyzing where the program spends most of its time. Collecting the profile data is as simple as: LD_BIND_NOW=1 valgrind --tool=callgrind <your_command> <args>... (The LD_BIND_NOW=1 is to prevent the dynamic linker's lazy binding from messing with the results.) Then you can just open the generated "callgrind.out.<pid>" file with KCachegrind and click around... Note that to see the function names, you need to compile the program with -g (but you should keep -O2 et al. to get the same optimized code as an actual build). Alternatively, Callgrind will also auto-detect and use debug symbols provided by distributions in their -debug/-debuginfo packages. Maybe this is common knowledge for most, but perhaps someone here will be one of today's lucky 10000 :) [3] [1] https://valgrind.org/docs/manual/cl-manual.html [2] https://kcachegrind.github.io/html/Home.html [3] https://xkcd.com/1053/
On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > According to profiling of semodule -BN, ebitmap_cardinality() is called > quite often and contributes a lot to the total runtime. Cache its result > in the ebitmap struct to reduce this overhead. The cached value is > invalidated on most modifying operations, but ebitmap_cardinality() is > usually called once the ebitmap doesn't change any more. > > After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > decreased from ~14.6s to ~12.4s (2.2s saved). I have no idea why, but I'm now getting completely different times (10.9s vs. 8.9s) with the same builds on the same setup... I can no longer reproduce the slower times anywhere (F31/locally/...) so I have to assume it was some kind of glitch. Since the numbers show a similar magnitude of speed-up (and they depend on a bunch of HW/SW factors anyway), I'm not going to do another respin. The applying person (most likely Stephen) is free to fix the numbers when applying if they wish to do so. > > Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> > --- > > v2: corrected time values in commit message > > libsepol/include/sepol/policydb/ebitmap.h | 1 + > libsepol/src/ebitmap.c | 10 ++++++++++ > 2 files changed, 11 insertions(+) > > diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h > index e62df01c..53fafdaa 100644 > --- a/libsepol/include/sepol/policydb/ebitmap.h > +++ b/libsepol/include/sepol/policydb/ebitmap.h > @@ -37,6 +37,7 @@ typedef struct ebitmap_node { > typedef struct ebitmap { > ebitmap_node_t *node; /* first node in the bitmap */ > uint32_t highbit; /* highest position in the total bitmap */ > + unsigned int cardinality; /* cached value of cardinality */ > } ebitmap_t; > > #define ebitmap_length(e) ((e)->highbit) > diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c > index 6c9951b7..d23444ce 100644 > --- a/libsepol/src/ebitmap.c > +++ b/libsepol/src/ebitmap.c > @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1) > ebitmap_destroy(dst); > dst->node = tmp.node; > dst->highbit = tmp.highbit; > + dst->cardinality = 0; > > return 0; > } > @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma > unsigned int ebitmap_cardinality(ebitmap_t *e1) > { > unsigned int i, count = 0; > + > + if (e1->cardinality || e1->highbit == 0) > + return e1->cardinality; > + > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) > if (ebitmap_get_bit(e1, i)) > count++; > + e1->cardinality = count; > return count; > } > > @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src) > } > > dst->highbit = src->highbit; > + dst->cardinality = src->cardinality; > return 0; > } > > @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) > free(n); > } > } > + e->cardinality = 0; /* invalidate cached cardinality */ > return 0; > } > prev = n; > @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) > e->node = new; > } > > + e->cardinality = 0; /* invalidate cached cardinality */ > return 0; > } > > @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e) > > e->highbit = 0; > e->node = 0; > + e->cardinality = 0; > return; > } > > -- > 2.24.1 > -- Ondrej Mosnacek <omosnace at redhat dot com> Software Engineer, Security Technologies Red Hat, Inc.
On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: >> According to profiling of semodule -BN, ebitmap_cardinality() is called >> quite often and contributes a lot to the total runtime. Cache its result >> in the ebitmap struct to reduce this overhead. The cached value is >> invalidated on most modifying operations, but ebitmap_cardinality() is >> usually called once the ebitmap doesn't change any more. >> >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has >> decreased from ~14.6s to ~12.4s (2.2s saved). > > I have no idea why, but I'm now getting completely different times > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > longer reproduce the slower times anywhere (F31/locally/...) so I have > to assume it was some kind of glitch. Since the numbers show a similar > magnitude of speed-up (and they depend on a bunch of HW/SW factors > anyway), I'm not going to do another respin. The applying person (most > likely Stephen) is free to fix the numbers when applying if they wish > to do so. Thanks, applied with fixed times (although I don't really think it matters very much). Maybe you're also picking up the difference from the "libsepol/cil: remove unnecessary hash tables" change.
On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > >> According to profiling of semodule -BN, ebitmap_cardinality() is called > >> quite often and contributes a lot to the total runtime. Cache its result > >> in the ebitmap struct to reduce this overhead. The cached value is > >> invalidated on most modifying operations, but ebitmap_cardinality() is > >> usually called once the ebitmap doesn't change any more. > >> > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > >> decreased from ~14.6s to ~12.4s (2.2s saved). > > > > I have no idea why, but I'm now getting completely different times > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > > longer reproduce the slower times anywhere (F31/locally/...) so I have > > to assume it was some kind of glitch. Since the numbers show a similar > > magnitude of speed-up (and they depend on a bunch of HW/SW factors > > anyway), I'm not going to do another respin. The applying person (most > > likely Stephen) is free to fix the numbers when applying if they wish > > to do so. > > Thanks, applied with fixed times (although I don't really think it > matters very much). Maybe you're also picking up the difference from > the "libsepol/cil: remove unnecessary hash tables" change. No, that was actually the reason for the first correction. -- Ondrej Mosnacek <omosnace at redhat dot com> Software Engineer, Security Technologies Red Hat, Inc.
On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called > > >> quite often and contributes a lot to the total runtime. Cache its result > > >> in the ebitmap struct to reduce this overhead. The cached value is > > >> invalidated on most modifying operations, but ebitmap_cardinality() is > > >> usually called once the ebitmap doesn't change any more. > > >> > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > >> decreased from ~14.6s to ~12.4s (2.2s saved). > > > > > > I have no idea why, but I'm now getting completely different times > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > > > longer reproduce the slower times anywhere (F31/locally/...) so I have > > > to assume it was some kind of glitch. Since the numbers show a similar > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors > > > anyway), I'm not going to do another respin. The applying person (most > > > likely Stephen) is free to fix the numbers when applying if they wish > > > to do so. > > > > Thanks, applied with fixed times (although I don't really think it > > matters very much). Maybe you're also picking up the difference from > > the "libsepol/cil: remove unnecessary hash tables" change. > > No, that was actually the reason for the first correction. Hello, About performance issues, the current implementation of ebitmap_cardinality() is quadratic: for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) if (ebitmap_get_bit(e1, i)) count++; ... because ebitmap_get_bit() browse the bitmap: while (n && (n->startbit <= bit)) { if ((n->startbit + MAPSIZE) > bit) { /*... */ A few years ago, I tried modifying this function to make it linear in the bitmap size: unsigned int ebitmap_cardinality(ebitmap_t *e1) { unsigned int count = 0; ebitmap_node_t *n; for (n = e1->node; n; n = n->next) { count += __builtin_popcountll(n->map); } return count; } ... but never actually sent a patch for this, because I wanted to assess how __builtin_popcountll() was supported by several compilers beforehand. Would this be helpful to gain even more performance gain? Cheers, Nicolas
On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote: > > On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called > > > >> quite often and contributes a lot to the total runtime. Cache its result > > > >> in the ebitmap struct to reduce this overhead. The cached value is > > > >> invalidated on most modifying operations, but ebitmap_cardinality() is > > > >> usually called once the ebitmap doesn't change any more. > > > >> > > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > > >> decreased from ~14.6s to ~12.4s (2.2s saved). > > > > > > > > I have no idea why, but I'm now getting completely different times > > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > > > > longer reproduce the slower times anywhere (F31/locally/...) so I have > > > > to assume it was some kind of glitch. Since the numbers show a similar > > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors > > > > anyway), I'm not going to do another respin. The applying person (most > > > > likely Stephen) is free to fix the numbers when applying if they wish > > > > to do so. > > > > > > Thanks, applied with fixed times (although I don't really think it > > > matters very much). Maybe you're also picking up the difference from > > > the "libsepol/cil: remove unnecessary hash tables" change. > > > > No, that was actually the reason for the first correction. > > Hello, > About performance issues, the current implementation of > ebitmap_cardinality() is quadratic: > > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) > if (ebitmap_get_bit(e1, i)) > count++; > > ... because ebitmap_get_bit() browse the bitmap: > > while (n && (n->startbit <= bit)) { > if ((n->startbit + MAPSIZE) > bit) { > /*... */ > > A few years ago, I tried modifying this function to make it linear in > the bitmap size: > > unsigned int ebitmap_cardinality(ebitmap_t *e1) > { > unsigned int count = 0; > ebitmap_node_t *n; > > for (n = e1->node; n; n = n->next) { > count += __builtin_popcountll(n->map); > } > return count; > } > > ... but never actually sent a patch for this, because I wanted to > assess how __builtin_popcountll() was supported by several compilers > beforehand. Would this be helpful to gain even more performance gain? Every architecture I've used has an instruction it boils down to: x86 - POPCNT ARM (neon): vcnt For others, (do they even matter at this point) I would imagine GCC does something relatively sane. > > Cheers, > Nicolas >
On Tue, Feb 25, 2020 at 10:57 PM William Roberts <bill.c.roberts@gmail.com> wrote: > On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote: > > > > On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > > > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > > > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > > > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called > > > > >> quite often and contributes a lot to the total runtime. Cache its result > > > > >> in the ebitmap struct to reduce this overhead. The cached value is > > > > >> invalidated on most modifying operations, but ebitmap_cardinality() is > > > > >> usually called once the ebitmap doesn't change any more. > > > > >> > > > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > > > >> decreased from ~14.6s to ~12.4s (2.2s saved). > > > > > > > > > > I have no idea why, but I'm now getting completely different times > > > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > > > > > longer reproduce the slower times anywhere (F31/locally/...) so I have > > > > > to assume it was some kind of glitch. Since the numbers show a similar > > > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors > > > > > anyway), I'm not going to do another respin. The applying person (most > > > > > likely Stephen) is free to fix the numbers when applying if they wish > > > > > to do so. > > > > > > > > Thanks, applied with fixed times (although I don't really think it > > > > matters very much). Maybe you're also picking up the difference from > > > > the "libsepol/cil: remove unnecessary hash tables" change. > > > > > > No, that was actually the reason for the first correction. > > > > Hello, > > About performance issues, the current implementation of > > ebitmap_cardinality() is quadratic: > > > > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) > > if (ebitmap_get_bit(e1, i)) > > count++; > > > > ... because ebitmap_get_bit() browse the bitmap: > > > > while (n && (n->startbit <= bit)) { > > if ((n->startbit + MAPSIZE) > bit) { > > /*... */ Hm... I didn't realize that the function is actually quadratic. > > > > A few years ago, I tried modifying this function to make it linear in > > the bitmap size: > > > > unsigned int ebitmap_cardinality(ebitmap_t *e1) > > { > > unsigned int count = 0; > > ebitmap_node_t *n; > > > > for (n = e1->node; n; n = n->next) { > > count += __builtin_popcountll(n->map); > > } > > return count; > > } > > > > ... but never actually sent a patch for this, because I wanted to > > assess how __builtin_popcountll() was supported by several compilers > > beforehand. Would this be helpful to gain even more performance gain? > > Every architecture I've used has an instruction it boils down to: > x86 - POPCNT > ARM (neon): vcnt Note that the compiler will only emit these instructions if you compile with the right target platform (-mpopcnt or something that includes it on x86_64). Portable generic builds will usually not use it. Still, even without the special instruction __builtin_popcountll() should generate more optimal code than the naive add-each-bit-one-by-one approach. For example, I came up with this pure C implementation of 64-bit popcount [1] that both GCC and Clang can compile down to ~36 instructions. The generic version of __builtin_popcountll() likely does something similar. (Actually, here is what Clang seems to use [2], which is pretty close.) FWIW, I tested the __builtin_popcountll() version with the caching patch reverted (built without popcnt support) and it actually performed even better than the old code + caching (it went down to ~0.11% of semodule -B running time). A naive popcount implementation without caching didn't perform as good (was slower than the old code + caching). So... we could just open-code some good generic C implementation (cleanly written and properly commented, of course) and then we wouldn't have to rely on the compiler builtin. OTOH, the SELinux userspace already uses non-standard compiler extensions (__attribute__(...)), so maybe sticking to pure C is not worth it... Either way I think we should revert the caching patch along with switching to an optimized implementation (it would no longer be worth the added complexity IMO). [1] https://gcc.godbolt.org/z/39W7qa [2] https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/popcountdi2.c > > For others, (do they even matter at this point) I would imagine GCC > does something relatively sane. > > > > > Cheers, > > Nicolas > > > -- Ondrej Mosnacek <omosnace at redhat dot com> Software Engineer, Security Technologies Red Hat, Inc.
On Wed, Feb 26, 2020 at 7:23 AM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > On Tue, Feb 25, 2020 at 10:57 PM William Roberts > <bill.c.roberts@gmail.com> wrote: > > On Tue, Feb 25, 2020 at 3:33 PM Nicolas Iooss <nicolas.iooss@m4x.org> wrote: > > > > > > On Tue, Feb 18, 2020 at 5:01 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > > > > > > On Tue, Feb 18, 2020 at 4:40 PM Stephen Smalley <sds@tycho.nsa.gov> wrote: > > > > > On 2/18/20 10:22 AM, Ondrej Mosnacek wrote: > > > > > > On Thu, Feb 13, 2020 at 2:40 PM Ondrej Mosnacek <omosnace@redhat.com> wrote: > > > > > >> According to profiling of semodule -BN, ebitmap_cardinality() is called > > > > > >> quite often and contributes a lot to the total runtime. Cache its result > > > > > >> in the ebitmap struct to reduce this overhead. The cached value is > > > > > >> invalidated on most modifying operations, but ebitmap_cardinality() is > > > > > >> usually called once the ebitmap doesn't change any more. > > > > > >> > > > > > >> After this patch, the time to do 'semodule -BN' on Fedora Rawhide has > > > > > >> decreased from ~14.6s to ~12.4s (2.2s saved). > > > > > > > > > > > > I have no idea why, but I'm now getting completely different times > > > > > > (10.9s vs. 8.9s) with the same builds on the same setup... I can no > > > > > > longer reproduce the slower times anywhere (F31/locally/...) so I have > > > > > > to assume it was some kind of glitch. Since the numbers show a similar > > > > > > magnitude of speed-up (and they depend on a bunch of HW/SW factors > > > > > > anyway), I'm not going to do another respin. The applying person (most > > > > > > likely Stephen) is free to fix the numbers when applying if they wish > > > > > > to do so. > > > > > > > > > > Thanks, applied with fixed times (although I don't really think it > > > > > matters very much). Maybe you're also picking up the difference from > > > > > the "libsepol/cil: remove unnecessary hash tables" change. > > > > > > > > No, that was actually the reason for the first correction. > > > > > > Hello, > > > About performance issues, the current implementation of > > > ebitmap_cardinality() is quadratic: > > > > > > for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) > > > if (ebitmap_get_bit(e1, i)) > > > count++; > > > > > > ... because ebitmap_get_bit() browse the bitmap: > > > > > > while (n && (n->startbit <= bit)) { > > > if ((n->startbit + MAPSIZE) > bit) { > > > /*... */ > > Hm... I didn't realize that the function is actually quadratic. > > > > > > > A few years ago, I tried modifying this function to make it linear in > > > the bitmap size: > > > > > > unsigned int ebitmap_cardinality(ebitmap_t *e1) > > > { > > > unsigned int count = 0; > > > ebitmap_node_t *n; > > > > > > for (n = e1->node; n; n = n->next) { > > > count += __builtin_popcountll(n->map); > > > } > > > return count; > > > } > > > > > > ... but never actually sent a patch for this, because I wanted to > > > assess how __builtin_popcountll() was supported by several compilers > > > beforehand. Would this be helpful to gain even more performance gain? > > > > Every architecture I've used has an instruction it boils down to: > > x86 - POPCNT > > ARM (neon): vcnt > > Note that the compiler will only emit these instructions if you > compile with the right target platform (-mpopcnt or something that > includes it on x86_64). Portable generic builds will usually not use > it. Still, even without the special instruction __builtin_popcountll() > should generate more optimal code than the naive > add-each-bit-one-by-one approach. For example, I came up with this > pure C implementation of 64-bit popcount [1] that both GCC and Clang > can compile down to ~36 instructions. The generic version of > __builtin_popcountll() likely does something similar. (Actually, here > is what Clang seems to use [2], which is pretty close.) > > FWIW, I tested the __builtin_popcountll() version with the caching > patch reverted (built without popcnt support) and it actually > performed even better than the old code + caching (it went down to > ~0.11% of semodule -B running time). A naive popcount implementation > without caching didn't perform as good (was slower than the old code + > caching). > > So... we could just open-code some good generic C implementation > (cleanly written and properly commented, of course) and then we > wouldn't have to rely on the compiler builtin. OTOH, the SELinux > userspace already uses non-standard compiler extensions > (__attribute__(...)), so maybe sticking to pure C is not worth it... > Either way I think we should revert the caching patch along with > switching to an optimized implementation (it would no longer be worth > the added complexity IMO). +2 using the compiler intrinsic. In all reality, for libselinux targets, their are two compilers, gcc and clang and they have full support for this. > > [1] https://gcc.godbolt.org/z/39W7qa > [2] https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/popcountdi2.c > > > > > For others, (do they even matter at this point) I would imagine GCC > > does something relatively sane. > > > > > > > > Cheers, > > > Nicolas > > > > > > -- > Ondrej Mosnacek <omosnace at redhat dot com> > Software Engineer, Security Technologies > Red Hat, Inc. >
diff --git a/libsepol/include/sepol/policydb/ebitmap.h b/libsepol/include/sepol/policydb/ebitmap.h index e62df01c..53fafdaa 100644 --- a/libsepol/include/sepol/policydb/ebitmap.h +++ b/libsepol/include/sepol/policydb/ebitmap.h @@ -37,6 +37,7 @@ typedef struct ebitmap_node { typedef struct ebitmap { ebitmap_node_t *node; /* first node in the bitmap */ uint32_t highbit; /* highest position in the total bitmap */ + unsigned int cardinality; /* cached value of cardinality */ } ebitmap_t; #define ebitmap_length(e) ((e)->highbit) diff --git a/libsepol/src/ebitmap.c b/libsepol/src/ebitmap.c index 6c9951b7..d23444ce 100644 --- a/libsepol/src/ebitmap.c +++ b/libsepol/src/ebitmap.c @@ -67,6 +67,7 @@ int ebitmap_union(ebitmap_t * dst, const ebitmap_t * e1) ebitmap_destroy(dst); dst->node = tmp.node; dst->highbit = tmp.highbit; + dst->cardinality = 0; return 0; } @@ -128,9 +129,14 @@ int ebitmap_andnot(ebitmap_t *dst, ebitmap_t *e1, ebitmap_t *e2, unsigned int ma unsigned int ebitmap_cardinality(ebitmap_t *e1) { unsigned int i, count = 0; + + if (e1->cardinality || e1->highbit == 0) + return e1->cardinality; + for (i=ebitmap_startbit(e1); i < ebitmap_length(e1); i++) if (ebitmap_get_bit(e1, i)) count++; + e1->cardinality = count; return count; } @@ -194,6 +200,7 @@ int ebitmap_cpy(ebitmap_t * dst, const ebitmap_t * src) } dst->highbit = src->highbit; + dst->cardinality = src->cardinality; return 0; } @@ -309,6 +316,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) free(n); } } + e->cardinality = 0; /* invalidate cached cardinality */ return 0; } prev = n; @@ -339,6 +347,7 @@ int ebitmap_set_bit(ebitmap_t * e, unsigned int bit, int value) e->node = new; } + e->cardinality = 0; /* invalidate cached cardinality */ return 0; } @@ -358,6 +367,7 @@ void ebitmap_destroy(ebitmap_t * e) e->highbit = 0; e->node = 0; + e->cardinality = 0; return; }
According to profiling of semodule -BN, ebitmap_cardinality() is called quite often and contributes a lot to the total runtime. Cache its result in the ebitmap struct to reduce this overhead. The cached value is invalidated on most modifying operations, but ebitmap_cardinality() is usually called once the ebitmap doesn't change any more. After this patch, the time to do 'semodule -BN' on Fedora Rawhide has decreased from ~14.6s to ~12.4s (2.2s saved). Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com> --- v2: corrected time values in commit message libsepol/include/sepol/policydb/ebitmap.h | 1 + libsepol/src/ebitmap.c | 10 ++++++++++ 2 files changed, 11 insertions(+)