diff mbox series

[bpf-next,v2,2/5] bpf: introduce timeout map

Message ID 20201214201118.148126-3-xiyou.wangcong@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: introduce timeout map | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 15659 this patch: 15663
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 86 exceeds 80 columns
netdev/build_allmodconfig_warn fail Errors and warnings before: 15324 this patch: 15328
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Cong Wang Dec. 14, 2020, 8:11 p.m. UTC
From: Cong Wang <cong.wang@bytedance.com>

This borrows the idea from conntrack and will be used for conntrack in
bpf too. Each element in a timeout map has a user-specified timeout
in secs, after it expires it will be automatically removed from the map.

There are two cases here:

1. When the timeout map is idle, that is, no one updates or accesses it,
   we rely on the idle work to scan the whole hash table and remove
   these expired. The idle work is scheduled every 1 sec.

2. When the timeout map is actively accessed, we could reach expired
   elements before the idle work kicks in, we can simply skip them and
   schedule another work to do the actual removal work. We avoid taking
   locks on fast path.

The timeout of each element can be set or updated via bpf_map_update_elem()
and we reuse the upper 32-bit of the 64-bit flag for the timeout value.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Dongdong Wang <wangdongdong.6@bytedance.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
---
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   3 +-
 kernel/bpf/hashtab.c           | 244 ++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |   3 +-
 tools/include/uapi/linux/bpf.h |   1 +
 5 files changed, 248 insertions(+), 4 deletions(-)

Comments

Andrii Nakryiko Dec. 15, 2020, 7:27 p.m. UTC | #1
On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> From: Cong Wang <cong.wang@bytedance.com>
>
> This borrows the idea from conntrack and will be used for conntrack in
> bpf too. Each element in a timeout map has a user-specified timeout
> in secs, after it expires it will be automatically removed from the map.
>
> There are two cases here:
>
> 1. When the timeout map is idle, that is, no one updates or accesses it,
>    we rely on the idle work to scan the whole hash table and remove
>    these expired. The idle work is scheduled every 1 sec.

Would 1 second be a good period for a lot of cases? Probably would be
good to expand on what went into this decision.

>
> 2. When the timeout map is actively accessed, we could reach expired
>    elements before the idle work kicks in, we can simply skip them and
>    schedule another work to do the actual removal work. We avoid taking
>    locks on fast path.
>
> The timeout of each element can be set or updated via bpf_map_update_elem()
> and we reuse the upper 32-bit of the 64-bit flag for the timeout value.
>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Dongdong Wang <wangdongdong.6@bytedance.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
>  include/linux/bpf_types.h      |   1 +
>  include/uapi/linux/bpf.h       |   3 +-
>  kernel/bpf/hashtab.c           | 244 ++++++++++++++++++++++++++++++++-
>  kernel/bpf/syscall.c           |   3 +-
>  tools/include/uapi/linux/bpf.h |   1 +
>  5 files changed, 248 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 99f7fd657d87..00a3b17b6af2 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
>  #endif
>  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_TIMEOUT_HASH, htab_timeout_map_ops)
>
>  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
>  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 30b477a26482..dedb47bc3f52 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -158,6 +158,7 @@ enum bpf_map_type {
>         BPF_MAP_TYPE_RINGBUF,
>         BPF_MAP_TYPE_INODE_STORAGE,
>         BPF_MAP_TYPE_TASK_STORAGE,
> +       BPF_MAP_TYPE_TIMEOUT_HASH,
>  };
>
>  /* Note that tracing related programs such as
> @@ -393,7 +394,7 @@ enum bpf_link_type {
>   */
>  #define BPF_PSEUDO_CALL                1
>
> -/* flags for BPF_MAP_UPDATE_ELEM command */
> +/* flags for BPF_MAP_UPDATE_ELEM command, upper 32 bits are timeout */

timeout in what units of time?

>  enum {
>         BPF_ANY         = 0, /* create new element or update existing */
>         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index f0b7b54fa3a8..178cb376c397 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -8,6 +8,8 @@
>  #include <linux/filter.h>
>  #include <linux/rculist_nulls.h>
>  #include <linux/random.h>
> +#include <linux/llist.h>
> +#include <linux/workqueue.h>
>  #include <uapi/linux/btf.h>
>  #include <linux/rcupdate_trace.h>
>  #include "percpu_freelist.h"
> @@ -84,6 +86,8 @@ struct bucket {
>                 raw_spinlock_t raw_lock;
>                 spinlock_t     lock;
>         };
> +       struct llist_node gc_node;
> +       atomic_t pending;

HASH is an extremely frequently used type of map, and oftentimes with
a lot of entries/buckets. I don't think users of normal
BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
timeouts. So I think it's not appropriate to increase the size of the
struct bucket here.

>  };
>
>  #define HASHTAB_MAP_LOCK_COUNT 8
> @@ -104,6 +108,9 @@ struct bpf_htab {
>         u32 hashrnd;
>         struct lock_class_key lockdep_key;
>         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> +       struct llist_head gc_list;
> +       struct work_struct gc_work;
> +       struct delayed_work gc_idle_work;
>  };
>
>  /* each htab element is struct htab_elem + key + value */
> @@ -124,6 +131,7 @@ struct htab_elem {
>                 struct bpf_lru_node lru_node;
>         };
>         u32 hash;
> +       u64 expires;

time units? and similar concerns about wasting a lot of added memory
for not benefit to HASH users.

>         char key[] __aligned(8);
>  };
>
> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
>
>         for (i = 0; i < htab->n_buckets; i++) {
>                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> +               atomic_set(&htab->buckets[i].pending, 0);
>                 if (htab_use_raw_lock(htab)) {
>                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
>                         lockdep_set_class(&htab->buckets[i].raw_lock,
> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>         return 0;
>  }
>
> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> +{
> +       if (atomic_fetch_or(1, &b->pending))
> +               return;
> +       llist_add(&b->gc_node, &htab->gc_list);
> +       queue_work(system_unbound_wq, &htab->gc_work);
> +}

I'm concerned about each bucket being scheduled individually... And
similarly concerned that each instance of TIMEOUT_HASH will do its own
scheduling independently. Can you think about the way to have a
"global" gc/purging logic, and just make sure that buckets that need
processing would be just internally chained together. So the purging
routing would iterate all the scheduled hashmaps, and within each it
will have a linked list of buckets that need processing? And all that
is done just once each GC period. Not N times for N maps or N*M times
for N maps with M buckets in each.

> +
>  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
>  {
>         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> @@ -732,10 +749,13 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
>  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  {
>         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
>         struct hlist_nulls_head *head;
>         struct htab_elem *l, *next_l;
>         u32 hash, key_size;
> +       struct bucket *b;
>         int i = 0;
> +       u64 now;
>
>         WARN_ON_ONCE(!rcu_read_lock_held());
>
> @@ -746,7 +766,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>
>         hash = htab_map_hash(key, key_size, htab->hashrnd);
>
> -       head = select_bucket(htab, hash);
> +       b = __select_bucket(htab, hash);
> +       head = &b->head;
>
>         /* lookup the key */
>         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
> @@ -759,6 +780,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>                                   struct htab_elem, hash_node);
>
>         if (next_l) {
> +               if (is_timeout) {
> +                       now = get_jiffies_64();
> +                       if (time_after_eq64(now, next_l->expires)) {
> +                               htab_sched_gc(htab, b);
> +                               goto find_first_elem;
> +                       }
> +               }

this piece of logic is repeated verbatim many times, seems like a
helper function would make sense here

>                 /* if next elem in this hash list is non-zero, just return it */
>                 memcpy(next_key, next_l->key, key_size);
>                 return 0;
> @@ -771,12 +799,20 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
>  find_first_elem:
>         /* iterate over buckets */
>         for (; i < htab->n_buckets; i++) {
> -               head = select_bucket(htab, i);
> +               b = __select_bucket(htab, i);
> +               head = &b->head;
>
>                 /* pick first element in the bucket */
>                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
>                                           struct htab_elem, hash_node);
>                 if (next_l) {
> +                       if (is_timeout) {
> +                               now = get_jiffies_64();
> +                               if (time_after_eq64(now, next_l->expires)) {
> +                                       htab_sched_gc(htab, b);
> +                                       continue;
> +                               }
> +                       }
>                         /* if it's not empty, just return it */
>                         memcpy(next_key, next_l->key, key_size);
>                         return 0;
> @@ -975,18 +1011,31 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
>         return 0;
>  }
>
> +static u32 fetch_timeout(u64 *map_flags)
> +{
> +       u32 timeout = (*map_flags) >> 32;
> +
> +       *map_flags = (*map_flags) & 0xffffffff;
> +       return timeout;
> +}
> +
>  /* Called from syscall or from eBPF program */
>  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>                                 u64 map_flags)
>  {
>         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> +       bool timeout_map = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
>         struct htab_elem *l_new = NULL, *l_old;
>         struct hlist_nulls_head *head;
>         unsigned long flags;
>         struct bucket *b;
>         u32 key_size, hash;
> +       u32 timeout;
> +       u64 now;
>         int ret;
>
> +       timeout = fetch_timeout(&map_flags);
> +
>         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
>                 /* unknown flags */
>                 return -EINVAL;
> @@ -1042,6 +1091,10 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>                 copy_map_value_locked(map,
>                                       l_old->key + round_up(key_size, 8),
>                                       value, false);
> +               if (timeout_map) {
> +                       now = get_jiffies_64();
> +                       l_old->expires = now + HZ * timeout;
> +               }

Ok, so it seems timeout is at a second granularity. Would it make
sense to make it at millisecond granularity instead? I think
millisecond would be more powerful and allow more use cases, in the
long run. Micro- and nano-second granularity seems like an overkill,
though. And would reduce the max timeout to pretty small numbers. With
milliseconds, you still will get more than 23 days of timeout, which
seems to be plenty.


>                 ret = 0;
>                 goto err;
>         }
> @@ -1054,6 +1107,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
>                 goto err;
>         }
>
> +       if (timeout_map) {
> +               now = get_jiffies_64();
> +               if (l_old && time_after_eq64(now, l_old->expires))
> +                       htab_sched_gc(htab, b);
> +               l_new->expires = now + HZ * timeout;
> +       }
> +
>         /* add new element to the head of the list, so that
>          * concurrent search will find it before old elem
>          */
> @@ -2180,3 +2240,183 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
>         .map_btf_name = "bpf_htab",
>         .map_btf_id = &htab_of_maps_map_btf_id,
>  };
> +

[...]
Cong Wang Dec. 15, 2020, 8:06 p.m. UTC | #2
On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > From: Cong Wang <cong.wang@bytedance.com>
> >
> > This borrows the idea from conntrack and will be used for conntrack in
> > bpf too. Each element in a timeout map has a user-specified timeout
> > in secs, after it expires it will be automatically removed from the map.
> >
> > There are two cases here:
> >
> > 1. When the timeout map is idle, that is, no one updates or accesses it,
> >    we rely on the idle work to scan the whole hash table and remove
> >    these expired. The idle work is scheduled every 1 sec.
>
> Would 1 second be a good period for a lot of cases? Probably would be
> good to expand on what went into this decision.

Sure, because our granularity is 1 sec, I will add it into changelog.

>
> >
> > 2. When the timeout map is actively accessed, we could reach expired
> >    elements before the idle work kicks in, we can simply skip them and
> >    schedule another work to do the actual removal work. We avoid taking
> >    locks on fast path.
> >
> > The timeout of each element can be set or updated via bpf_map_update_elem()
> > and we reuse the upper 32-bit of the 64-bit flag for the timeout value.
> >
> > Cc: Alexei Starovoitov <ast@kernel.org>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Dongdong Wang <wangdongdong.6@bytedance.com>
> > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > ---
> >  include/linux/bpf_types.h      |   1 +
> >  include/uapi/linux/bpf.h       |   3 +-
> >  kernel/bpf/hashtab.c           | 244 ++++++++++++++++++++++++++++++++-
> >  kernel/bpf/syscall.c           |   3 +-
> >  tools/include/uapi/linux/bpf.h |   1 +
> >  5 files changed, 248 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index 99f7fd657d87..00a3b17b6af2 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
> >  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
> >  #endif
> >  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_TIMEOUT_HASH, htab_timeout_map_ops)
> >
> >  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
> >  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 30b477a26482..dedb47bc3f52 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -158,6 +158,7 @@ enum bpf_map_type {
> >         BPF_MAP_TYPE_RINGBUF,
> >         BPF_MAP_TYPE_INODE_STORAGE,
> >         BPF_MAP_TYPE_TASK_STORAGE,
> > +       BPF_MAP_TYPE_TIMEOUT_HASH,
> >  };
> >
> >  /* Note that tracing related programs such as
> > @@ -393,7 +394,7 @@ enum bpf_link_type {
> >   */
> >  #define BPF_PSEUDO_CALL                1
> >
> > -/* flags for BPF_MAP_UPDATE_ELEM command */
> > +/* flags for BPF_MAP_UPDATE_ELEM command, upper 32 bits are timeout */
>
> timeout in what units of time?

1 sec, should I also add it in this comment (and everywhere else)?

>
> >  enum {
> >         BPF_ANY         = 0, /* create new element or update existing */
> >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > index f0b7b54fa3a8..178cb376c397 100644
> > --- a/kernel/bpf/hashtab.c
> > +++ b/kernel/bpf/hashtab.c
> > @@ -8,6 +8,8 @@
> >  #include <linux/filter.h>
> >  #include <linux/rculist_nulls.h>
> >  #include <linux/random.h>
> > +#include <linux/llist.h>
> > +#include <linux/workqueue.h>
> >  #include <uapi/linux/btf.h>
> >  #include <linux/rcupdate_trace.h>
> >  #include "percpu_freelist.h"
> > @@ -84,6 +86,8 @@ struct bucket {
> >                 raw_spinlock_t raw_lock;
> >                 spinlock_t     lock;
> >         };
> > +       struct llist_node gc_node;
> > +       atomic_t pending;
>
> HASH is an extremely frequently used type of map, and oftentimes with
> a lot of entries/buckets. I don't think users of normal
> BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> timeouts. So I think it's not appropriate to increase the size of the
> struct bucket here.

I understand that, but what's a better way to do this? I can wrap it up
on top of struct bucket for sure, but it would need to change a lot of code.
So, basically code reuse vs. struct bucket size increase. ;)

>
> >  };
> >
> >  #define HASHTAB_MAP_LOCK_COUNT 8
> > @@ -104,6 +108,9 @@ struct bpf_htab {
> >         u32 hashrnd;
> >         struct lock_class_key lockdep_key;
> >         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> > +       struct llist_head gc_list;
> > +       struct work_struct gc_work;
> > +       struct delayed_work gc_idle_work;
> >  };
> >
> >  /* each htab element is struct htab_elem + key + value */
> > @@ -124,6 +131,7 @@ struct htab_elem {
> >                 struct bpf_lru_node lru_node;
> >         };
> >         u32 hash;
> > +       u64 expires;
>
> time units? and similar concerns about wasting a lot of added memory
> for not benefit to HASH users.

'expires' is in jiffies, I can add a comment here if necessary.

Similarly, please suggest how to expand struct htab_elem without changing
a lot of code. I also tried to find some hole in the struct, but I
couldn't, so I
ran out of ideas here.

>
> >         char key[] __aligned(8);
> >  };
> >
> > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> >
> >         for (i = 0; i < htab->n_buckets; i++) {
> >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > +               atomic_set(&htab->buckets[i].pending, 0);
> >                 if (htab_use_raw_lock(htab)) {
> >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> >         return 0;
> >  }
> >
> > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > +{
> > +       if (atomic_fetch_or(1, &b->pending))
> > +               return;
> > +       llist_add(&b->gc_node, &htab->gc_list);
> > +       queue_work(system_unbound_wq, &htab->gc_work);
> > +}
>
> I'm concerned about each bucket being scheduled individually... And
> similarly concerned that each instance of TIMEOUT_HASH will do its own
> scheduling independently. Can you think about the way to have a
> "global" gc/purging logic, and just make sure that buckets that need
> processing would be just internally chained together. So the purging
> routing would iterate all the scheduled hashmaps, and within each it
> will have a linked list of buckets that need processing? And all that
> is done just once each GC period. Not N times for N maps or N*M times
> for N maps with M buckets in each.

Our internal discussion went to the opposite actually, people here argued
one work is not sufficient for a hashtable because there would be millions
of entries (max_entries, which is also number of buckets). ;)

I chose one work per hash table because we could use map-in-map to divide
the millions of entries.

So, this really depends on how many maps and how many buckets in each
map. I tend to leave this as it is, because there is no way to satisfy
all of the
cases.

>
> > +
> >  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> >  {
> >         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> > @@ -732,10 +749,13 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> >  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >  {
> >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > +       bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> >         struct hlist_nulls_head *head;
> >         struct htab_elem *l, *next_l;
> >         u32 hash, key_size;
> > +       struct bucket *b;
> >         int i = 0;
> > +       u64 now;
> >
> >         WARN_ON_ONCE(!rcu_read_lock_held());
> >
> > @@ -746,7 +766,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >
> >         hash = htab_map_hash(key, key_size, htab->hashrnd);
> >
> > -       head = select_bucket(htab, hash);
> > +       b = __select_bucket(htab, hash);
> > +       head = &b->head;
> >
> >         /* lookup the key */
> >         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
> > @@ -759,6 +780,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >                                   struct htab_elem, hash_node);
> >
> >         if (next_l) {
> > +               if (is_timeout) {
> > +                       now = get_jiffies_64();
> > +                       if (time_after_eq64(now, next_l->expires)) {
> > +                               htab_sched_gc(htab, b);
> > +                               goto find_first_elem;
> > +                       }
> > +               }
>
> this piece of logic is repeated verbatim many times, seems like a
> helper function would make sense here

Except goto or continue, isn't it? ;) Please do share your ideas on
how to make it a helper for both goto and continue.


>
> >                 /* if next elem in this hash list is non-zero, just return it */
> >                 memcpy(next_key, next_l->key, key_size);
> >                 return 0;
> > @@ -771,12 +799,20 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> >  find_first_elem:
> >         /* iterate over buckets */
> >         for (; i < htab->n_buckets; i++) {
> > -               head = select_bucket(htab, i);
> > +               b = __select_bucket(htab, i);
> > +               head = &b->head;
> >
> >                 /* pick first element in the bucket */
> >                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
> >                                           struct htab_elem, hash_node);
> >                 if (next_l) {
> > +                       if (is_timeout) {
> > +                               now = get_jiffies_64();
> > +                               if (time_after_eq64(now, next_l->expires)) {
> > +                                       htab_sched_gc(htab, b);
> > +                                       continue;
> > +                               }
> > +                       }
> >                         /* if it's not empty, just return it */
> >                         memcpy(next_key, next_l->key, key_size);
> >                         return 0;
> > @@ -975,18 +1011,31 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
> >         return 0;
> >  }
> >
> > +static u32 fetch_timeout(u64 *map_flags)
> > +{
> > +       u32 timeout = (*map_flags) >> 32;
> > +
> > +       *map_flags = (*map_flags) & 0xffffffff;
> > +       return timeout;
> > +}
> > +
> >  /* Called from syscall or from eBPF program */
> >  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >                                 u64 map_flags)
> >  {
> >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > +       bool timeout_map = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> >         struct htab_elem *l_new = NULL, *l_old;
> >         struct hlist_nulls_head *head;
> >         unsigned long flags;
> >         struct bucket *b;
> >         u32 key_size, hash;
> > +       u32 timeout;
> > +       u64 now;
> >         int ret;
> >
> > +       timeout = fetch_timeout(&map_flags);
> > +
> >         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
> >                 /* unknown flags */
> >                 return -EINVAL;
> > @@ -1042,6 +1091,10 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> >                 copy_map_value_locked(map,
> >                                       l_old->key + round_up(key_size, 8),
> >                                       value, false);
> > +               if (timeout_map) {
> > +                       now = get_jiffies_64();
> > +                       l_old->expires = now + HZ * timeout;
> > +               }
>
> Ok, so it seems timeout is at a second granularity. Would it make
> sense to make it at millisecond granularity instead? I think
> millisecond would be more powerful and allow more use cases, in the
> long run. Micro- and nano-second granularity seems like an overkill,
> though. And would reduce the max timeout to pretty small numbers. With
> milliseconds, you still will get more than 23 days of timeout, which
> seems to be plenty.

Sure if you want to pay the price of scheduling the work more often...

For our own use case, second is sufficient. What use case do you have
for paying this price? I am happy to hear.

Thanks.
Andrii Nakryiko Dec. 15, 2020, 10:03 p.m. UTC | #3
On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > From: Cong Wang <cong.wang@bytedance.com>
> > >
> > > This borrows the idea from conntrack and will be used for conntrack in
> > > bpf too. Each element in a timeout map has a user-specified timeout
> > > in secs, after it expires it will be automatically removed from the map.
> > >
> > > There are two cases here:
> > >
> > > 1. When the timeout map is idle, that is, no one updates or accesses it,
> > >    we rely on the idle work to scan the whole hash table and remove
> > >    these expired. The idle work is scheduled every 1 sec.
> >
> > Would 1 second be a good period for a lot of cases? Probably would be
> > good to expand on what went into this decision.
>
> Sure, because our granularity is 1 sec, I will add it into changelog.
>

Granularity of a timeout is not that coupled with the period of
garbage collection. In this case, with 1 second period, you can have
some items not garbage collected for up to 2 seconds due to timing and
races. Just keep that in mind.

> >
> > >
> > > 2. When the timeout map is actively accessed, we could reach expired
> > >    elements before the idle work kicks in, we can simply skip them and
> > >    schedule another work to do the actual removal work. We avoid taking
> > >    locks on fast path.
> > >
> > > The timeout of each element can be set or updated via bpf_map_update_elem()
> > > and we reuse the upper 32-bit of the 64-bit flag for the timeout value.
> > >
> > > Cc: Alexei Starovoitov <ast@kernel.org>
> > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > Cc: Dongdong Wang <wangdongdong.6@bytedance.com>
> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > ---
> > >  include/linux/bpf_types.h      |   1 +
> > >  include/uapi/linux/bpf.h       |   3 +-
> > >  kernel/bpf/hashtab.c           | 244 ++++++++++++++++++++++++++++++++-
> > >  kernel/bpf/syscall.c           |   3 +-
> > >  tools/include/uapi/linux/bpf.h |   1 +
> > >  5 files changed, 248 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > > index 99f7fd657d87..00a3b17b6af2 100644
> > > --- a/include/linux/bpf_types.h
> > > +++ b/include/linux/bpf_types.h
> > > @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
> > >  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
> > >  #endif
> > >  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
> > > +BPF_MAP_TYPE(BPF_MAP_TYPE_TIMEOUT_HASH, htab_timeout_map_ops)
> > >
> > >  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
> > >  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 30b477a26482..dedb47bc3f52 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -158,6 +158,7 @@ enum bpf_map_type {
> > >         BPF_MAP_TYPE_RINGBUF,
> > >         BPF_MAP_TYPE_INODE_STORAGE,
> > >         BPF_MAP_TYPE_TASK_STORAGE,
> > > +       BPF_MAP_TYPE_TIMEOUT_HASH,
> > >  };
> > >
> > >  /* Note that tracing related programs such as
> > > @@ -393,7 +394,7 @@ enum bpf_link_type {
> > >   */
> > >  #define BPF_PSEUDO_CALL                1
> > >
> > > -/* flags for BPF_MAP_UPDATE_ELEM command */
> > > +/* flags for BPF_MAP_UPDATE_ELEM command, upper 32 bits are timeout */
> >
> > timeout in what units of time?
>
> 1 sec, should I also add it in this comment (and everywhere else)?

yes, please

>
> >
> > >  enum {
> > >         BPF_ANY         = 0, /* create new element or update existing */
> > >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > index f0b7b54fa3a8..178cb376c397 100644
> > > --- a/kernel/bpf/hashtab.c
> > > +++ b/kernel/bpf/hashtab.c
> > > @@ -8,6 +8,8 @@
> > >  #include <linux/filter.h>
> > >  #include <linux/rculist_nulls.h>
> > >  #include <linux/random.h>
> > > +#include <linux/llist.h>
> > > +#include <linux/workqueue.h>
> > >  #include <uapi/linux/btf.h>
> > >  #include <linux/rcupdate_trace.h>
> > >  #include "percpu_freelist.h"
> > > @@ -84,6 +86,8 @@ struct bucket {
> > >                 raw_spinlock_t raw_lock;
> > >                 spinlock_t     lock;
> > >         };
> > > +       struct llist_node gc_node;
> > > +       atomic_t pending;
> >
> > HASH is an extremely frequently used type of map, and oftentimes with
> > a lot of entries/buckets. I don't think users of normal
> > BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> > timeouts. So I think it's not appropriate to increase the size of the
> > struct bucket here.
>
> I understand that, but what's a better way to do this? I can wrap it up
> on top of struct bucket for sure, but it would need to change a lot of code.
> So, basically code reuse vs. struct bucket size increase. ;)

I think not paying potentially lots of memory for unused features
wins. Some struct embedding might work. Or just better code reuse.
Please think this through, don't wait for me to write the code for
you.

>
> >
> > >  };
> > >
> > >  #define HASHTAB_MAP_LOCK_COUNT 8
> > > @@ -104,6 +108,9 @@ struct bpf_htab {
> > >         u32 hashrnd;
> > >         struct lock_class_key lockdep_key;
> > >         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> > > +       struct llist_head gc_list;
> > > +       struct work_struct gc_work;
> > > +       struct delayed_work gc_idle_work;
> > >  };
> > >
> > >  /* each htab element is struct htab_elem + key + value */
> > > @@ -124,6 +131,7 @@ struct htab_elem {
> > >                 struct bpf_lru_node lru_node;
> > >         };
> > >         u32 hash;
> > > +       u64 expires;
> >
> > time units? and similar concerns about wasting a lot of added memory
> > for not benefit to HASH users.
>
> 'expires' is in jiffies, I can add a comment here if necessary.

I think it's really helpful, because everyone reading this would be
wondering and then jumping around the code to figure it out

>
> Similarly, please suggest how to expand struct htab_elem without changing
> a lot of code. I also tried to find some hole in the struct, but I
> couldn't, so I
> ran out of ideas here.

I mentioned above, you can have your own struct and embed htab_elem
inside. It might need some refactoring, of course.

>
> >
> > >         char key[] __aligned(8);
> > >  };
> > >
> > > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > >
> > >         for (i = 0; i < htab->n_buckets; i++) {
> > >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > +               atomic_set(&htab->buckets[i].pending, 0);
> > >                 if (htab_use_raw_lock(htab)) {
> > >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > >         return 0;
> > >  }
> > >
> > > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > +{
> > > +       if (atomic_fetch_or(1, &b->pending))
> > > +               return;
> > > +       llist_add(&b->gc_node, &htab->gc_list);
> > > +       queue_work(system_unbound_wq, &htab->gc_work);
> > > +}
> >
> > I'm concerned about each bucket being scheduled individually... And
> > similarly concerned that each instance of TIMEOUT_HASH will do its own
> > scheduling independently. Can you think about the way to have a
> > "global" gc/purging logic, and just make sure that buckets that need
> > processing would be just internally chained together. So the purging
> > routing would iterate all the scheduled hashmaps, and within each it
> > will have a linked list of buckets that need processing? And all that
> > is done just once each GC period. Not N times for N maps or N*M times
> > for N maps with M buckets in each.
>
> Our internal discussion went to the opposite actually, people here argued
> one work is not sufficient for a hashtable because there would be millions
> of entries (max_entries, which is also number of buckets). ;)

I was hoping that it's possible to expire elements without iterating
the entire hash table every single time, only items that need to be
processed. Hashed timing wheel is one way to do something like this,
kernel has to solve similar problems with timeouts as well, why not
taking inspiration there?

>
> I chose one work per hash table because we could use map-in-map to divide
> the millions of entries.
>
> So, this really depends on how many maps and how many buckets in each
> map. I tend to leave this as it is, because there is no way to satisfy
> all of the
> cases.

But I think some ways are better than others. Please consider all the
options, not just the simplest one.

>
> >
> > > +
> > >  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> > >  {
> > >         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> > > @@ -732,10 +749,13 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> > >  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > >  {
> > >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > > +       bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> > >         struct hlist_nulls_head *head;
> > >         struct htab_elem *l, *next_l;
> > >         u32 hash, key_size;
> > > +       struct bucket *b;
> > >         int i = 0;
> > > +       u64 now;
> > >
> > >         WARN_ON_ONCE(!rcu_read_lock_held());
> > >
> > > @@ -746,7 +766,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > >
> > >         hash = htab_map_hash(key, key_size, htab->hashrnd);
> > >
> > > -       head = select_bucket(htab, hash);
> > > +       b = __select_bucket(htab, hash);
> > > +       head = &b->head;
> > >
> > >         /* lookup the key */
> > >         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
> > > @@ -759,6 +780,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > >                                   struct htab_elem, hash_node);
> > >
> > >         if (next_l) {
> > > +               if (is_timeout) {
> > > +                       now = get_jiffies_64();
> > > +                       if (time_after_eq64(now, next_l->expires)) {
> > > +                               htab_sched_gc(htab, b);
> > > +                               goto find_first_elem;
> > > +                       }
> > > +               }
> >
> > this piece of logic is repeated verbatim many times, seems like a
> > helper function would make sense here
>
> Except goto or continue, isn't it? ;) Please do share your ideas on
> how to make it a helper for both goto and continue.

So there is no way to make it work like this:

if (htab_elem_expired(htab, next_l))
    goto find_first_elem;

?

>
>
> >
> > >                 /* if next elem in this hash list is non-zero, just return it */
> > >                 memcpy(next_key, next_l->key, key_size);
> > >                 return 0;
> > > @@ -771,12 +799,20 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > >  find_first_elem:
> > >         /* iterate over buckets */
> > >         for (; i < htab->n_buckets; i++) {
> > > -               head = select_bucket(htab, i);
> > > +               b = __select_bucket(htab, i);
> > > +               head = &b->head;
> > >
> > >                 /* pick first element in the bucket */
> > >                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
> > >                                           struct htab_elem, hash_node);
> > >                 if (next_l) {
> > > +                       if (is_timeout) {
> > > +                               now = get_jiffies_64();
> > > +                               if (time_after_eq64(now, next_l->expires)) {
> > > +                                       htab_sched_gc(htab, b);
> > > +                                       continue;
> > > +                               }
> > > +                       }
> > >                         /* if it's not empty, just return it */
> > >                         memcpy(next_key, next_l->key, key_size);
> > >                         return 0;
> > > @@ -975,18 +1011,31 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
> > >         return 0;
> > >  }
> > >
> > > +static u32 fetch_timeout(u64 *map_flags)
> > > +{
> > > +       u32 timeout = (*map_flags) >> 32;
> > > +
> > > +       *map_flags = (*map_flags) & 0xffffffff;
> > > +       return timeout;
> > > +}
> > > +
> > >  /* Called from syscall or from eBPF program */
> > >  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> > >                                 u64 map_flags)
> > >  {
> > >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > > +       bool timeout_map = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> > >         struct htab_elem *l_new = NULL, *l_old;
> > >         struct hlist_nulls_head *head;
> > >         unsigned long flags;
> > >         struct bucket *b;
> > >         u32 key_size, hash;
> > > +       u32 timeout;
> > > +       u64 now;
> > >         int ret;
> > >
> > > +       timeout = fetch_timeout(&map_flags);
> > > +
> > >         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
> > >                 /* unknown flags */
> > >                 return -EINVAL;
> > > @@ -1042,6 +1091,10 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> > >                 copy_map_value_locked(map,
> > >                                       l_old->key + round_up(key_size, 8),
> > >                                       value, false);
> > > +               if (timeout_map) {
> > > +                       now = get_jiffies_64();
> > > +                       l_old->expires = now + HZ * timeout;
> > > +               }
> >
> > Ok, so it seems timeout is at a second granularity. Would it make
> > sense to make it at millisecond granularity instead? I think
> > millisecond would be more powerful and allow more use cases, in the
> > long run. Micro- and nano-second granularity seems like an overkill,
> > though. And would reduce the max timeout to pretty small numbers. With
> > milliseconds, you still will get more than 23 days of timeout, which
> > seems to be plenty.
>
> Sure if you want to pay the price of scheduling the work more often...

See above about timer granularity and GC period. You can have
nanosecond precision timeout and still GC only once every seconds, as
an example. You are checking expiration on lookup, so it can be
handled very precisely. You don't have to GC 1000 times per second to
support millisecond granularity.

>
> For our own use case, second is sufficient. What use case do you have
> for paying this price? I am happy to hear.

I don't have a specific use case. But I also don't see the extra price
we need to pay. You are adding a new *generic* data structure to the
wide BPF infrastructure, so please consider implications beyond your
immediate use case.

>
> Thanks.
Daniel Borkmann Dec. 15, 2020, 11:23 p.m. UTC | #4
On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
> On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>
>> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
>> <andrii.nakryiko@gmail.com> wrote:
>>>
>>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>
>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>
>>>> This borrows the idea from conntrack and will be used for conntrack in
>>>> bpf too. Each element in a timeout map has a user-specified timeout
>>>> in secs, after it expires it will be automatically removed from the map.
[...]
>>>>          char key[] __aligned(8);
>>>>   };
>>>>
>>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
>>>>
>>>>          for (i = 0; i < htab->n_buckets; i++) {
>>>>                  INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
>>>> +               atomic_set(&htab->buckets[i].pending, 0);
>>>>                  if (htab_use_raw_lock(htab)) {
>>>>                          raw_spin_lock_init(&htab->buckets[i].raw_lock);
>>>>                          lockdep_set_class(&htab->buckets[i].raw_lock,
>>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>>>>          return 0;
>>>>   }
>>>>
>>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
>>>> +{
>>>> +       if (atomic_fetch_or(1, &b->pending))
>>>> +               return;
>>>> +       llist_add(&b->gc_node, &htab->gc_list);
>>>> +       queue_work(system_unbound_wq, &htab->gc_work);
>>>> +}
>>>
>>> I'm concerned about each bucket being scheduled individually... And
>>> similarly concerned that each instance of TIMEOUT_HASH will do its own
>>> scheduling independently. Can you think about the way to have a
>>> "global" gc/purging logic, and just make sure that buckets that need
>>> processing would be just internally chained together. So the purging
>>> routing would iterate all the scheduled hashmaps, and within each it
>>> will have a linked list of buckets that need processing? And all that
>>> is done just once each GC period. Not N times for N maps or N*M times
>>> for N maps with M buckets in each.
>>
>> Our internal discussion went to the opposite actually, people here argued
>> one work is not sufficient for a hashtable because there would be millions
>> of entries (max_entries, which is also number of buckets). ;)
> 
> I was hoping that it's possible to expire elements without iterating
> the entire hash table every single time, only items that need to be
> processed. Hashed timing wheel is one way to do something like this,
> kernel has to solve similar problems with timeouts as well, why not
> taking inspiration there?

Couldn't this map be coupled with LRU map for example through flag on map
creation so that the different LRU map flavors can be used with it? For BPF
CT use case we do rely on LRU map to purge 'inactive' entries once full. I
wonder if for that case you then still need to schedule a GC at all.. e.g.
if you hit the condition time_after_eq64(now, entry->expires) you'd just
re-link the expired element from the public htab to e.g. the LRU's local
CPU's free/pending-list instead.

Thanks,
Daniel
Cong Wang Dec. 16, 2020, 12:15 a.m. UTC | #5
On Tue, Dec 15, 2020 at 2:08 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > From: Cong Wang <cong.wang@bytedance.com>
> > > >
> > > > This borrows the idea from conntrack and will be used for conntrack in
> > > > bpf too. Each element in a timeout map has a user-specified timeout
> > > > in secs, after it expires it will be automatically removed from the map.
> > > >
> > > > There are two cases here:
> > > >
> > > > 1. When the timeout map is idle, that is, no one updates or accesses it,
> > > >    we rely on the idle work to scan the whole hash table and remove
> > > >    these expired. The idle work is scheduled every 1 sec.
> > >
> > > Would 1 second be a good period for a lot of cases? Probably would be
> > > good to expand on what went into this decision.
> >
> > Sure, because our granularity is 1 sec, I will add it into changelog.
> >
>
> Granularity of a timeout is not that coupled with the period of
> garbage collection. In this case, with 1 second period, you can have
> some items not garbage collected for up to 2 seconds due to timing and
> races. Just keep that in mind.

Well, it is. Let's say we add entries every ms and kick gc every sec, we
could end up with thousands of expired entries in hash map, which could
be a problem under memory pressure.

>
> > >
> > > >
> > > > 2. When the timeout map is actively accessed, we could reach expired
> > > >    elements before the idle work kicks in, we can simply skip them and
> > > >    schedule another work to do the actual removal work. We avoid taking
> > > >    locks on fast path.
> > > >
> > > > The timeout of each element can be set or updated via bpf_map_update_elem()
> > > > and we reuse the upper 32-bit of the 64-bit flag for the timeout value.
> > > >
> > > > Cc: Alexei Starovoitov <ast@kernel.org>
> > > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > > > Cc: Dongdong Wang <wangdongdong.6@bytedance.com>
> > > > Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> > > > ---
> > > >  include/linux/bpf_types.h      |   1 +
> > > >  include/uapi/linux/bpf.h       |   3 +-
> > > >  kernel/bpf/hashtab.c           | 244 ++++++++++++++++++++++++++++++++-
> > > >  kernel/bpf/syscall.c           |   3 +-
> > > >  tools/include/uapi/linux/bpf.h |   1 +
> > > >  5 files changed, 248 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > > > index 99f7fd657d87..00a3b17b6af2 100644
> > > > --- a/include/linux/bpf_types.h
> > > > +++ b/include/linux/bpf_types.h
> > > > @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
> > > >  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
> > > >  #endif
> > > >  BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
> > > > +BPF_MAP_TYPE(BPF_MAP_TYPE_TIMEOUT_HASH, htab_timeout_map_ops)
> > > >
> > > >  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
> > > >  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index 30b477a26482..dedb47bc3f52 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -158,6 +158,7 @@ enum bpf_map_type {
> > > >         BPF_MAP_TYPE_RINGBUF,
> > > >         BPF_MAP_TYPE_INODE_STORAGE,
> > > >         BPF_MAP_TYPE_TASK_STORAGE,
> > > > +       BPF_MAP_TYPE_TIMEOUT_HASH,
> > > >  };
> > > >
> > > >  /* Note that tracing related programs such as
> > > > @@ -393,7 +394,7 @@ enum bpf_link_type {
> > > >   */
> > > >  #define BPF_PSEUDO_CALL                1
> > > >
> > > > -/* flags for BPF_MAP_UPDATE_ELEM command */
> > > > +/* flags for BPF_MAP_UPDATE_ELEM command, upper 32 bits are timeout */
> > >
> > > timeout in what units of time?
> >
> > 1 sec, should I also add it in this comment (and everywhere else)?
>
> yes, please

Sure.

>
> >
> > >
> > > >  enum {
> > > >         BPF_ANY         = 0, /* create new element or update existing */
> > > >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > > index f0b7b54fa3a8..178cb376c397 100644
> > > > --- a/kernel/bpf/hashtab.c
> > > > +++ b/kernel/bpf/hashtab.c
> > > > @@ -8,6 +8,8 @@
> > > >  #include <linux/filter.h>
> > > >  #include <linux/rculist_nulls.h>
> > > >  #include <linux/random.h>
> > > > +#include <linux/llist.h>
> > > > +#include <linux/workqueue.h>
> > > >  #include <uapi/linux/btf.h>
> > > >  #include <linux/rcupdate_trace.h>
> > > >  #include "percpu_freelist.h"
> > > > @@ -84,6 +86,8 @@ struct bucket {
> > > >                 raw_spinlock_t raw_lock;
> > > >                 spinlock_t     lock;
> > > >         };
> > > > +       struct llist_node gc_node;
> > > > +       atomic_t pending;
> > >
> > > HASH is an extremely frequently used type of map, and oftentimes with
> > > a lot of entries/buckets. I don't think users of normal
> > > BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> > > timeouts. So I think it's not appropriate to increase the size of the
> > > struct bucket here.
> >
> > I understand that, but what's a better way to do this? I can wrap it up
> > on top of struct bucket for sure, but it would need to change a lot of code.
> > So, basically code reuse vs. struct bucket size increase. ;)
>
> I think not paying potentially lots of memory for unused features
> wins. Some struct embedding might work. Or just better code reuse.
> Please think this through, don't wait for me to write the code for
> you.

I perfectly understand this point, but other reviewers could easily argue
why not just reuse the existing hashmap code given they are pretty much
similar.

I personally have no problem duplicating the code, but I need to justify it,
right? :-/


>
> >
> > >
> > > >  };
> > > >
> > > >  #define HASHTAB_MAP_LOCK_COUNT 8
> > > > @@ -104,6 +108,9 @@ struct bpf_htab {
> > > >         u32 hashrnd;
> > > >         struct lock_class_key lockdep_key;
> > > >         int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
> > > > +       struct llist_head gc_list;
> > > > +       struct work_struct gc_work;
> > > > +       struct delayed_work gc_idle_work;
> > > >  };
> > > >
> > > >  /* each htab element is struct htab_elem + key + value */
> > > > @@ -124,6 +131,7 @@ struct htab_elem {
> > > >                 struct bpf_lru_node lru_node;
> > > >         };
> > > >         u32 hash;
> > > > +       u64 expires;
> > >
> > > time units? and similar concerns about wasting a lot of added memory
> > > for not benefit to HASH users.
> >
> > 'expires' is in jiffies, I can add a comment here if necessary.
>
> I think it's really helpful, because everyone reading this would be
> wondering and then jumping around the code to figure it out

Sure.

>
> >
> > Similarly, please suggest how to expand struct htab_elem without changing
> > a lot of code. I also tried to find some hole in the struct, but I
> > couldn't, so I
> > ran out of ideas here.
>
> I mentioned above, you can have your own struct and embed htab_elem
> inside. It might need some refactoring, of course.

So increasing 8 bytes of struct htab_elem is a solid reason to change
_potentially_ all of the hash map code? It does not sound solid to me,
at least it is arguable.

I also doubt I could really wrap up on top of htab_elem, as it assumes
key and value are stored at the end. And these structs are internal,
it is really hard to factor out.

>
> >
> > >
> > > >         char key[] __aligned(8);
> > > >  };
> > > >
> > > > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > >
> > > >         for (i = 0; i < htab->n_buckets; i++) {
> > > >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > > +               atomic_set(&htab->buckets[i].pending, 0);
> > > >                 if (htab_use_raw_lock(htab)) {
> > > >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > > > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > >         return 0;
> > > >  }
> > > >
> > > > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > > +{
> > > > +       if (atomic_fetch_or(1, &b->pending))
> > > > +               return;
> > > > +       llist_add(&b->gc_node, &htab->gc_list);
> > > > +       queue_work(system_unbound_wq, &htab->gc_work);
> > > > +}
> > >
> > > I'm concerned about each bucket being scheduled individually... And
> > > similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > scheduling independently. Can you think about the way to have a
> > > "global" gc/purging logic, and just make sure that buckets that need
> > > processing would be just internally chained together. So the purging
> > > routing would iterate all the scheduled hashmaps, and within each it
> > > will have a linked list of buckets that need processing? And all that
> > > is done just once each GC period. Not N times for N maps or N*M times
> > > for N maps with M buckets in each.
> >
> > Our internal discussion went to the opposite actually, people here argued
> > one work is not sufficient for a hashtable because there would be millions
> > of entries (max_entries, which is also number of buckets). ;)
>
> I was hoping that it's possible to expire elements without iterating
> the entire hash table every single time, only items that need to be
> processed. Hashed timing wheel is one way to do something like this,

How could we know which ones are expired without scanning the
whole table? They are clearly not sorted even within a bucket. Sorting
them with expiration? Slightly better, as we can just stop at the first
non-expired but with an expense of slowing down the update path.

> kernel has to solve similar problems with timeouts as well, why not
> taking inspiration there?

Mind to point out which similar problems in the kernel?

If you mean inspiration from conntrack, it is even worse, it uses multiple
locking and locks on fast path too. I also looked at xt_hashlimit, it is not
any better either.

>
> >
> > I chose one work per hash table because we could use map-in-map to divide
> > the millions of entries.
> >
> > So, this really depends on how many maps and how many buckets in each
> > map. I tend to leave this as it is, because there is no way to satisfy
> > all of the
> > cases.
>
> But I think some ways are better than others. Please consider all the
> options, not just the simplest one.

I _did_ consider multiple works per hash map carefully, like I said, it
could be workarounded with map-in-map and the locking would be more
complicated. Hence I pick this current implementation.

Simplicity also means less bugs, in case you do not like it. ;)

>
> >
> > >
> > > > +
> > > >  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> > > >  {
> > > >         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> > > > @@ -732,10 +749,13 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> > > >  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > >  {
> > > >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > > > +       bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> > > >         struct hlist_nulls_head *head;
> > > >         struct htab_elem *l, *next_l;
> > > >         u32 hash, key_size;
> > > > +       struct bucket *b;
> > > >         int i = 0;
> > > > +       u64 now;
> > > >
> > > >         WARN_ON_ONCE(!rcu_read_lock_held());
> > > >
> > > > @@ -746,7 +766,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > >
> > > >         hash = htab_map_hash(key, key_size, htab->hashrnd);
> > > >
> > > > -       head = select_bucket(htab, hash);
> > > > +       b = __select_bucket(htab, hash);
> > > > +       head = &b->head;
> > > >
> > > >         /* lookup the key */
> > > >         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
> > > > @@ -759,6 +780,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > >                                   struct htab_elem, hash_node);
> > > >
> > > >         if (next_l) {
> > > > +               if (is_timeout) {
> > > > +                       now = get_jiffies_64();
> > > > +                       if (time_after_eq64(now, next_l->expires)) {
> > > > +                               htab_sched_gc(htab, b);
> > > > +                               goto find_first_elem;
> > > > +                       }
> > > > +               }
> > >
> > > this piece of logic is repeated verbatim many times, seems like a
> > > helper function would make sense here
> >
> > Except goto or continue, isn't it? ;) Please do share your ideas on
> > how to make it a helper for both goto and continue.
>
> So there is no way to make it work like this:
>
> if (htab_elem_expired(htab, next_l))
>     goto find_first_elem;
>
> ?

Good idea, it also needs to pass in struct bucket.

>
> >
> >
> > >
> > > >                 /* if next elem in this hash list is non-zero, just return it */
> > > >                 memcpy(next_key, next_l->key, key_size);
> > > >                 return 0;
> > > > @@ -771,12 +799,20 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > >  find_first_elem:
> > > >         /* iterate over buckets */
> > > >         for (; i < htab->n_buckets; i++) {
> > > > -               head = select_bucket(htab, i);
> > > > +               b = __select_bucket(htab, i);
> > > > +               head = &b->head;
> > > >
> > > >                 /* pick first element in the bucket */
> > > >                 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
> > > >                                           struct htab_elem, hash_node);
> > > >                 if (next_l) {
> > > > +                       if (is_timeout) {
> > > > +                               now = get_jiffies_64();
> > > > +                               if (time_after_eq64(now, next_l->expires)) {
> > > > +                                       htab_sched_gc(htab, b);
> > > > +                                       continue;
> > > > +                               }
> > > > +                       }
> > > >                         /* if it's not empty, just return it */
> > > >                         memcpy(next_key, next_l->key, key_size);
> > > >                         return 0;
> > > > @@ -975,18 +1011,31 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
> > > >         return 0;
> > > >  }
> > > >
> > > > +static u32 fetch_timeout(u64 *map_flags)
> > > > +{
> > > > +       u32 timeout = (*map_flags) >> 32;
> > > > +
> > > > +       *map_flags = (*map_flags) & 0xffffffff;
> > > > +       return timeout;
> > > > +}
> > > > +
> > > >  /* Called from syscall or from eBPF program */
> > > >  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> > > >                                 u64 map_flags)
> > > >  {
> > > >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > > > +       bool timeout_map = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> > > >         struct htab_elem *l_new = NULL, *l_old;
> > > >         struct hlist_nulls_head *head;
> > > >         unsigned long flags;
> > > >         struct bucket *b;
> > > >         u32 key_size, hash;
> > > > +       u32 timeout;
> > > > +       u64 now;
> > > >         int ret;
> > > >
> > > > +       timeout = fetch_timeout(&map_flags);
> > > > +
> > > >         if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
> > > >                 /* unknown flags */
> > > >                 return -EINVAL;
> > > > @@ -1042,6 +1091,10 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
> > > >                 copy_map_value_locked(map,
> > > >                                       l_old->key + round_up(key_size, 8),
> > > >                                       value, false);
> > > > +               if (timeout_map) {
> > > > +                       now = get_jiffies_64();
> > > > +                       l_old->expires = now + HZ * timeout;
> > > > +               }
> > >
> > > Ok, so it seems timeout is at a second granularity. Would it make
> > > sense to make it at millisecond granularity instead? I think
> > > millisecond would be more powerful and allow more use cases, in the
> > > long run. Micro- and nano-second granularity seems like an overkill,
> > > though. And would reduce the max timeout to pretty small numbers. With
> > > milliseconds, you still will get more than 23 days of timeout, which
> > > seems to be plenty.
> >
> > Sure if you want to pay the price of scheduling the work more often...
>
> See above about timer granularity and GC period. You can have
> nanosecond precision timeout and still GC only once every seconds, as
> an example. You are checking expiration on lookup, so it can be
> handled very precisely. You don't have to GC 1000 times per second to
> support millisecond granularity.

Like I said, if memory were not a problem, we could schedule it once per
hour too. But I believe memory matters here. ;)


> > For our own use case, second is sufficient. What use case do you have
> > for paying this price? I am happy to hear.
>
> I don't have a specific use case. But I also don't see the extra price
> we need to pay. You are adding a new *generic* data structure to the
> wide BPF infrastructure, so please consider implications beyond your
> immediate use case.

I have considered it, just not able to find any better way to make everyone
happy. If I choose not to increase struct bucket/htab_elem, I may have to
duplicate or change a lot more hash map code. If I choose to increase it,
regular map users could get an overhead. See the trouble? :)

Thanks.
Cong Wang Dec. 16, 2020, 12:22 a.m. UTC | #6
On Tue, Dec 15, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
> > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>
> >> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> >> <andrii.nakryiko@gmail.com> wrote:
> >>>
> >>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>>
> >>>> From: Cong Wang <cong.wang@bytedance.com>
> >>>>
> >>>> This borrows the idea from conntrack and will be used for conntrack in
> >>>> bpf too. Each element in a timeout map has a user-specified timeout
> >>>> in secs, after it expires it will be automatically removed from the map.
> [...]
> >>>>          char key[] __aligned(8);
> >>>>   };
> >>>>
> >>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> >>>>
> >>>>          for (i = 0; i < htab->n_buckets; i++) {
> >>>>                  INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> >>>> +               atomic_set(&htab->buckets[i].pending, 0);
> >>>>                  if (htab_use_raw_lock(htab)) {
> >>>>                          raw_spin_lock_init(&htab->buckets[i].raw_lock);
> >>>>                          lockdep_set_class(&htab->buckets[i].raw_lock,
> >>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> >>>>          return 0;
> >>>>   }
> >>>>
> >>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> >>>> +{
> >>>> +       if (atomic_fetch_or(1, &b->pending))
> >>>> +               return;
> >>>> +       llist_add(&b->gc_node, &htab->gc_list);
> >>>> +       queue_work(system_unbound_wq, &htab->gc_work);
> >>>> +}
> >>>
> >>> I'm concerned about each bucket being scheduled individually... And
> >>> similarly concerned that each instance of TIMEOUT_HASH will do its own
> >>> scheduling independently. Can you think about the way to have a
> >>> "global" gc/purging logic, and just make sure that buckets that need
> >>> processing would be just internally chained together. So the purging
> >>> routing would iterate all the scheduled hashmaps, and within each it
> >>> will have a linked list of buckets that need processing? And all that
> >>> is done just once each GC period. Not N times for N maps or N*M times
> >>> for N maps with M buckets in each.
> >>
> >> Our internal discussion went to the opposite actually, people here argued
> >> one work is not sufficient for a hashtable because there would be millions
> >> of entries (max_entries, which is also number of buckets). ;)
> >
> > I was hoping that it's possible to expire elements without iterating
> > the entire hash table every single time, only items that need to be
> > processed. Hashed timing wheel is one way to do something like this,
> > kernel has to solve similar problems with timeouts as well, why not
> > taking inspiration there?
>
> Couldn't this map be coupled with LRU map for example through flag on map
> creation so that the different LRU map flavors can be used with it? For BPF
> CT use case we do rely on LRU map to purge 'inactive' entries once full. I
> wonder if for that case you then still need to schedule a GC at all.. e.g.
> if you hit the condition time_after_eq64(now, entry->expires) you'd just
> re-link the expired element from the public htab to e.g. the LRU's local
> CPU's free/pending-list instead.

I doubt we can use size as a limit to kick off GC or LRU, it must be
time-based. And in case of idle, there has to be an async GC, right?

Thanks.
Alexei Starovoitov Dec. 16, 2020, 1:14 a.m. UTC | #7
On Tue, Dec 15, 2020 at 04:22:21PM -0800, Cong Wang wrote:
> On Tue, Dec 15, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >
> > On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
> > > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >>
> > >> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > >> <andrii.nakryiko@gmail.com> wrote:
> > >>>
> > >>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >>>>
> > >>>> From: Cong Wang <cong.wang@bytedance.com>
> > >>>>
> > >>>> This borrows the idea from conntrack and will be used for conntrack in
> > >>>> bpf too. Each element in a timeout map has a user-specified timeout
> > >>>> in secs, after it expires it will be automatically removed from the map.
> > [...]
> > >>>>          char key[] __aligned(8);
> > >>>>   };
> > >>>>
> > >>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > >>>>
> > >>>>          for (i = 0; i < htab->n_buckets; i++) {
> > >>>>                  INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > >>>> +               atomic_set(&htab->buckets[i].pending, 0);
> > >>>>                  if (htab_use_raw_lock(htab)) {
> > >>>>                          raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > >>>>                          lockdep_set_class(&htab->buckets[i].raw_lock,
> > >>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > >>>>          return 0;
> > >>>>   }
> > >>>>
> > >>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > >>>> +{
> > >>>> +       if (atomic_fetch_or(1, &b->pending))
> > >>>> +               return;
> > >>>> +       llist_add(&b->gc_node, &htab->gc_list);
> > >>>> +       queue_work(system_unbound_wq, &htab->gc_work);
> > >>>> +}
> > >>>
> > >>> I'm concerned about each bucket being scheduled individually... And
> > >>> similarly concerned that each instance of TIMEOUT_HASH will do its own
> > >>> scheduling independently. Can you think about the way to have a
> > >>> "global" gc/purging logic, and just make sure that buckets that need
> > >>> processing would be just internally chained together. So the purging
> > >>> routing would iterate all the scheduled hashmaps, and within each it
> > >>> will have a linked list of buckets that need processing? And all that
> > >>> is done just once each GC period. Not N times for N maps or N*M times
> > >>> for N maps with M buckets in each.
> > >>
> > >> Our internal discussion went to the opposite actually, people here argued
> > >> one work is not sufficient for a hashtable because there would be millions
> > >> of entries (max_entries, which is also number of buckets). ;)
> > >
> > > I was hoping that it's possible to expire elements without iterating
> > > the entire hash table every single time, only items that need to be
> > > processed. Hashed timing wheel is one way to do something like this,
> > > kernel has to solve similar problems with timeouts as well, why not
> > > taking inspiration there?
> >
> > Couldn't this map be coupled with LRU map for example through flag on map
> > creation so that the different LRU map flavors can be used with it? For BPF
> > CT use case we do rely on LRU map to purge 'inactive' entries once full. I
> > wonder if for that case you then still need to schedule a GC at all.. e.g.
> > if you hit the condition time_after_eq64(now, entry->expires) you'd just
> > re-link the expired element from the public htab to e.g. the LRU's local
> > CPU's free/pending-list instead.
> 
> I doubt we can use size as a limit to kick off GC or LRU, it must be
> time-based. And in case of idle, there has to be an async GC, right?

Why does it have to be time based?
Why LRU alone is not enough?
People implemented conntracker in bpf using LRU map.
Anything extra can be added on top from user space
which can easily copy with 1 sec granularity.
Say the kernel does GC and deletes htab entries.
How user space will know that it's gone? There would need to be
an event sent to user space when entry is being deleted by the kernel.
But then such event will be racy. Instead when timers and expirations
are done by user space everything is in sync.
Cong Wang Dec. 16, 2020, 2:10 a.m. UTC | #8
On Tue, Dec 15, 2020 at 5:14 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 04:22:21PM -0800, Cong Wang wrote:
> > On Tue, Dec 15, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> > >
> > > On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
> > > > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >>
> > > >> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > > >> <andrii.nakryiko@gmail.com> wrote:
> > > >>>
> > > >>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >>>>
> > > >>>> From: Cong Wang <cong.wang@bytedance.com>
> > > >>>>
> > > >>>> This borrows the idea from conntrack and will be used for conntrack in
> > > >>>> bpf too. Each element in a timeout map has a user-specified timeout
> > > >>>> in secs, after it expires it will be automatically removed from the map.
> > > [...]
> > > >>>>          char key[] __aligned(8);
> > > >>>>   };
> > > >>>>
> > > >>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > >>>>
> > > >>>>          for (i = 0; i < htab->n_buckets; i++) {
> > > >>>>                  INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > >>>> +               atomic_set(&htab->buckets[i].pending, 0);
> > > >>>>                  if (htab_use_raw_lock(htab)) {
> > > >>>>                          raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > >>>>                          lockdep_set_class(&htab->buckets[i].raw_lock,
> > > >>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > >>>>          return 0;
> > > >>>>   }
> > > >>>>
> > > >>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > >>>> +{
> > > >>>> +       if (atomic_fetch_or(1, &b->pending))
> > > >>>> +               return;
> > > >>>> +       llist_add(&b->gc_node, &htab->gc_list);
> > > >>>> +       queue_work(system_unbound_wq, &htab->gc_work);
> > > >>>> +}
> > > >>>
> > > >>> I'm concerned about each bucket being scheduled individually... And
> > > >>> similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > >>> scheduling independently. Can you think about the way to have a
> > > >>> "global" gc/purging logic, and just make sure that buckets that need
> > > >>> processing would be just internally chained together. So the purging
> > > >>> routing would iterate all the scheduled hashmaps, and within each it
> > > >>> will have a linked list of buckets that need processing? And all that
> > > >>> is done just once each GC period. Not N times for N maps or N*M times
> > > >>> for N maps with M buckets in each.
> > > >>
> > > >> Our internal discussion went to the opposite actually, people here argued
> > > >> one work is not sufficient for a hashtable because there would be millions
> > > >> of entries (max_entries, which is also number of buckets). ;)
> > > >
> > > > I was hoping that it's possible to expire elements without iterating
> > > > the entire hash table every single time, only items that need to be
> > > > processed. Hashed timing wheel is one way to do something like this,
> > > > kernel has to solve similar problems with timeouts as well, why not
> > > > taking inspiration there?
> > >
> > > Couldn't this map be coupled with LRU map for example through flag on map
> > > creation so that the different LRU map flavors can be used with it? For BPF
> > > CT use case we do rely on LRU map to purge 'inactive' entries once full. I
> > > wonder if for that case you then still need to schedule a GC at all.. e.g.
> > > if you hit the condition time_after_eq64(now, entry->expires) you'd just
> > > re-link the expired element from the public htab to e.g. the LRU's local
> > > CPU's free/pending-list instead.
> >
> > I doubt we can use size as a limit to kick off GC or LRU, it must be
> > time-based. And in case of idle, there has to be an async GC, right?
>
> Why does it have to be time based?

Because it is how a session timeouts? For instance, CT uses
nf_conntrack_udp_timeout to timeout UDP sessions. Or are we going
to redefine conntrack?

> Why LRU alone is not enough?
> People implemented conntracker in bpf using LRU map.

Sure, people also implement CT on native hash map too and timeout
with user-space timers. ;)

> Anything extra can be added on top from user space
> which can easily copy with 1 sec granularity.

The problem is never about granularity, it is about how efficient we can
GC. User-space has to scan the whole table one by one, while the kernel
can just do this behind the scene with a much lower overhead.

Let's say we arm a timer for each entry in user-space, it requires a syscall
and locking buckets each time for each entry. Kernel could do it without
any additional syscall and batching. Like I said above, we could have
millions of entries, so the overhead would be big in this scenario.

> Say the kernel does GC and deletes htab entries.
> How user space will know that it's gone? There would need to be

By a lookup.

> an event sent to user space when entry is being deleted by the kernel.
> But then such event will be racy. Instead when timers and expirations
> are done by user space everything is in sync.

Why there has to be an event?

Thanks.
Alexei Starovoitov Dec. 16, 2020, 2:35 a.m. UTC | #9
On Tue, Dec 15, 2020 at 6:10 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 5:14 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Dec 15, 2020 at 04:22:21PM -0800, Cong Wang wrote:
> > > On Tue, Dec 15, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> > > >
> > > > On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
> > > > > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >>
> > > > >> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > > > >> <andrii.nakryiko@gmail.com> wrote:
> > > > >>>
> > > > >>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >>>>
> > > > >>>> From: Cong Wang <cong.wang@bytedance.com>
> > > > >>>>
> > > > >>>> This borrows the idea from conntrack and will be used for conntrack in
> > > > >>>> bpf too. Each element in a timeout map has a user-specified timeout
> > > > >>>> in secs, after it expires it will be automatically removed from the map.
> > > > [...]
> > > > >>>>          char key[] __aligned(8);
> > > > >>>>   };
> > > > >>>>
> > > > >>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > > >>>>
> > > > >>>>          for (i = 0; i < htab->n_buckets; i++) {
> > > > >>>>                  INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > > >>>> +               atomic_set(&htab->buckets[i].pending, 0);
> > > > >>>>                  if (htab_use_raw_lock(htab)) {
> > > > >>>>                          raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > > >>>>                          lockdep_set_class(&htab->buckets[i].raw_lock,
> > > > >>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > > >>>>          return 0;
> > > > >>>>   }
> > > > >>>>
> > > > >>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > > >>>> +{
> > > > >>>> +       if (atomic_fetch_or(1, &b->pending))
> > > > >>>> +               return;
> > > > >>>> +       llist_add(&b->gc_node, &htab->gc_list);
> > > > >>>> +       queue_work(system_unbound_wq, &htab->gc_work);
> > > > >>>> +}
> > > > >>>
> > > > >>> I'm concerned about each bucket being scheduled individually... And
> > > > >>> similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > > >>> scheduling independently. Can you think about the way to have a
> > > > >>> "global" gc/purging logic, and just make sure that buckets that need
> > > > >>> processing would be just internally chained together. So the purging
> > > > >>> routing would iterate all the scheduled hashmaps, and within each it
> > > > >>> will have a linked list of buckets that need processing? And all that
> > > > >>> is done just once each GC period. Not N times for N maps or N*M times
> > > > >>> for N maps with M buckets in each.
> > > > >>
> > > > >> Our internal discussion went to the opposite actually, people here argued
> > > > >> one work is not sufficient for a hashtable because there would be millions
> > > > >> of entries (max_entries, which is also number of buckets). ;)
> > > > >
> > > > > I was hoping that it's possible to expire elements without iterating
> > > > > the entire hash table every single time, only items that need to be
> > > > > processed. Hashed timing wheel is one way to do something like this,
> > > > > kernel has to solve similar problems with timeouts as well, why not
> > > > > taking inspiration there?
> > > >
> > > > Couldn't this map be coupled with LRU map for example through flag on map
> > > > creation so that the different LRU map flavors can be used with it? For BPF
> > > > CT use case we do rely on LRU map to purge 'inactive' entries once full. I
> > > > wonder if for that case you then still need to schedule a GC at all.. e.g.
> > > > if you hit the condition time_after_eq64(now, entry->expires) you'd just
> > > > re-link the expired element from the public htab to e.g. the LRU's local
> > > > CPU's free/pending-list instead.
> > >
> > > I doubt we can use size as a limit to kick off GC or LRU, it must be
> > > time-based. And in case of idle, there has to be an async GC, right?
> >
> > Why does it have to be time based?
>
> Because it is how a session timeouts? For instance, CT uses
> nf_conntrack_udp_timeout to timeout UDP sessions. Or are we going
> to redefine conntrack?

hmm. I see no reason to re-implement conntrack as-is in bpf.

> > Why LRU alone is not enough?
> > People implemented conntracker in bpf using LRU map.
>
> Sure, people also implement CT on native hash map too and timeout
> with user-space timers. ;)

exactly. what's wrong with that?
Perfectly fine way to do CT.

> > Anything extra can be added on top from user space
> > which can easily copy with 1 sec granularity.
>
> The problem is never about granularity, it is about how efficient we can
> GC. User-space has to scan the whole table one by one, while the kernel
> can just do this behind the scene with a much lower overhead.
>
> Let's say we arm a timer for each entry in user-space, it requires a syscall
> and locking buckets each time for each entry. Kernel could do it without
> any additional syscall and batching. Like I said above, we could have
> millions of entries, so the overhead would be big in this scenario.

and the user space can pick any other implementation instead
of trivial entry by entry gc with timer.

> > Say the kernel does GC and deletes htab entries.
> > How user space will know that it's gone? There would need to be
>
> By a lookup.
>
> > an event sent to user space when entry is being deleted by the kernel.
> > But then such event will be racy. Instead when timers and expirations
> > are done by user space everything is in sync.
>
> Why there has to be an event?

because when any production worthy implementation moves
past the prototype stage there is something that user space needs to keep
as well. Sometimes the bpf map in the kernel is alone.
But a lot of times there is a user space mirror of the map in c++ or golang
with the same key where user space keeps extra data.
David Laight Dec. 16, 2020, 10:38 a.m. UTC | #10
From: Alexei Starovoitov
> Sent: 16 December 2020 02:36
...
> > The problem is never about granularity, it is about how efficient we can
> > GC. User-space has to scan the whole table one by one, while the kernel
> > can just do this behind the scene with a much lower overhead.
> >
> > Let's say we arm a timer for each entry in user-space, it requires a syscall
> > and locking buckets each time for each entry. Kernel could do it without
> > any additional syscall and batching. Like I said above, we could have
> > millions of entries, so the overhead would be big in this scenario.
> 
> and the user space can pick any other implementation instead
> of trivial entry by entry gc with timer.

The kernel can also gc entries when scanning hash lists during insert
(or even during lookup if not using rw locks).

Apart from the memory use there isn't really a problem having timed-out
entries in the hash table if nothing is looking at them.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Andrii Nakryiko Dec. 16, 2020, 6:35 p.m. UTC | #11
On Tue, Dec 15, 2020 at 4:15 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 2:08 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > From: Cong Wang <cong.wang@bytedance.com>
> > > > >
> > > > > This borrows the idea from conntrack and will be used for conntrack in
> > > > > bpf too. Each element in a timeout map has a user-specified timeout
> > > > > in secs, after it expires it will be automatically removed from the map.
> > > > >
> > > > > There are two cases here:
> > > > >
> > > > > 1. When the timeout map is idle, that is, no one updates or accesses it,
> > > > >    we rely on the idle work to scan the whole hash table and remove
> > > > >    these expired. The idle work is scheduled every 1 sec.
> > > >
> > > > Would 1 second be a good period for a lot of cases? Probably would be
> > > > good to expand on what went into this decision.
> > >
> > > Sure, because our granularity is 1 sec, I will add it into changelog.
> > >
> >
> > Granularity of a timeout is not that coupled with the period of
> > garbage collection. In this case, with 1 second period, you can have
> > some items not garbage collected for up to 2 seconds due to timing and
> > races. Just keep that in mind.
>
> Well, it is. Let's say we add entries every ms and kick gc every sec, we
> could end up with thousands of expired entries in hash map, which could
> be a problem under memory pressure.

You can have the same thousands of entries expired with 1 second
timeout granularity, so not sure what point you are making. Think
about entries being added 1 every millisecond with 1 second timeout.
So at time +1ms you have 1 message with timeout at +1001ms, at +2ms,
you have 2 messages, one expiring at +1001ms and another at +1002ms.
So when you 1 second period GC kicks in at, say, +1000ms, it discards
nothing. By the time it kicks in second time at +2000ms, you are going
to expire 1000items, but you could have expired 500 at +1500ms, if
your period was 500ms, for example. With a 200ms period, you'd have at
most 200 not expired entries.

This is why I'm saying granularity of timeout units is not coupled
with the GC period. I hope this makes it clearer. More granular
timeout units give more flexibility and power to users without
changing anything else.

But relying purely on GC period is bad, because with more frequent
updates you can accumulate almost arbitrary number of expired entries
between two GC passes. No matter the timeout granularity.

>
> >

[...]

>
> >
> > >
> > > >
> > > > >  enum {
> > > > >         BPF_ANY         = 0, /* create new element or update existing */
> > > > >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > > > index f0b7b54fa3a8..178cb376c397 100644
> > > > > --- a/kernel/bpf/hashtab.c
> > > > > +++ b/kernel/bpf/hashtab.c
> > > > > @@ -8,6 +8,8 @@
> > > > >  #include <linux/filter.h>
> > > > >  #include <linux/rculist_nulls.h>
> > > > >  #include <linux/random.h>
> > > > > +#include <linux/llist.h>
> > > > > +#include <linux/workqueue.h>
> > > > >  #include <uapi/linux/btf.h>
> > > > >  #include <linux/rcupdate_trace.h>
> > > > >  #include "percpu_freelist.h"
> > > > > @@ -84,6 +86,8 @@ struct bucket {
> > > > >                 raw_spinlock_t raw_lock;
> > > > >                 spinlock_t     lock;
> > > > >         };
> > > > > +       struct llist_node gc_node;
> > > > > +       atomic_t pending;
> > > >
> > > > HASH is an extremely frequently used type of map, and oftentimes with
> > > > a lot of entries/buckets. I don't think users of normal
> > > > BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> > > > timeouts. So I think it's not appropriate to increase the size of the
> > > > struct bucket here.
> > >
> > > I understand that, but what's a better way to do this? I can wrap it up
> > > on top of struct bucket for sure, but it would need to change a lot of code.
> > > So, basically code reuse vs. struct bucket size increase. ;)
> >
> > I think not paying potentially lots of memory for unused features
> > wins. Some struct embedding might work. Or just better code reuse.
> > Please think this through, don't wait for me to write the code for
> > you.
>
> I perfectly understand this point, but other reviewers could easily argue
> why not just reuse the existing hashmap code given they are pretty much
> similar.
>
> I personally have no problem duplicating the code, but I need to justify it,
> right? :-/

Minimize duplication of the code, no one said copy/paste all the code.
But memory bloat is a real problem and should be justification enough
to at least consider other options.

[...]

>
> >
> > >
> > > Similarly, please suggest how to expand struct htab_elem without changing
> > > a lot of code. I also tried to find some hole in the struct, but I
> > > couldn't, so I
> > > ran out of ideas here.
> >
> > I mentioned above, you can have your own struct and embed htab_elem
> > inside. It might need some refactoring, of course.
>
> So increasing 8 bytes of struct htab_elem is a solid reason to change
> _potentially_ all of the hash map code? It does not sound solid to me,
> at least it is arguable.

8 bytes for htab_elem and 16 bytes for bucket (which equals
max_entries). Solid enough for me. But I certainly hope that not all
of the hashmap code would need to be changed.

>
> I also doubt I could really wrap up on top of htab_elem, as it assumes
> key and value are stored at the end. And these structs are internal,
> it is really hard to factor out.

I didn't do the exercise of trying to implement this, so discussing
this is a bit meaningless at this time. But

struct htab_elem_timeout {
  ... my timeout related stuff ...
  struct htab_elem elem;
};

would preserve that property.


>
> >
> > >
> > > >
> > > > >         char key[] __aligned(8);
> > > > >  };
> > > > >
> > > > > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > > >
> > > > >         for (i = 0; i < htab->n_buckets; i++) {
> > > > >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > > > +               atomic_set(&htab->buckets[i].pending, 0);
> > > > >                 if (htab_use_raw_lock(htab)) {
> > > > >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > > >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > > > > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > > >         return 0;
> > > > >  }
> > > > >
> > > > > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > > > +{
> > > > > +       if (atomic_fetch_or(1, &b->pending))
> > > > > +               return;
> > > > > +       llist_add(&b->gc_node, &htab->gc_list);
> > > > > +       queue_work(system_unbound_wq, &htab->gc_work);
> > > > > +}
> > > >
> > > > I'm concerned about each bucket being scheduled individually... And
> > > > similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > > scheduling independently. Can you think about the way to have a
> > > > "global" gc/purging logic, and just make sure that buckets that need
> > > > processing would be just internally chained together. So the purging
> > > > routing would iterate all the scheduled hashmaps, and within each it
> > > > will have a linked list of buckets that need processing? And all that
> > > > is done just once each GC period. Not N times for N maps or N*M times
> > > > for N maps with M buckets in each.
> > >
> > > Our internal discussion went to the opposite actually, people here argued
> > > one work is not sufficient for a hashtable because there would be millions
> > > of entries (max_entries, which is also number of buckets). ;)
> >
> > I was hoping that it's possible to expire elements without iterating
> > the entire hash table every single time, only items that need to be
> > processed. Hashed timing wheel is one way to do something like this,
>
> How could we know which ones are expired without scanning the
> whole table? They are clearly not sorted even within a bucket. Sorting
> them with expiration? Slightly better, as we can just stop at the first
> non-expired but with an expense of slowing down the update path.

Have you looked up "hashed timing wheel"?

>
> > kernel has to solve similar problems with timeouts as well, why not
> > taking inspiration there?
>
> Mind to point out which similar problems in the kernel?
>
> If you mean inspiration from conntrack, it is even worse, it uses multiple
> locking and locks on fast path too. I also looked at xt_hashlimit, it is not
> any better either.

I was thinking about epoll timeouts, but I don't know all the
implementation details, of course. My point was that kernel solves the
problem of maintaining a lot of uncorrelated timeouts already (epoll,
timeout signals, etc).

>
> >
> > >
> > > I chose one work per hash table because we could use map-in-map to divide
> > > the millions of entries.
> > >
> > > So, this really depends on how many maps and how many buckets in each
> > > map. I tend to leave this as it is, because there is no way to satisfy
> > > all of the
> > > cases.
> >
> > But I think some ways are better than others. Please consider all the
> > options, not just the simplest one.
>
> I _did_ consider multiple works per hash map carefully, like I said, it
> could be workarounded with map-in-map and the locking would be more
> complicated. Hence I pick this current implementation.
>
> Simplicity also means less bugs, in case you do not like it. ;)
>

There is simple and there is simplistic... I'm just cautioning against
falling into the second category.

> >
> > >
> > > >
> > > > > +
> > > > >  static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
> > > > >  {
> > > > >         bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> > > > > @@ -732,10 +749,13 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
> > > > >  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > > >  {
> > > > >         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> > > > > +       bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
> > > > >         struct hlist_nulls_head *head;
> > > > >         struct htab_elem *l, *next_l;
> > > > >         u32 hash, key_size;
> > > > > +       struct bucket *b;
> > > > >         int i = 0;
> > > > > +       u64 now;
> > > > >
> > > > >         WARN_ON_ONCE(!rcu_read_lock_held());
> > > > >
> > > > > @@ -746,7 +766,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > > >
> > > > >         hash = htab_map_hash(key, key_size, htab->hashrnd);
> > > > >
> > > > > -       head = select_bucket(htab, hash);
> > > > > +       b = __select_bucket(htab, hash);
> > > > > +       head = &b->head;
> > > > >
> > > > >         /* lookup the key */
> > > > >         l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
> > > > > @@ -759,6 +780,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
> > > > >                                   struct htab_elem, hash_node);
> > > > >
> > > > >         if (next_l) {
> > > > > +               if (is_timeout) {
> > > > > +                       now = get_jiffies_64();
> > > > > +                       if (time_after_eq64(now, next_l->expires)) {
> > > > > +                               htab_sched_gc(htab, b);
> > > > > +                               goto find_first_elem;
> > > > > +                       }
> > > > > +               }
> > > >
> > > > this piece of logic is repeated verbatim many times, seems like a
> > > > helper function would make sense here
> > >
> > > Except goto or continue, isn't it? ;) Please do share your ideas on
> > > how to make it a helper for both goto and continue.
> >
> > So there is no way to make it work like this:
> >
> > if (htab_elem_expired(htab, next_l))
> >     goto find_first_elem;
> >
> > ?
>
> Good idea, it also needs to pass in struct bucket.
>

Great, glad that it is doable after all.

> >
> > >
> > >
> > > >

[...]

> > > > > +               if (timeout_map) {
> > > > > +                       now = get_jiffies_64();
> > > > > +                       l_old->expires = now + HZ * timeout;
> > > > > +               }
> > > >
> > > > Ok, so it seems timeout is at a second granularity. Would it make
> > > > sense to make it at millisecond granularity instead? I think
> > > > millisecond would be more powerful and allow more use cases, in the
> > > > long run. Micro- and nano-second granularity seems like an overkill,
> > > > though. And would reduce the max timeout to pretty small numbers. With
> > > > milliseconds, you still will get more than 23 days of timeout, which
> > > > seems to be plenty.
> > >
> > > Sure if you want to pay the price of scheduling the work more often...
> >
> > See above about timer granularity and GC period. You can have
> > nanosecond precision timeout and still GC only once every seconds, as
> > an example. You are checking expiration on lookup, so it can be
> > handled very precisely. You don't have to GC 1000 times per second to
> > support millisecond granularity.
>
> Like I said, if memory were not a problem, we could schedule it once per
> hour too. But I believe memory matters here. ;)

See above. Your 1 second granularity timeouts don't solve any
problems, you just chose to ignore those problems.

>
>
> > > For our own use case, second is sufficient. What use case do you have
> > > for paying this price? I am happy to hear.
> >
> > I don't have a specific use case. But I also don't see the extra price
> > we need to pay. You are adding a new *generic* data structure to the
> > wide BPF infrastructure, so please consider implications beyond your
> > immediate use case.
>
> I have considered it, just not able to find any better way to make everyone
> happy. If I choose not to increase struct bucket/htab_elem, I may have to
> duplicate or change a lot more hash map code. If I choose to increase it,
> regular map users could get an overhead. See the trouble? :)

I certainly do see the trouble, which is why I'm discussing more
options with you.

>
> Thanks.
Cong Wang Dec. 17, 2020, 5:06 a.m. UTC | #12
On Tue, Dec 15, 2020 at 6:35 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 6:10 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > Sure, people also implement CT on native hash map too and timeout
> > with user-space timers. ;)
>
> exactly. what's wrong with that?
> Perfectly fine way to do CT.

Seriously? When we have 8 millions of entries in a hash map, it is
definitely seriously wrong to purge entries one by one from user-space.

In case you don't believe me, take a look at what cilium CT GC does,
which is precisely expires entries one by one in user-space:

https://github.com/cilium/cilium/blob/0f57292c0037ee23ba1ca2f9abb113f36a664645/pkg/bpf/map_linux.go#L728
https://github.com/cilium/cilium/blob/master/pkg/maps/ctmap/ctmap.go#L398

and of course what people complained:

https://github.com/cilium/cilium/issues/5048

>
> > > Anything extra can be added on top from user space
> > > which can easily copy with 1 sec granularity.
> >
> > The problem is never about granularity, it is about how efficient we can
> > GC. User-space has to scan the whole table one by one, while the kernel
> > can just do this behind the scene with a much lower overhead.
> >
> > Let's say we arm a timer for each entry in user-space, it requires a syscall
> > and locking buckets each time for each entry. Kernel could do it without
> > any additional syscall and batching. Like I said above, we could have
> > millions of entries, so the overhead would be big in this scenario.
>
> and the user space can pick any other implementation instead
> of trivial entry by entry gc with timer.

Unless they don't have to, right? With timeout implementation in kernel,
user space does not need to invent any wheel.


>
> > > Say the kernel does GC and deletes htab entries.
> > > How user space will know that it's gone? There would need to be
> >
> > By a lookup.
> >
> > > an event sent to user space when entry is being deleted by the kernel.
> > > But then such event will be racy. Instead when timers and expirations
> > > are done by user space everything is in sync.
> >
> > Why there has to be an event?
>
> because when any production worthy implementation moves
> past the prototype stage there is something that user space needs to keep
> as well. Sometimes the bpf map in the kernel is alone.
> But a lot of times there is a user space mirror of the map in c++ or golang
> with the same key where user space keeps extra data.

So... what event does LRU map send when it deletes a different entry
when the map is full?

Thanks.
Cong Wang Dec. 17, 2020, 6:29 a.m. UTC | #13
On Wed, Dec 16, 2020 at 10:35 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Dec 15, 2020 at 4:15 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Tue, Dec 15, 2020 at 2:08 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > >
> > > > On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > > > <andrii.nakryiko@gmail.com> wrote:
> > > > >
> > > > > On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > >
> > > > > > From: Cong Wang <cong.wang@bytedance.com>
> > > > > >
> > > > > > This borrows the idea from conntrack and will be used for conntrack in
> > > > > > bpf too. Each element in a timeout map has a user-specified timeout
> > > > > > in secs, after it expires it will be automatically removed from the map.
> > > > > >
> > > > > > There are two cases here:
> > > > > >
> > > > > > 1. When the timeout map is idle, that is, no one updates or accesses it,
> > > > > >    we rely on the idle work to scan the whole hash table and remove
> > > > > >    these expired. The idle work is scheduled every 1 sec.
> > > > >
> > > > > Would 1 second be a good period for a lot of cases? Probably would be
> > > > > good to expand on what went into this decision.
> > > >
> > > > Sure, because our granularity is 1 sec, I will add it into changelog.
> > > >
> > >
> > > Granularity of a timeout is not that coupled with the period of
> > > garbage collection. In this case, with 1 second period, you can have
> > > some items not garbage collected for up to 2 seconds due to timing and
> > > races. Just keep that in mind.
> >
> > Well, it is. Let's say we add entries every ms and kick gc every sec, we
> > could end up with thousands of expired entries in hash map, which could
> > be a problem under memory pressure.
>
> You can have the same thousands of entries expired with 1 second
> timeout granularity, so not sure what point you are making. Think

It is impossible to have expired entries within 1 sec when the granularity
is 1 sec and GC interval is 1 sec (which is my current code).

> about entries being added 1 every millisecond with 1 second timeout.
> So at time +1ms you have 1 message with timeout at +1001ms, at +2ms,
> you have 2 messages, one expiring at +1001ms and another at +1002ms.
> So when you 1 second period GC kicks in at, say, +1000ms, it discards
> nothing. By the time it kicks in second time at +2000ms, you are going
> to expire 1000items, but you could have expired 500 at +1500ms, if
> your period was 500ms, for example. With a 200ms period, you'd have at
> most 200 not expired entries.
>
> This is why I'm saying granularity of timeout units is not coupled
> with the GC period. I hope this makes it clearer. More granular
> timeout units give more flexibility and power to users without
> changing anything else.

The point is the smaller the granularity is, the more entries could be
expired within a GC schedule interval. This is an issue when we have
a burst within an interval and it would cause memory pressure during this
interval.

>
> But relying purely on GC period is bad, because with more frequent
> updates you can accumulate almost arbitrary number of expired entries
> between two GC passes. No matter the timeout granularity.

True, this is why xt_hashlimit simply lets users pick the gc interval. And
in fact, my initial implementation of timeout map exposed gc interval to
user too, I removed it when I learned the granularity can be just 1 sec
for conntrack use case (see Documentation/networking/nf_conntrack-sysctl.txt).

Anyway, it is not a simple task to just convert sec to ms here, the gc
interval matters more when the granularity becomes smaller.


> > >
> > > >
> > > > >
> > > > > >  enum {
> > > > > >         BPF_ANY         = 0, /* create new element or update existing */
> > > > > >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > > > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > > > > index f0b7b54fa3a8..178cb376c397 100644
> > > > > > --- a/kernel/bpf/hashtab.c
> > > > > > +++ b/kernel/bpf/hashtab.c
> > > > > > @@ -8,6 +8,8 @@
> > > > > >  #include <linux/filter.h>
> > > > > >  #include <linux/rculist_nulls.h>
> > > > > >  #include <linux/random.h>
> > > > > > +#include <linux/llist.h>
> > > > > > +#include <linux/workqueue.h>
> > > > > >  #include <uapi/linux/btf.h>
> > > > > >  #include <linux/rcupdate_trace.h>
> > > > > >  #include "percpu_freelist.h"
> > > > > > @@ -84,6 +86,8 @@ struct bucket {
> > > > > >                 raw_spinlock_t raw_lock;
> > > > > >                 spinlock_t     lock;
> > > > > >         };
> > > > > > +       struct llist_node gc_node;
> > > > > > +       atomic_t pending;
> > > > >
> > > > > HASH is an extremely frequently used type of map, and oftentimes with
> > > > > a lot of entries/buckets. I don't think users of normal
> > > > > BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> > > > > timeouts. So I think it's not appropriate to increase the size of the
> > > > > struct bucket here.
> > > >
> > > > I understand that, but what's a better way to do this? I can wrap it up
> > > > on top of struct bucket for sure, but it would need to change a lot of code.
> > > > So, basically code reuse vs. struct bucket size increase. ;)
> > >
> > > I think not paying potentially lots of memory for unused features
> > > wins. Some struct embedding might work. Or just better code reuse.
> > > Please think this through, don't wait for me to write the code for
> > > you.
> >
> > I perfectly understand this point, but other reviewers could easily argue
> > why not just reuse the existing hashmap code given they are pretty much
> > similar.
> >
> > I personally have no problem duplicating the code, but I need to justify it,
> > right? :-/
>
> Minimize duplication of the code, no one said copy/paste all the code.
> But memory bloat is a real problem and should be justification enough
> to at least consider other options.

Sure, I have no problem with this. The question is how do we balance?
Is rewriting 200 lines of code to save 8 bytes of each entry acceptable?
What about rewriting 2000 lines of code? Do people prefer to review 200
or 2000 (or whatever number) lines of code? Or people just want a
minimal change for easier reviews?

>
> [...]
>
> >
> > >
> > > >
> > > > Similarly, please suggest how to expand struct htab_elem without changing
> > > > a lot of code. I also tried to find some hole in the struct, but I
> > > > couldn't, so I
> > > > ran out of ideas here.
> > >
> > > I mentioned above, you can have your own struct and embed htab_elem
> > > inside. It might need some refactoring, of course.
> >
> > So increasing 8 bytes of struct htab_elem is a solid reason to change
> > _potentially_ all of the hash map code? It does not sound solid to me,
> > at least it is arguable.
>
> 8 bytes for htab_elem and 16 bytes for bucket (which equals
> max_entries). Solid enough for me. But I certainly hope that not all
> of the hashmap code would need to be changed.

I can try, but I have to say the worst case is I have to duplicate most the
hashmap lookup/update code. I'd estimate few hundreds more lines of
code. And I want to make sure this is also acceptable to reviewers, in case
of wasting my time.

>
> >
> > I also doubt I could really wrap up on top of htab_elem, as it assumes
> > key and value are stored at the end. And these structs are internal,
> > it is really hard to factor out.
>
> I didn't do the exercise of trying to implement this, so discussing
> this is a bit meaningless at this time. But
>
> struct htab_elem_timeout {
>   ... my timeout related stuff ...
>   struct htab_elem elem;
> };
>
> would preserve that property.

Sure, but you know once the type changes, literally all the code has
to be changed. We can not just pass timeout_elem->elem to
htab_map_update_elem() as it is just internal.

>
>
> >
> > >
> > > >
> > > > >
> > > > > >         char key[] __aligned(8);
> > > > > >  };
> > > > > >
> > > > > > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > > > >
> > > > > >         for (i = 0; i < htab->n_buckets; i++) {
> > > > > >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > > > > +               atomic_set(&htab->buckets[i].pending, 0);
> > > > > >                 if (htab_use_raw_lock(htab)) {
> > > > > >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > > > >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > > > > > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > > > >         return 0;
> > > > > >  }
> > > > > >
> > > > > > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > > > > +{
> > > > > > +       if (atomic_fetch_or(1, &b->pending))
> > > > > > +               return;
> > > > > > +       llist_add(&b->gc_node, &htab->gc_list);
> > > > > > +       queue_work(system_unbound_wq, &htab->gc_work);
> > > > > > +}
> > > > >
> > > > > I'm concerned about each bucket being scheduled individually... And
> > > > > similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > > > scheduling independently. Can you think about the way to have a
> > > > > "global" gc/purging logic, and just make sure that buckets that need
> > > > > processing would be just internally chained together. So the purging
> > > > > routing would iterate all the scheduled hashmaps, and within each it
> > > > > will have a linked list of buckets that need processing? And all that
> > > > > is done just once each GC period. Not N times for N maps or N*M times
> > > > > for N maps with M buckets in each.
> > > >
> > > > Our internal discussion went to the opposite actually, people here argued
> > > > one work is not sufficient for a hashtable because there would be millions
> > > > of entries (max_entries, which is also number of buckets). ;)
> > >
> > > I was hoping that it's possible to expire elements without iterating
> > > the entire hash table every single time, only items that need to be
> > > processed. Hashed timing wheel is one way to do something like this,
> >
> > How could we know which ones are expired without scanning the
> > whole table? They are clearly not sorted even within a bucket. Sorting
> > them with expiration? Slightly better, as we can just stop at the first
> > non-expired but with an expense of slowing down the update path.
>
> Have you looked up "hashed timing wheel"?

I thought you mean the timer wheel in Linux kernel, I can not immediately
see how it can be used for our GC here. I guess you suggest to to link
each entry based on expiration sec?

>
> >
> > > kernel has to solve similar problems with timeouts as well, why not
> > > taking inspiration there?
> >
> > Mind to point out which similar problems in the kernel?
> >
> > If you mean inspiration from conntrack, it is even worse, it uses multiple
> > locking and locks on fast path too. I also looked at xt_hashlimit, it is not
> > any better either.
>
> I was thinking about epoll timeouts, but I don't know all the
> implementation details, of course. My point was that kernel solves the
> problem of maintaining a lot of uncorrelated timeouts already (epoll,
> timeout signals, etc).

They possibly just use a timer or delayed work etc.. This is why I only
look at similar timeout hash maps.

Thanks!
Cong Wang Dec. 17, 2020, 9:14 p.m. UTC | #14
On Wed, Dec 16, 2020 at 10:29 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Dec 16, 2020 at 10:35 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> > Minimize duplication of the code, no one said copy/paste all the code.
> > But memory bloat is a real problem and should be justification enough
> > to at least consider other options.
>
> Sure, I have no problem with this. The question is how do we balance?
> Is rewriting 200 lines of code to save 8 bytes of each entry acceptable?
> What about rewriting 2000 lines of code? Do people prefer to review 200
> or 2000 (or whatever number) lines of code? Or people just want a
> minimal change for easier reviews?

No worry any more. I manage to find some way to reuse the existing
members, that is lru_node. So the end result is putting gc stuff into
the union with lru_node without increasing the size of htab_elem.
And of course, without duplicating/refactoring regular htab code.

Thanks.
Daniel Borkmann Dec. 17, 2020, 10:39 p.m. UTC | #15
On 12/16/20 1:22 AM, Cong Wang wrote:
> On Tue, Dec 15, 2020 at 3:23 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> On 12/15/20 11:03 PM, Andrii Nakryiko wrote:
>>> On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>
>>>> On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
>>>> <andrii.nakryiko@gmail.com> wrote:
>>>>>
>>>>> On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>>>>>>
>>>>>> From: Cong Wang <cong.wang@bytedance.com>
>>>>>>
>>>>>> This borrows the idea from conntrack and will be used for conntrack in
>>>>>> bpf too. Each element in a timeout map has a user-specified timeout
>>>>>> in secs, after it expires it will be automatically removed from the map.
>> [...]
>>>>>>           char key[] __aligned(8);
>>>>>>    };
>>>>>>
>>>>>> @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
>>>>>>
>>>>>>           for (i = 0; i < htab->n_buckets; i++) {
>>>>>>                   INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
>>>>>> +               atomic_set(&htab->buckets[i].pending, 0);
>>>>>>                   if (htab_use_raw_lock(htab)) {
>>>>>>                           raw_spin_lock_init(&htab->buckets[i].raw_lock);
>>>>>>                           lockdep_set_class(&htab->buckets[i].raw_lock,
>>>>>> @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
>>>>>>           return 0;
>>>>>>    }
>>>>>>
>>>>>> +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
>>>>>> +{
>>>>>> +       if (atomic_fetch_or(1, &b->pending))
>>>>>> +               return;
>>>>>> +       llist_add(&b->gc_node, &htab->gc_list);
>>>>>> +       queue_work(system_unbound_wq, &htab->gc_work);
>>>>>> +}
>>>>>
>>>>> I'm concerned about each bucket being scheduled individually... And
>>>>> similarly concerned that each instance of TIMEOUT_HASH will do its own
>>>>> scheduling independently. Can you think about the way to have a
>>>>> "global" gc/purging logic, and just make sure that buckets that need
>>>>> processing would be just internally chained together. So the purging
>>>>> routing would iterate all the scheduled hashmaps, and within each it
>>>>> will have a linked list of buckets that need processing? And all that
>>>>> is done just once each GC period. Not N times for N maps or N*M times
>>>>> for N maps with M buckets in each.
>>>>
>>>> Our internal discussion went to the opposite actually, people here argued
>>>> one work is not sufficient for a hashtable because there would be millions
>>>> of entries (max_entries, which is also number of buckets). ;)
>>>
>>> I was hoping that it's possible to expire elements without iterating
>>> the entire hash table every single time, only items that need to be
>>> processed. Hashed timing wheel is one way to do something like this,
>>> kernel has to solve similar problems with timeouts as well, why not
>>> taking inspiration there?
>>
>> Couldn't this map be coupled with LRU map for example through flag on map
>> creation so that the different LRU map flavors can be used with it? For BPF
>> CT use case we do rely on LRU map to purge 'inactive' entries once full. I
>> wonder if for that case you then still need to schedule a GC at all.. e.g.
>> if you hit the condition time_after_eq64(now, entry->expires) you'd just
>> re-link the expired element from the public htab to e.g. the LRU's local
>> CPU's free/pending-list instead.
> 
> I doubt we can use size as a limit to kick off GC or LRU, it must be
> time-based. And in case of idle, there has to be an async GC, right?

I was thinking no GC at all, meaning, above mentioned re-linking of expired
elements would be done lazily e.g. whenever we walk a given bucket (e.g. on
lookup/update/delete) under the assumption we don't have deep lists there to
keep the time comparison not too expensive and that element migration has low
overhead (e.g. move to local CPU free-list).
Andrii Nakryiko Dec. 18, 2020, 7:13 p.m. UTC | #16
On Wed, Dec 16, 2020 at 10:29 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Dec 16, 2020 at 10:35 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Dec 15, 2020 at 4:15 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > >
> > > On Tue, Dec 15, 2020 at 2:08 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Tue, Dec 15, 2020 at 12:06 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > >
> > > > > On Tue, Dec 15, 2020 at 11:27 AM Andrii Nakryiko
> > > > > <andrii.nakryiko@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Dec 14, 2020 at 12:17 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > > > > > >
> > > > > > > From: Cong Wang <cong.wang@bytedance.com>
> > > > > > >
> > > > > > > This borrows the idea from conntrack and will be used for conntrack in
> > > > > > > bpf too. Each element in a timeout map has a user-specified timeout
> > > > > > > in secs, after it expires it will be automatically removed from the map.
> > > > > > >
> > > > > > > There are two cases here:
> > > > > > >
> > > > > > > 1. When the timeout map is idle, that is, no one updates or accesses it,
> > > > > > >    we rely on the idle work to scan the whole hash table and remove
> > > > > > >    these expired. The idle work is scheduled every 1 sec.
> > > > > >
> > > > > > Would 1 second be a good period for a lot of cases? Probably would be
> > > > > > good to expand on what went into this decision.
> > > > >
> > > > > Sure, because our granularity is 1 sec, I will add it into changelog.
> > > > >
> > > >
> > > > Granularity of a timeout is not that coupled with the period of
> > > > garbage collection. In this case, with 1 second period, you can have
> > > > some items not garbage collected for up to 2 seconds due to timing and
> > > > races. Just keep that in mind.
> > >
> > > Well, it is. Let's say we add entries every ms and kick gc every sec, we
> > > could end up with thousands of expired entries in hash map, which could
> > > be a problem under memory pressure.
> >
> > You can have the same thousands of entries expired with 1 second
> > timeout granularity, so not sure what point you are making. Think
>
> It is impossible to have expired entries within 1 sec when the granularity
> is 1 sec and GC interval is 1 sec (which is my current code).

What are you talking about? Have you read an example I described
below? Have you seen your __htab_timeout_map_lookup_elem()
implementation?

l_new->expires = now + HZ * timeout;

and

time_after_eq64(get_jiffies_64(), l->expires)

Your expiration is in jiffies internally, which is most definitely
more granular than one second, with HZ=1000 it's going to be in
milliseconds. So even if you allow to specify the timeout with only 1
second granularity, they will expire at "jiffies granularity". Don't
forget that expiration time is a function of when you updated/inserted
the element (jiffy granularity) and timeout (a second granularity), so
results in jiffy granularity.

Just because it's not physically removed from the hashmap (because GC
hasn't happened) it doesn't mean it didn't expire. You won't return
the item if its l_new->expires signals expiration, so for the BPF
program and user-space that element doesn't exist anymore. So all I'm
asking is to allow to specify a timeout in milliseconds, that's the
only change. All the other concerns stay exactly the same.

>
> > about entries being added 1 every millisecond with 1 second timeout.
> > So at time +1ms you have 1 message with timeout at +1001ms, at +2ms,
> > you have 2 messages, one expiring at +1001ms and another at +1002ms.
> > So when you 1 second period GC kicks in at, say, +1000ms, it discards
> > nothing. By the time it kicks in second time at +2000ms, you are going
> > to expire 1000items, but you could have expired 500 at +1500ms, if
> > your period was 500ms, for example. With a 200ms period, you'd have at
> > most 200 not expired entries.
> >
> > This is why I'm saying granularity of timeout units is not coupled
> > with the GC period. I hope this makes it clearer. More granular
> > timeout units give more flexibility and power to users without
> > changing anything else.
>
> The point is the smaller the granularity is, the more entries could be
> expired within a GC schedule interval. This is an issue when we have
> a burst within an interval and it would cause memory pressure during this
> interval.

Not at all. See above.

>
> >
> > But relying purely on GC period is bad, because with more frequent
> > updates you can accumulate almost arbitrary number of expired entries
> > between two GC passes. No matter the timeout granularity.
>
> True, this is why xt_hashlimit simply lets users pick the gc interval. And
> in fact, my initial implementation of timeout map exposed gc interval to
> user too, I removed it when I learned the granularity can be just 1 sec
> for conntrack use case (see Documentation/networking/nf_conntrack-sysctl.txt).
>
> Anyway, it is not a simple task to just convert sec to ms here, the gc
> interval matters more when the granularity becomes smaller.

GC interval matters, it just has nothing to do with the timeout
granularity. And relying only on GC period is also problematic, as me
and Daniel pointed out already.

>
>
> > > >
> > > > >
> > > > > >
> > > > > > >  enum {
> > > > > > >         BPF_ANY         = 0, /* create new element or update existing */
> > > > > > >         BPF_NOEXIST     = 1, /* create new element if it didn't exist */
> > > > > > > diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> > > > > > > index f0b7b54fa3a8..178cb376c397 100644
> > > > > > > --- a/kernel/bpf/hashtab.c
> > > > > > > +++ b/kernel/bpf/hashtab.c
> > > > > > > @@ -8,6 +8,8 @@
> > > > > > >  #include <linux/filter.h>
> > > > > > >  #include <linux/rculist_nulls.h>
> > > > > > >  #include <linux/random.h>
> > > > > > > +#include <linux/llist.h>
> > > > > > > +#include <linux/workqueue.h>
> > > > > > >  #include <uapi/linux/btf.h>
> > > > > > >  #include <linux/rcupdate_trace.h>
> > > > > > >  #include "percpu_freelist.h"
> > > > > > > @@ -84,6 +86,8 @@ struct bucket {
> > > > > > >                 raw_spinlock_t raw_lock;
> > > > > > >                 spinlock_t     lock;
> > > > > > >         };
> > > > > > > +       struct llist_node gc_node;
> > > > > > > +       atomic_t pending;
> > > > > >
> > > > > > HASH is an extremely frequently used type of map, and oftentimes with
> > > > > > a lot of entries/buckets. I don't think users of normal
> > > > > > BPF_MAP_TYPE_HASH should pay the price of way more niche hashmap with
> > > > > > timeouts. So I think it's not appropriate to increase the size of the
> > > > > > struct bucket here.
> > > > >
> > > > > I understand that, but what's a better way to do this? I can wrap it up
> > > > > on top of struct bucket for sure, but it would need to change a lot of code.
> > > > > So, basically code reuse vs. struct bucket size increase. ;)
> > > >
> > > > I think not paying potentially lots of memory for unused features
> > > > wins. Some struct embedding might work. Or just better code reuse.
> > > > Please think this through, don't wait for me to write the code for
> > > > you.
> > >
> > > I perfectly understand this point, but other reviewers could easily argue
> > > why not just reuse the existing hashmap code given they are pretty much
> > > similar.
> > >
> > > I personally have no problem duplicating the code, but I need to justify it,
> > > right? :-/
> >
> > Minimize duplication of the code, no one said copy/paste all the code.
> > But memory bloat is a real problem and should be justification enough
> > to at least consider other options.
>
> Sure, I have no problem with this. The question is how do we balance?
> Is rewriting 200 lines of code to save 8 bytes of each entry acceptable?
> What about rewriting 2000 lines of code? Do people prefer to review 200
> or 2000 (or whatever number) lines of code? Or people just want a
> minimal change for easier reviews?

If I were "people" I'd want a robust functionality first and foremost.
Minimizing the amount of code is good, but secondary to the main goal.

>
> >
> > [...]
> >
> > >
> > > >
> > > > >
> > > > > Similarly, please suggest how to expand struct htab_elem without changing
> > > > > a lot of code. I also tried to find some hole in the struct, but I
> > > > > couldn't, so I
> > > > > ran out of ideas here.
> > > >
> > > > I mentioned above, you can have your own struct and embed htab_elem
> > > > inside. It might need some refactoring, of course.
> > >
> > > So increasing 8 bytes of struct htab_elem is a solid reason to change
> > > _potentially_ all of the hash map code? It does not sound solid to me,
> > > at least it is arguable.
> >
> > 8 bytes for htab_elem and 16 bytes for bucket (which equals
> > max_entries). Solid enough for me. But I certainly hope that not all
> > of the hashmap code would need to be changed.
>
> I can try, but I have to say the worst case is I have to duplicate most the
> hashmap lookup/update code. I'd estimate few hundreds more lines of
> code. And I want to make sure this is also acceptable to reviewers, in case
> of wasting my time.
>
> >
> > >
> > > I also doubt I could really wrap up on top of htab_elem, as it assumes
> > > key and value are stored at the end. And these structs are internal,
> > > it is really hard to factor out.
> >
> > I didn't do the exercise of trying to implement this, so discussing
> > this is a bit meaningless at this time. But
> >
> > struct htab_elem_timeout {
> >   ... my timeout related stuff ...
> >   struct htab_elem elem;
> > };
> >
> > would preserve that property.
>
> Sure, but you know once the type changes, literally all the code has
> to be changed. We can not just pass timeout_elem->elem to
> htab_map_update_elem() as it is just internal.

Literally?..

>
> >
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >         char key[] __aligned(8);
> > > > > > >  };
> > > > > > >
> > > > > > > @@ -143,6 +151,7 @@ static void htab_init_buckets(struct bpf_htab *htab)
> > > > > > >
> > > > > > >         for (i = 0; i < htab->n_buckets; i++) {
> > > > > > >                 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
> > > > > > > +               atomic_set(&htab->buckets[i].pending, 0);
> > > > > > >                 if (htab_use_raw_lock(htab)) {
> > > > > > >                         raw_spin_lock_init(&htab->buckets[i].raw_lock);
> > > > > > >                         lockdep_set_class(&htab->buckets[i].raw_lock,
> > > > > > > @@ -431,6 +440,14 @@ static int htab_map_alloc_check(union bpf_attr *attr)
> > > > > > >         return 0;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
> > > > > > > +{
> > > > > > > +       if (atomic_fetch_or(1, &b->pending))
> > > > > > > +               return;
> > > > > > > +       llist_add(&b->gc_node, &htab->gc_list);
> > > > > > > +       queue_work(system_unbound_wq, &htab->gc_work);
> > > > > > > +}
> > > > > >
> > > > > > I'm concerned about each bucket being scheduled individually... And
> > > > > > similarly concerned that each instance of TIMEOUT_HASH will do its own
> > > > > > scheduling independently. Can you think about the way to have a
> > > > > > "global" gc/purging logic, and just make sure that buckets that need
> > > > > > processing would be just internally chained together. So the purging
> > > > > > routing would iterate all the scheduled hashmaps, and within each it
> > > > > > will have a linked list of buckets that need processing? And all that
> > > > > > is done just once each GC period. Not N times for N maps or N*M times
> > > > > > for N maps with M buckets in each.
> > > > >
> > > > > Our internal discussion went to the opposite actually, people here argued
> > > > > one work is not sufficient for a hashtable because there would be millions
> > > > > of entries (max_entries, which is also number of buckets). ;)
> > > >
> > > > I was hoping that it's possible to expire elements without iterating
> > > > the entire hash table every single time, only items that need to be
> > > > processed. Hashed timing wheel is one way to do something like this,
> > >
> > > How could we know which ones are expired without scanning the
> > > whole table? They are clearly not sorted even within a bucket. Sorting
> > > them with expiration? Slightly better, as we can just stop at the first
> > > non-expired but with an expense of slowing down the update path.
> >
> > Have you looked up "hashed timing wheel"?
>
> I thought you mean the timer wheel in Linux kernel, I can not immediately
> see how it can be used for our GC here. I guess you suggest to to link
> each entry based on expiration sec?

Expiration second or jiffy or whatever other bucket is an
implementation detail. The point is that a hash timer wheel allows to
have only a small subset of items to iterate through (that have a
chance to be expired in a given "time bucket", modulo time hash
collisions).

>
> >
> > >
> > > > kernel has to solve similar problems with timeouts as well, why not
> > > > taking inspiration there?
> > >
> > > Mind to point out which similar problems in the kernel?
> > >
> > > If you mean inspiration from conntrack, it is even worse, it uses multiple
> > > locking and locks on fast path too. I also looked at xt_hashlimit, it is not
> > > any better either.
> >
> > I was thinking about epoll timeouts, but I don't know all the
> > implementation details, of course. My point was that kernel solves the
> > problem of maintaining a lot of uncorrelated timeouts already (epoll,
> > timeout signals, etc).
>
> They possibly just use a timer or delayed work etc.. This is why I only
> look at similar timeout hash maps.

And what does the "timer" use? I bet it's a hashed time wheel.

>
> Thanks!
Andrii Nakryiko Dec. 18, 2020, 7:14 p.m. UTC | #17
On Thu, Dec 17, 2020 at 1:14 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> On Wed, Dec 16, 2020 at 10:29 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >
> > On Wed, Dec 16, 2020 at 10:35 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > > Minimize duplication of the code, no one said copy/paste all the code.
> > > But memory bloat is a real problem and should be justification enough
> > > to at least consider other options.
> >
> > Sure, I have no problem with this. The question is how do we balance?
> > Is rewriting 200 lines of code to save 8 bytes of each entry acceptable?
> > What about rewriting 2000 lines of code? Do people prefer to review 200
> > or 2000 (or whatever number) lines of code? Or people just want a
> > minimal change for easier reviews?
>
> No worry any more. I manage to find some way to reuse the existing

I never worried. But I'm glad you figured it out.

> members, that is lru_node. So the end result is putting gc stuff into
> the union with lru_node without increasing the size of htab_elem.
> And of course, without duplicating/refactoring regular htab code.
>
> Thanks.
diff mbox series

Patch

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 99f7fd657d87..00a3b17b6af2 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -125,6 +125,7 @@  BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_TIMEOUT_HASH, htab_timeout_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 30b477a26482..dedb47bc3f52 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -158,6 +158,7 @@  enum bpf_map_type {
 	BPF_MAP_TYPE_RINGBUF,
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
+	BPF_MAP_TYPE_TIMEOUT_HASH,
 };
 
 /* Note that tracing related programs such as
@@ -393,7 +394,7 @@  enum bpf_link_type {
  */
 #define BPF_PSEUDO_CALL		1
 
-/* flags for BPF_MAP_UPDATE_ELEM command */
+/* flags for BPF_MAP_UPDATE_ELEM command, upper 32 bits are timeout */
 enum {
 	BPF_ANY		= 0, /* create new element or update existing */
 	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f0b7b54fa3a8..178cb376c397 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -8,6 +8,8 @@ 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/llist.h>
+#include <linux/workqueue.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include "percpu_freelist.h"
@@ -84,6 +86,8 @@  struct bucket {
 		raw_spinlock_t raw_lock;
 		spinlock_t     lock;
 	};
+	struct llist_node gc_node;
+	atomic_t pending;
 };
 
 #define HASHTAB_MAP_LOCK_COUNT 8
@@ -104,6 +108,9 @@  struct bpf_htab {
 	u32 hashrnd;
 	struct lock_class_key lockdep_key;
 	int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
+	struct llist_head gc_list;
+	struct work_struct gc_work;
+	struct delayed_work gc_idle_work;
 };
 
 /* each htab element is struct htab_elem + key + value */
@@ -124,6 +131,7 @@  struct htab_elem {
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
+	u64 expires;
 	char key[] __aligned(8);
 };
 
@@ -143,6 +151,7 @@  static void htab_init_buckets(struct bpf_htab *htab)
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+		atomic_set(&htab->buckets[i].pending, 0);
 		if (htab_use_raw_lock(htab)) {
 			raw_spin_lock_init(&htab->buckets[i].raw_lock);
 			lockdep_set_class(&htab->buckets[i].raw_lock,
@@ -431,6 +440,14 @@  static int htab_map_alloc_check(union bpf_attr *attr)
 	return 0;
 }
 
+static void htab_sched_gc(struct bpf_htab *htab, struct bucket *b)
+{
+	if (atomic_fetch_or(1, &b->pending))
+		return;
+	llist_add(&b->gc_node, &htab->gc_list);
+	queue_work(system_unbound_wq, &htab->gc_work);
+}
+
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -732,10 +749,13 @@  static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	bool is_timeout = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
 	struct hlist_nulls_head *head;
 	struct htab_elem *l, *next_l;
 	u32 hash, key_size;
+	struct bucket *b;
 	int i = 0;
+	u64 now;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -746,7 +766,8 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 	hash = htab_map_hash(key, key_size, htab->hashrnd);
 
-	head = select_bucket(htab, hash);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
 
 	/* lookup the key */
 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
@@ -759,6 +780,13 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 				  struct htab_elem, hash_node);
 
 	if (next_l) {
+		if (is_timeout) {
+			now = get_jiffies_64();
+			if (time_after_eq64(now, next_l->expires)) {
+				htab_sched_gc(htab, b);
+				goto find_first_elem;
+			}
+		}
 		/* if next elem in this hash list is non-zero, just return it */
 		memcpy(next_key, next_l->key, key_size);
 		return 0;
@@ -771,12 +799,20 @@  static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 find_first_elem:
 	/* iterate over buckets */
 	for (; i < htab->n_buckets; i++) {
-		head = select_bucket(htab, i);
+		b = __select_bucket(htab, i);
+		head = &b->head;
 
 		/* pick first element in the bucket */
 		next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 					  struct htab_elem, hash_node);
 		if (next_l) {
+			if (is_timeout) {
+				now = get_jiffies_64();
+				if (time_after_eq64(now, next_l->expires)) {
+					htab_sched_gc(htab, b);
+					continue;
+				}
+			}
 			/* if it's not empty, just return it */
 			memcpy(next_key, next_l->key, key_size);
 			return 0;
@@ -975,18 +1011,31 @@  static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 	return 0;
 }
 
+static u32 fetch_timeout(u64 *map_flags)
+{
+	u32 timeout = (*map_flags) >> 32;
+
+	*map_flags = (*map_flags) & 0xffffffff;
+	return timeout;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	bool timeout_map = map->map_type == BPF_MAP_TYPE_TIMEOUT_HASH;
 	struct htab_elem *l_new = NULL, *l_old;
 	struct hlist_nulls_head *head;
 	unsigned long flags;
 	struct bucket *b;
 	u32 key_size, hash;
+	u32 timeout;
+	u64 now;
 	int ret;
 
+	timeout = fetch_timeout(&map_flags);
+
 	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
@@ -1042,6 +1091,10 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		copy_map_value_locked(map,
 				      l_old->key + round_up(key_size, 8),
 				      value, false);
+		if (timeout_map) {
+			now = get_jiffies_64();
+			l_old->expires = now + HZ * timeout;
+		}
 		ret = 0;
 		goto err;
 	}
@@ -1054,6 +1107,13 @@  static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto err;
 	}
 
+	if (timeout_map) {
+		now = get_jiffies_64();
+		if (l_old && time_after_eq64(now, l_old->expires))
+			htab_sched_gc(htab, b);
+		l_new->expires = now + HZ * timeout;
+	}
+
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
@@ -2180,3 +2240,183 @@  const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_btf_name = "bpf_htab",
 	.map_btf_id = &htab_of_maps_map_btf_id,
 };
+
+static void __htab_gc_bucket(struct bpf_htab *htab, struct bucket *b)
+{
+	struct hlist_nulls_head *head = &b->head;
+	struct hlist_nulls_node *n;
+	u64 now = get_jiffies_64();
+	struct htab_elem *l;
+
+	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+		if (time_after_eq64(now, l->expires)) {
+			hlist_nulls_del_rcu(&l->hash_node);
+			free_htab_elem(htab, l);
+		}
+	}
+}
+
+static void htab_gc(struct work_struct *work)
+{
+	struct llist_node *head;
+	struct bpf_htab *htab;
+	struct bucket *b, *n;
+
+	htab = container_of(work, struct bpf_htab, gc_work);
+	head = llist_del_all(&htab->gc_list);
+
+	llist_for_each_entry_safe(b, n, head, gc_node) {
+		unsigned long flags;
+		int ret;
+
+		ret = htab_lock_bucket(htab, b, &flags);
+		if (ret)
+			continue;
+		__htab_gc_bucket(htab, b);
+		htab_unlock_bucket(htab, b, flags);
+
+		atomic_set(&b->pending, 0);
+
+		cond_resched();
+	}
+}
+
+static void htab_gc_idle(struct work_struct *work)
+{
+	struct bpf_htab *htab;
+	int i;
+
+	htab = container_of(work, struct bpf_htab, gc_idle_work.work);
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		unsigned long flags;
+		struct bucket *b;
+		int ret;
+
+		b = __select_bucket(htab, i);
+		if (hlist_nulls_empty(&b->head))
+			continue;
+		if (atomic_read(&b->pending))
+			continue;
+		ret = htab_lock_bucket(htab, b, &flags);
+		if (ret)
+			continue;
+		__htab_gc_bucket(htab, b);
+		htab_unlock_bucket(htab, b, flags);
+		cond_resched();
+	}
+
+	queue_delayed_work(system_power_efficient_wq, &htab->gc_idle_work, HZ);
+}
+
+static void *__htab_timeout_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_nulls_head *head;
+	struct htab_elem *l;
+	struct bucket *b;
+	u32 key_size = map->key_size;
+	u32 hash;
+
+	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
+	if (l && time_after_eq64(get_jiffies_64(), l->expires)) {
+		htab_sched_gc(htab, b);
+		l = NULL;
+	}
+
+	return l;
+}
+
+static void *htab_timeout_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_timeout_map_lookup_elem(map, key);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+	return NULL;
+}
+
+static int htab_timeout_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+	struct bpf_insn *insn = insn_buf;
+	const int ret = BPF_REG_0;
+
+	BUILD_BUG_ON(!__same_type(&__htab_timeout_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_timeout_map_lookup_elem));
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+				offsetof(struct htab_elem, key) +
+				round_up(map->key_size, 8));
+	return insn - insn_buf;
+}
+
+static void htab_timeout_map_seq_show_elem(struct bpf_map *map, void *key,
+					   struct seq_file *m)
+{
+	void *value;
+
+	rcu_read_lock();
+
+	value = htab_timeout_map_lookup_elem(map, key);
+	if (!value) {
+		rcu_read_unlock();
+		return;
+	}
+
+	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+	seq_puts(m, ": ");
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+	seq_puts(m, "\n");
+
+	rcu_read_unlock();
+}
+
+static struct bpf_map *htab_timeout_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_map *map = htab_map_alloc(attr);
+	struct bpf_htab *htab;
+
+	if (!IS_ERR(map)) {
+		htab = container_of(map, struct bpf_htab, map);
+		init_llist_head(&htab->gc_list);
+		INIT_WORK(&htab->gc_work, htab_gc);
+		INIT_DEFERRABLE_WORK(&htab->gc_idle_work, htab_gc_idle);
+		queue_delayed_work(system_power_efficient_wq,
+				   &htab->gc_idle_work, HZ);
+	}
+
+	return map;
+}
+
+static void htab_timeout_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	cancel_work_sync(&htab->gc_work);
+	cancel_delayed_work_sync(&htab->gc_idle_work);
+
+	htab_map_free(map);
+}
+
+static int htab_timeout_map_btf_id;
+const struct bpf_map_ops htab_timeout_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc_check = htab_map_alloc_check,
+	.map_alloc = htab_timeout_map_alloc,
+	.map_free = htab_timeout_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_timeout_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+	.map_gen_lookup = htab_timeout_map_gen_lookup,
+	.map_seq_show_elem = htab_timeout_map_seq_show_elem,
+	BATCH_OPS(htab),
+	.map_btf_name = "bpf_htab",
+	.map_btf_id = &htab_timeout_map_btf_id,
+	.iter_seq_info = &iter_seq_info,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 287be337d5f6..9ebd2e380a57 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -778,7 +778,8 @@  static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
 		    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
 		    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
-		    map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
+		    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+		    map->map_type != BPF_MAP_TYPE_TIMEOUT_HASH)
 			return -ENOTSUPP;
 		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
 		    map->value_size) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 30b477a26482..684b8011a97a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -158,6 +158,7 @@  enum bpf_map_type {
 	BPF_MAP_TYPE_RINGBUF,
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
+	BPF_MAP_TYPE_TIMEOUT_HASH,
 };
 
 /* Note that tracing related programs such as