Message ID | 20200529151952.15184-4-idryomov@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | libceph: support for replica reads | expand |
On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote: > Allow expressing client's location in terms of CRUSH hierarchy as > a set of (bucket type name, bucket name) pairs. The userspace syntax > "crush_location = key1=value1 key2=value2" is incompatible with mount > options and needed adaptation: > > - ':' separator > - one key:value pair per crush_location option > - crush_location options are combined together > > So for: > > crush_location = host=foo rack=bar > > one would write: > > crush_location=host:foo,crush_location=rack:bar > > As in userspace, "multipath" locations are supported, so indicating > locality for parallel hierarchies is possible: > > crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar > Blech, that syntax is hideous. It's also problematic in that the options are additive -- you can't override an option that was given earlier (e.g. in fstab), or in a shell script. Is it not possible to do something with a single crush_location= option? Maybe: crush_location=rack:foo1/rack:foo2/datacenter:bar It's still ugly with the embedded '=' signs, but it would at least make it so that the options aren't additive. 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > --- > include/linux/ceph/libceph.h | 1 + > include/linux/ceph/osdmap.h | 16 ++++- > net/ceph/ceph_common.c | 25 ++++++++ > net/ceph/osdmap.c | 114 +++++++++++++++++++++++++++++++++++ > 4 files changed, 155 insertions(+), 1 deletion(-) > > diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h > index 4b5a47bcaba4..4733959f1ec7 100644 > --- a/include/linux/ceph/libceph.h > +++ b/include/linux/ceph/libceph.h > @@ -64,6 +64,7 @@ struct ceph_options { > int num_mon; > char *name; > struct ceph_crypto_key *key; > + struct rb_root crush_locs; > }; > > /* > diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h > index 5e601975745f..ef8619ad1401 100644 > --- a/include/linux/ceph/osdmap.h > +++ b/include/linux/ceph/osdmap.h > @@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, > int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, > const struct ceph_pg *raw_pgid); > > +struct crush_loc { > + char *cl_type_name; > + char *cl_name; > +}; > + > +struct crush_loc_node { > + struct rb_node cl_node; > + struct crush_loc cl_loc; /* pointers into cl_data */ > + char cl_data[]; > +}; > + > +int ceph_parse_crush_loc(const char *str, struct rb_root *locs); > +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); > +void ceph_clear_crush_locs(struct rb_root *locs); > + > extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, > u64 id); > - > extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); > extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); > u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > index a0e97f6c1072..6d495685ee03 100644 > --- a/net/ceph/ceph_common.c > +++ b/net/ceph/ceph_common.c > @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt, > } > } > > + ret = 
ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); > + if (ret) > + return ret; > + > /* any matching mon ip implies a match */ > for (i = 0; i < opt1->num_mon; i++) { > if (ceph_monmap_contains(client->monc.monmap, > @@ -260,6 +264,7 @@ enum { > Opt_secret, > Opt_key, > Opt_ip, > + Opt_crush_location, > /* string args above */ > Opt_share, > Opt_crc, > @@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = { > fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), > fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), > fsparam_flag_no ("crc", Opt_crc), > + fsparam_string ("crush_location", Opt_crush_location), > fsparam_string ("fsid", Opt_fsid), > fsparam_string ("ip", Opt_ip), > fsparam_string ("key", Opt_key), > @@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void) > if (!opt) > return NULL; > > + opt->crush_locs = RB_ROOT; > opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), > GFP_KERNEL); > if (!opt->mon_addr) { > @@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt) > if (!opt) > return; > > + ceph_clear_crush_locs(&opt->crush_locs); > kfree(opt->name); > if (opt->key) { > ceph_crypto_key_destroy(opt->key); > @@ -454,6 +462,14 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, > if (!opt->key) > return -ENOMEM; > return get_secret(opt->key, param->string, &log); > + case Opt_crush_location: > + err = ceph_parse_crush_loc(param->string, &opt->crush_locs); > + if (err) { > + error_plog(&log, "Failed to parse crush location: %d", > + err); > + return err; > + } > + break; > > case Opt_osdtimeout: > warn_plog(&log, "Ignoring osdtimeout"); > @@ -536,6 +552,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, > { > struct ceph_options *opt = client->options; > size_t pos = m->count; > + struct rb_node *n; > > if (opt->name) { > seq_puts(m, "name="); > @@ -545,6 +562,14 @@ int 
ceph_print_client_options(struct seq_file *m, struct ceph_client *client, > if (opt->key) > seq_puts(m, "secret=<hidden>,"); > > + for (n = rb_first(&opt->crush_locs); n; n = rb_next(n)) { > + struct crush_loc_node *loc = > + rb_entry(n, struct crush_loc_node, cl_node); > + > + seq_printf(m, "crush_location=%s:%s,", > + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); > + } > + > if (opt->flags & CEPH_OPT_FSID) > seq_printf(m, "fsid=%pU,", &opt->fsid); > if (opt->flags & CEPH_OPT_NOSHARE) > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c > index e74130876d3a..995cdb8b559e 100644 > --- a/net/ceph/osdmap.c > +++ b/net/ceph/osdmap.c > @@ -2715,3 +2715,117 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, > return acting.primary; > } > EXPORT_SYMBOL(ceph_pg_to_acting_primary); > + > +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, > + size_t name_len) > +{ > + struct crush_loc_node *loc; > + > + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); > + if (!loc) > + return NULL; > + > + RB_CLEAR_NODE(&loc->cl_node); > + return loc; > +} > + > +static void free_crush_loc(struct crush_loc_node *loc) > +{ > + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); > + > + kfree(loc); > +} > + > +static int crush_loc_compare(const struct crush_loc *loc1, > + const struct crush_loc *loc2) > +{ > + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: > + strcmp(loc1->cl_name, loc2->cl_name); > +} > + > +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, > + RB_BYPTR, const struct crush_loc *, cl_node) > + > +/* > + * A <bucket type name>:<bucket name> pair, e.g. "zone:us-east". 
> + */ > +int ceph_parse_crush_loc(const char *str, struct rb_root *locs) > +{ > + struct crush_loc_node *loc; > + const char *type_name, *name; > + size_t type_name_len, name_len; > + > + type_name = str; > + str = strchrnul(str, ':'); > + if (*str == '\0') > + return -EINVAL; /* no ':' */ > + > + type_name_len = str - type_name; > + if (type_name_len == 0) > + return -EINVAL; > + > + name = ++str; > + str = strchrnul(str, ':'); > + if (*str != '\0') > + return -EINVAL; /* another ':' */ > + > + name_len = str - name; > + if (name_len == 0) > + return -EINVAL; > + > + loc = alloc_crush_loc(type_name_len, name_len); > + if (!loc) > + return -ENOMEM; > + > + loc->cl_loc.cl_type_name = loc->cl_data; > + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); > + loc->cl_loc.cl_type_name[type_name_len] = '\0'; > + > + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; > + memcpy(loc->cl_loc.cl_name, name, name_len); > + loc->cl_loc.cl_name[name_len] = '\0'; > + > + if (!__insert_crush_loc(locs, loc)) { > + free_crush_loc(loc); > + return -EEXIST; > + } > + > + dout("%s type_name '%s' name '%s'\n", __func__, > + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); > + return 0; > +} > + > +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) > +{ > + struct rb_node *n1 = rb_first(locs1); > + struct rb_node *n2 = rb_first(locs2); > + int ret; > + > + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { > + struct crush_loc_node *loc1 = > + rb_entry(n1, struct crush_loc_node, cl_node); > + struct crush_loc_node *loc2 = > + rb_entry(n2, struct crush_loc_node, cl_node); > + > + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); > + if (ret) > + return ret; > + } > + > + if (!n1 && n2) > + return -1; > + if (n1 && !n2) > + return 1; > + return 0; > +} > + > +void ceph_clear_crush_locs(struct rb_root *locs) > +{ > + while (!RB_EMPTY_ROOT(locs)) { > + struct crush_loc_node *loc = > + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); > + 
> + erase_crush_loc(locs, loc); > + free_crush_loc(loc); > + } > +}
On Fri, May 29, 2020 at 7:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote: > > Allow expressing client's location in terms of CRUSH hierarchy as > > a set of (bucket type name, bucket name) pairs. The userspace syntax > > "crush_location = key1=value1 key2=value2" is incompatible with mount > > options and needed adaptation: > > > > - ':' separator > > - one key:value pair per crush_location option > > - crush_location options are combined together > > > > So for: > > > > crush_location = host=foo rack=bar > > > > one would write: > > > > crush_location=host:foo,crush_location=rack:bar > > > > As in userspace, "multipath" locations are supported, so indicating > > locality for parallel hierarchies is possible: > > > > crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar > > > > Blech, that syntax is hideous. It's also problematic in that the options > are additive -- you can't override an option that was given earlier > (e.g. in fstab), or in a shell script. > > Is it not possible to do something with a single crush_location= option? > Maybe: > > crush_location=rack:foo1/rack:foo2/datacenter:bar > > It's still ugly with the embedded '=' signs, but it would at least make > it so that the options aren't additive. I suppose we could do something like that at the cost of more parsing boilerplate, but I'm not sure additive options are that hideous. I don't think additive options are unprecedented and more importantly I think many simple boolean and integer options are not properly overridable even in major filesystems. What embedded '=' signs are you referring to? I see ':' and '/' in your suggested syntax. Thanks, Ilya
On Fri, 2020-05-29 at 20:38 +0200, Ilya Dryomov wrote: > On Fri, May 29, 2020 at 7:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote: > > > Allow expressing client's location in terms of CRUSH hierarchy as > > > a set of (bucket type name, bucket name) pairs. The userspace syntax > > > "crush_location = key1=value1 key2=value2" is incompatible with mount > > > options and needed adaptation: > > > > > > - ':' separator > > > - one key:value pair per crush_location option > > > - crush_location options are combined together > > > > > > So for: > > > > > > crush_location = host=foo rack=bar > > > > > > one would write: > > > > > > crush_location=host:foo,crush_location=rack:bar > > > > > > As in userspace, "multipath" locations are supported, so indicating > > > locality for parallel hierarchies is possible: > > > > > > crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar > > > > > > > Blech, that syntax is hideous. It's also problematic in that the options > > are additive -- you can't override an option that was given earlier > > (e.g. in fstab), or in a shell script. > > > > Is it not possible to do something with a single crush_location= option? > > Maybe: > > > > crush_location=rack:foo1/rack:foo2/datacenter:bar > > > > It's still ugly with the embedded '=' signs, but it would at least make > > it so that the options aren't additive. > > I suppose we could do something like that at the cost of more > parsing boilerplate, but I'm not sure additive options are that > hideous. I don't think additive options are unprecedented and > more importantly I think many simple boolean and integer options > are not properly overridable even in major filesystems. > That is the long-standing convention though. There are reasons to deviate from it, but I don't see it here. Plus, I think the syntax I proposed above is more readable (and compact) as well. 
It would mean a bit more parsing code though, granted.

> What embedded '=' signs are you referring to? I see ':' and '/'
> in your suggested syntax.
>

Sorry, yeah... I had originally done one that had '=' chars in it, but
converted it to the above. Please disregard that paragraph.
On Fri, May 29, 2020 at 9:10 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Fri, 2020-05-29 at 20:38 +0200, Ilya Dryomov wrote: > > On Fri, May 29, 2020 at 7:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > > On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote: > > > > Allow expressing client's location in terms of CRUSH hierarchy as > > > > a set of (bucket type name, bucket name) pairs. The userspace syntax > > > > "crush_location = key1=value1 key2=value2" is incompatible with mount > > > > options and needed adaptation: > > > > > > > > - ':' separator > > > > - one key:value pair per crush_location option > > > > - crush_location options are combined together > > > > > > > > So for: > > > > > > > > crush_location = host=foo rack=bar > > > > > > > > one would write: > > > > > > > > crush_location=host:foo,crush_location=rack:bar > > > > > > > > As in userspace, "multipath" locations are supported, so indicating > > > > locality for parallel hierarchies is possible: > > > > > > > > crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar > > > > > > > > > > Blech, that syntax is hideous. It's also problematic in that the options > > > are additive -- you can't override an option that was given earlier > > > (e.g. in fstab), or in a shell script. > > > > > > Is it not possible to do something with a single crush_location= option? > > > Maybe: > > > > > > crush_location=rack:foo1/rack:foo2/datacenter:bar > > > > > > It's still ugly with the embedded '=' signs, but it would at least make > > > it so that the options aren't additive. > > > > I suppose we could do something like that at the cost of more > > parsing boilerplate, but I'm not sure additive options are that > > hideous. I don't think additive options are unprecedented and > > more importantly I think many simple boolean and integer options > > are not properly overridable even in major filesystems. > > > > That is the long-standing convention though. 
There are reasons to > deviate from it, but I don't see it here. Plus, I think the syntax I > proposed above is more readable (and compact) as well. > > It would mean a bit more parsing code though, granted. > > > What embedded '=' signs are you referring to? I see ':' and '/' > > in your suggested syntax. > > > > Sorry, yeah... I had originally done one that had '=' chars in it, but > converted it to the above. Please disregard that paragraph. One of the reasons I did it this way is that crush_location is inherently additive. I don't have a strong opinion on this though so let's adhere to the convention. I'll implement the suggested syntax and repost. Thanks, Ilya
On Fri, May 29, 2020 at 10:42 PM Ilya Dryomov <idryomov@gmail.com> wrote: > > On Fri, May 29, 2020 at 9:10 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Fri, 2020-05-29 at 20:38 +0200, Ilya Dryomov wrote: > > > On Fri, May 29, 2020 at 7:27 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Fri, 2020-05-29 at 17:19 +0200, Ilya Dryomov wrote: > > > > > Allow expressing client's location in terms of CRUSH hierarchy as > > > > > a set of (bucket type name, bucket name) pairs. The userspace syntax > > > > > "crush_location = key1=value1 key2=value2" is incompatible with mount > > > > > options and needed adaptation: > > > > > > > > > > - ':' separator > > > > > - one key:value pair per crush_location option > > > > > - crush_location options are combined together > > > > > > > > > > So for: > > > > > > > > > > crush_location = host=foo rack=bar > > > > > > > > > > one would write: > > > > > > > > > > crush_location=host:foo,crush_location=rack:bar > > > > > > > > > > As in userspace, "multipath" locations are supported, so indicating > > > > > locality for parallel hierarchies is possible: > > > > > > > > > > crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar > > > > > > > > > > > > > Blech, that syntax is hideous. It's also problematic in that the options > > > > are additive -- you can't override an option that was given earlier > > > > (e.g. in fstab), or in a shell script. > > > > > > > > Is it not possible to do something with a single crush_location= option? > > > > Maybe: > > > > > > > > crush_location=rack:foo1/rack:foo2/datacenter:bar > > > > > > > > It's still ugly with the embedded '=' signs, but it would at least make > > > > it so that the options aren't additive. > > > > > > I suppose we could do something like that at the cost of more > > > parsing boilerplate, but I'm not sure additive options are that > > > hideous. 
I don't think additive options are unprecedented and > > > more importantly I think many simple boolean and integer options > > > are not properly overridable even in major filesystems. > > > > > > > That is the long-standing convention though. There are reasons to > > deviate from it, but I don't see it here. Plus, I think the syntax I > > proposed above is more readable (and compact) as well. > > > > It would mean a bit more parsing code though, granted. > > > > > What embedded '=' signs are you referring to? I see ':' and '/' > > > in your suggested syntax. > > > > > > > Sorry, yeah... I had originally done one that had '=' chars in it, but > > converted it to the above. Please disregard that paragraph. > > One of the reasons I did it this way is that crush_location is > inherently additive. I don't have a strong opinion on this though > so let's adhere to the convention. > > I'll implement the suggested syntax and repost. I went with '|' instead of '/' for the separator to try to stress the additivity (in the OR sense). '/' makes it look like a path to the root of the tree which it really isn't. Thanks, Ilya
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 4b5a47bcaba4..4733959f1ec7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -64,6 +64,7 @@ struct ceph_options { int num_mon; char *name; struct ceph_crypto_key *key; + struct rb_root crush_locs; }; /* diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5e601975745f..ef8619ad1401 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid); +struct crush_loc { + char *cl_type_name; + char *cl_name; +}; + +struct crush_loc_node { + struct rb_node cl_node; + struct crush_loc cl_loc; /* pointers into cl_data */ + char cl_data[]; +}; + +int ceph_parse_crush_loc(const char *str, struct rb_root *locs); +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); +void ceph_clear_crush_locs(struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); - extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..6d495685ee03 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt, } } + ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs); + if (ret) + return ret; + /* any matching mon ip implies a match */ for (i = 0; i < opt1->num_mon; i++) { if (ceph_monmap_contains(client->monc.monmap, @@ -260,6 +264,7 @@ enum { Opt_secret, Opt_key, Opt_ip, + Opt_crush_location, /* string args above */ Opt_share, Opt_crc, @@ -274,6 +279,7 @@ static const struct fs_parameter_spec 
ceph_parameters[] = { fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages), fsparam_flag_no ("crc", Opt_crc), + fsparam_string ("crush_location", Opt_crush_location), fsparam_string ("fsid", Opt_fsid), fsparam_string ("ip", Opt_ip), fsparam_string ("key", Opt_key), @@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void) if (!opt) return NULL; + opt->crush_locs = RB_ROOT; opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), GFP_KERNEL); if (!opt->mon_addr) { @@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt) if (!opt) return; + ceph_clear_crush_locs(&opt->crush_locs); kfree(opt->name); if (opt->key) { ceph_crypto_key_destroy(opt->key); @@ -454,6 +462,14 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, if (!opt->key) return -ENOMEM; return get_secret(opt->key, param->string, &log); + case Opt_crush_location: + err = ceph_parse_crush_loc(param->string, &opt->crush_locs); + if (err) { + error_plog(&log, "Failed to parse crush location: %d", + err); + return err; + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -536,6 +552,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, { struct ceph_options *opt = client->options; size_t pos = m->count; + struct rb_node *n; if (opt->name) { seq_puts(m, "name="); @@ -545,6 +562,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, if (opt->key) seq_puts(m, "secret=<hidden>,"); + for (n = rb_first(&opt->crush_locs); n; n = rb_next(n)) { + struct crush_loc_node *loc = + rb_entry(n, struct crush_loc_node, cl_node); + + seq_printf(m, "crush_location=%s:%s,", + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); + } + if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); if (opt->flags & CEPH_OPT_NOSHARE) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 
e74130876d3a..995cdb8b559e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2715,3 +2715,117 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); + +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, + size_t name_len) +{ + struct crush_loc_node *loc; + + loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); + if (!loc) + return NULL; + + RB_CLEAR_NODE(&loc->cl_node); + return loc; +} + +static void free_crush_loc(struct crush_loc_node *loc) +{ + WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); + + kfree(loc); +} + +static int crush_loc_compare(const struct crush_loc *loc1, + const struct crush_loc *loc2) +{ + return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: + strcmp(loc1->cl_name, loc2->cl_name); +} + +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, + RB_BYPTR, const struct crush_loc *, cl_node) + +/* + * A <bucket type name>:<bucket name> pair, e.g. "zone:us-east". 
+ */ +int ceph_parse_crush_loc(const char *str, struct rb_root *locs) +{ + struct crush_loc_node *loc; + const char *type_name, *name; + size_t type_name_len, name_len; + + type_name = str; + str = strchrnul(str, ':'); + if (*str == '\0') + return -EINVAL; /* no ':' */ + + type_name_len = str - type_name; + if (type_name_len == 0) + return -EINVAL; + + name = ++str; + str = strchrnul(str, ':'); + if (*str != '\0') + return -EINVAL; /* another ':' */ + + name_len = str - name; + if (name_len == 0) + return -EINVAL; + + loc = alloc_crush_loc(type_name_len, name_len); + if (!loc) + return -ENOMEM; + + loc->cl_loc.cl_type_name = loc->cl_data; + memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); + loc->cl_loc.cl_type_name[type_name_len] = '\0'; + + loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; + memcpy(loc->cl_loc.cl_name, name, name_len); + loc->cl_loc.cl_name[name_len] = '\0'; + + if (!__insert_crush_loc(locs, loc)) { + free_crush_loc(loc); + return -EEXIST; + } + + dout("%s type_name '%s' name '%s'\n", __func__, + loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); + return 0; +} + +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) +{ + struct rb_node *n1 = rb_first(locs1); + struct rb_node *n2 = rb_first(locs2); + int ret; + + for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { + struct crush_loc_node *loc1 = + rb_entry(n1, struct crush_loc_node, cl_node); + struct crush_loc_node *loc2 = + rb_entry(n2, struct crush_loc_node, cl_node); + + ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); + if (ret) + return ret; + } + + if (!n1 && n2) + return -1; + if (n1 && !n2) + return 1; + return 0; +} + +void ceph_clear_crush_locs(struct rb_root *locs) +{ + while (!RB_EMPTY_ROOT(locs)) { + struct crush_loc_node *loc = + rb_entry(rb_first(locs), struct crush_loc_node, cl_node); + + erase_crush_loc(locs, loc); + free_crush_loc(loc); + } +}
Allow expressing client's location in terms of CRUSH hierarchy as
a set of (bucket type name, bucket name) pairs. The userspace syntax
"crush_location = key1=value1 key2=value2" is incompatible with mount
options and needed adaptation:

- ':' separator
- one key:value pair per crush_location option
- crush_location options are combined together

So for:

  crush_location = host=foo rack=bar

one would write:

  crush_location=host:foo,crush_location=rack:bar

As in userspace, "multipath" locations are supported, so indicating
locality for parallel hierarchies is possible:

  crush_location=rack:foo1,crush_location=rack:foo2,crush_location=datacenter:bar

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |   1 +
 include/linux/ceph/osdmap.h  |  16 ++++-
 net/ceph/ceph_common.c       |  25 ++++++++
 net/ceph/osdmap.c            | 114 +++++++++++++++++++++++++++++++++++
 4 files changed, 155 insertions(+), 1 deletion(-)