Message ID | 20200129082715.5285-9-xiubli@redhat.com (mailing list archive)
---|---
State | New, archived
Series | ceph: add perf metrics support
On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> Add enable/disable sending metrics to MDS debugfs and disabled as
> default, if it's enabled the kclient will send metrics every
> second.
>
> This will send global dentry lease hit/miss and read/write/metadata
> latency metrics and each session's caps hit/miss metric to MDS.
>
> Every time only sends the global metrics once via any available
> session.
>
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/debugfs.c            |  44 +++++++-
>  fs/ceph/mds_client.c         | 201 ++++++++++++++++++++++++++++++++---
>  fs/ceph/mds_client.h         |   3 +
>  fs/ceph/metric.h             |  76 +++++++++++++
>  fs/ceph/super.h              |   1 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  6 files changed, 307 insertions(+), 19 deletions(-)
>
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 7fd031c18309..8aae7ecea54a 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -124,6 +124,40 @@ static int mdsc_show(struct seq_file *s, void *p)
>  	return 0;
>  }
>
> +/*
> + * metrics debugfs
> + */
> +static int sending_metrics_set(void *data, u64 val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	if (val > 1) {
> +		pr_err("Invalid sending metrics set value %llu\n", val);
> +		return -EINVAL;
> +	}
> +
> +	mutex_lock(&mdsc->mutex);
> +	mdsc->sending_metrics = (unsigned int)val;

Shouldn't that be a bool cast? Do we even need a cast there?

> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +
> +static int sending_metrics_get(void *data, u64 *val)
> +{
> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +
> +	mutex_lock(&mdsc->mutex);
> +	*val = (u64)mdsc->sending_metrics;
> +	mutex_unlock(&mdsc->mutex);
> +
> +	return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> +			sending_metrics_set, "%llu\n");
> +

I'd like to hear more about how we expect users to use this facility.
This debugfs file doesn't seem consistent with the rest of the UI, and I
imagine if the box reboots you'd have to (manually) re-enable it after
mount, right? Maybe this should be a mount option instead?

>  static int metric_show(struct seq_file *s, void *p)
>  {
>  	struct ceph_fs_client *fsc = s->private;
> @@ -302,11 +336,9 @@ static int congestion_kb_get(void *data, u64 *val)
>  	*val = (u64)fsc->mount_options->congestion_kb;
>  	return 0;
>  }
> -
>  DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
>  			congestion_kb_set, "%llu\n");
>
> -
>  void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  {
>  	dout("ceph_fs_debugfs_cleanup\n");
> @@ -316,6 +348,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
>  	debugfs_remove(fsc->debugfs_mds_sessions);
>  	debugfs_remove(fsc->debugfs_caps);
>  	debugfs_remove(fsc->debugfs_metric);
> +	debugfs_remove(fsc->debugfs_sending_metrics);
>  	debugfs_remove(fsc->debugfs_mdsc);
>  }
>
> @@ -356,6 +389,13 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
>  						fsc,
>  						&mdsc_show_fops);
>
> +	fsc->debugfs_sending_metrics =
> +			debugfs_create_file("sending_metrics",
> +					    0600,
> +					    fsc->client->debugfs_dir,
> +					    fsc,
> +					    &sending_metrics_fops);
> +
>  	fsc->debugfs_metric = debugfs_create_file("metrics",
>  						  0400,
>  						  fsc->client->debugfs_dir,
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 92a933810a79..d765804dc855 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4104,13 +4104,156 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
>  	ceph_force_reconnect(fsc->sb);
>  }
>
> +/*
> + * called under s_mutex
> + */
> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> +				   struct ceph_mds_session *s,
> +				   bool skip_global)
> +{
> +	struct ceph_metric_head *head;
> +	struct ceph_metric_cap *cap;
> +	struct ceph_metric_dentry_lease *lease;
> +	struct ceph_metric_read_latency *read;
> +	struct ceph_metric_write_latency *write;
> +	struct ceph_metric_metadata_latency *meta;
> +	struct ceph_msg *msg;
> +	struct timespec64 ts;
> +	s32 len = sizeof(*head) + sizeof(*cap);
> +	s64 sum, total, avg;
> +	s32 items = 0;
> +
> +	if (!mdsc || !s)
> +		return false;
> +
> +	if (!skip_global) {
> +		len += sizeof(*lease);
> +		len += sizeof(*read);
> +		len += sizeof(*write);
> +		len += sizeof(*meta);
> +	}
> +
> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> +	if (!msg) {
> +		pr_err("send metrics to mds%d, failed to allocate message\n",
> +		       s->s_mds);
> +		return false;
> +	}
> +
> +	head = msg->front.iov_base;
> +
> +	/* encode the cap metric */
> +	cap = (struct ceph_metric_cap *)(head + 1);
> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> +	cap->ver = 1;
> +	cap->campat = 1;
> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> +	cap->hit = cpu_to_le64(percpu_counter_sum(&s->i_caps_hit));
> +	cap->mis = cpu_to_le64(percpu_counter_sum(&s->i_caps_mis));
> +	cap->total = cpu_to_le64(s->s_nr_caps);
> +	items++;
> +
> +	dout("cap metric hit %lld, mis %lld, total caps %lld",
> +	     le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
> +	     le64_to_cpu(cap->total));
> +
> +	/* only send the global once */
> +	if (skip_global)
> +		goto skip_global;
> +
> +	/* encode the dentry lease metric */
> +	lease = (struct ceph_metric_dentry_lease *)(cap + 1);
> +	lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
> +	lease->ver = 1;
> +	lease->campat = 1;
> +	lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
> +	lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
> +	lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
> +	lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
> +	items++;
> +
> +	dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
> +	     le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
> +	     le64_to_cpu(lease->total));
> +
> +	/* encode the read latency metric */
> +	read = (struct ceph_metric_read_latency *)(lease + 1);
> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> +	read->ver = 1;
> +	read->campat = 1;
> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_reads),
> +	sum = percpu_counter_sum(&mdsc->metric.read_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	read->sec = cpu_to_le32(ts.tv_sec);
> +	read->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("read latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the write latency metric */
> +	write = (struct ceph_metric_write_latency *)(read + 1);
> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> +	write->ver = 1;
> +	write->campat = 1;
> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_writes),
> +	sum = percpu_counter_sum(&mdsc->metric.write_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	write->sec = cpu_to_le32(ts.tv_sec);
> +	write->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("write latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +	/* encode the metadata latency metric */
> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> +	meta->ver = 1;
> +	meta->campat = 1;
> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> +	total = percpu_counter_sum(&mdsc->metric.total_metadatas),
> +	sum = percpu_counter_sum(&mdsc->metric.metadata_latency_sum);
> +	avg = total ? sum / total : 0;
> +	ts = ns_to_timespec64(avg);
> +	meta->sec = cpu_to_le32(ts.tv_sec);
> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	dout("metadata latency metric total %lld, sum lat %lld, avg lat %lld",
> +	     total, sum, avg);
> +
> +skip_global:
> +	put_unaligned_le32(items, &head->num);
> +	msg->front.iov_len = cpu_to_le32(len);
> +	msg->hdr.version = cpu_to_le16(1);
> +	msg->hdr.compat_version = cpu_to_le16(1);
> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> +	dout("send metrics to mds%d %p\n", s->s_mds, msg);
> +	ceph_con_send(&s->s_con, msg);
> +
> +	return true;
> +}
> +
>  /*
>   * delayed work -- periodically trim expired leases, renew caps with mds
>   */
> +#define CEPH_WORK_DELAY_DEF 5
>  static void schedule_delayed(struct ceph_mds_client *mdsc)
>  {
> -	int delay = 5;
> -	unsigned hz = round_jiffies_relative(HZ * delay);
> +	unsigned int hz;
> +	int delay = CEPH_WORK_DELAY_DEF;
> +
> +	mutex_lock(&mdsc->mutex);
> +	if (mdsc->sending_metrics)
> +		delay = 1;
> +	mutex_unlock(&mdsc->mutex);
> +

The mdsc->mutex is dropped in the callers a little before this is
called, so this is a little too mutex-thrashy. I think you'd be better
off changing this function to be called with the mutex still held.

> +	hz = round_jiffies_relative(HZ * delay);
>  	schedule_delayed_work(&mdsc->delayed_work, hz);
>  }
>
> @@ -4121,18 +4264,28 @@ static void delayed_work(struct work_struct *work)
>  		container_of(work, struct ceph_mds_client, delayed_work.work);
>  	int renew_interval;
>  	int renew_caps;
> +	bool metric_only;
> +	bool sending_metrics;
> +	bool g_skip = false;
>
>  	dout("mdsc delayed_work\n");
>
>  	mutex_lock(&mdsc->mutex);
> -	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> -	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> -				   mdsc->last_renew_caps);
> -	if (renew_caps)
> -		mdsc->last_renew_caps = jiffies;
> +	sending_metrics = !!mdsc->sending_metrics;
> +	metric_only = mdsc->sending_metrics &&
> +		      (mdsc->ticks++ % CEPH_WORK_DELAY_DEF);
> +
> +	if (!metric_only) {
> +		renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> +		renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> +					   mdsc->last_renew_caps);
> +		if (renew_caps)
> +			mdsc->last_renew_caps = jiffies;
> +	}
>
>  	for (i = 0; i < mdsc->max_sessions; i++) {
>  		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
> +
>  		if (!s)
>  			continue;
>  		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
> @@ -4158,13 +4311,20 @@ static void delayed_work(struct work_struct *work)
>  		mutex_unlock(&mdsc->mutex);
>
>  		mutex_lock(&s->s_mutex);
> -		if (renew_caps)
> -			send_renew_caps(mdsc, s);
> -		else
> -			ceph_con_keepalive(&s->s_con);
> -		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> -		    s->s_state == CEPH_MDS_SESSION_HUNG)
> -			ceph_send_cap_releases(mdsc, s);
> +
> +		if (sending_metrics)
> +			g_skip = ceph_mdsc_send_metrics(mdsc, s, g_skip);
> +
> +		if (!metric_only) {
> +			if (renew_caps)
> +				send_renew_caps(mdsc, s);
> +			else
> +				ceph_con_keepalive(&s->s_con);
> +			if (s->s_state == CEPH_MDS_SESSION_OPEN ||
> +			    s->s_state == CEPH_MDS_SESSION_HUNG)
> +				ceph_send_cap_releases(mdsc, s);
> +		}
> +
>  		mutex_unlock(&s->s_mutex);
>  		ceph_put_mds_session(s);
>
> @@ -4172,6 +4332,9 @@ static void delayed_work(struct work_struct *work)
>  	}
>  	mutex_unlock(&mdsc->mutex);
>
> +	if (metric_only)
> +		goto delay_work;
> +
>  	ceph_check_delayed_caps(mdsc);
>
>  	ceph_queue_cap_reclaim_work(mdsc);
>
> @@ -4180,11 +4343,13 @@
>
>  	maybe_recover_session(mdsc);
>
> +delay_work:
>  	schedule_delayed(mdsc);
>  }
>
> -static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
> +static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
>  {
> +	struct ceph_client_metric *metric = &mdsc->metric;
>  	int ret;
>
>  	if (!metric)
> @@ -4222,7 +4387,9 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
>  	if (ret)
>  		goto err_metadata_latency_sum;
>
> -	return ret;
> +	mdsc->sending_metrics = 0;
> +	mdsc->ticks = 0;
> +	return 0;
>  err_metadata_latency_sum:
>  	percpu_counter_destroy(&metric->total_metadatas);
>  err_total_metadatas:
> @@ -4294,7 +4461,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	init_waitqueue_head(&mdsc->cap_flushing_wq);
>  	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
>  	atomic_set(&mdsc->cap_reclaim_pending, 0);
> -	err = ceph_mdsc_metric_init(&mdsc->metric);
> +	err = ceph_mdsc_metric_init(mdsc);
>  	if (err)
>  		goto err_mdsmap;
>
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 574d4e5a5de2..a0ece55d987c 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -451,6 +451,9 @@ struct ceph_mds_client {
>  	struct list_head dentry_leases;     /* fifo list */
>  	struct list_head dentry_dir_leases; /* lru list */
>
> +	/* metrics */
> +	unsigned int sending_metrics;
> +	unsigned int ticks;
>  	struct ceph_client_metric metric;
>
>  	spinlock_t snapid_map_lock;
> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
> index 3cda616ba594..352eb753ce25 100644
> --- a/fs/ceph/metric.h
> +++ b/fs/ceph/metric.h
> @@ -4,6 +4,82 @@
>
>  #include <linux/ceph/osd_client.h>
>
> +enum ceph_metric_type {
> +	CLIENT_METRIC_TYPE_CAP_INFO,
> +	CLIENT_METRIC_TYPE_READ_LATENCY,
> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +
> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +};
> +
> +/* metric caps header */
> +struct ceph_metric_cap {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8 ver;
> +	__u8 campat;

I think you meant "compat" here.

> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric dentry lease header */
> +struct ceph_metric_dentry_lease {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8 ver;
> +	__u8 campat;
> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __attribute__ ((packed));
> +
> +/* metric read latency header */
> +struct ceph_metric_read_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8 ver;
> +	__u8 campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric write latency header */
> +struct ceph_metric_write_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8 ver;
> +	__u8 campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +/* metric metadata latency header */
> +struct ceph_metric_metadata_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8 ver;
> +	__u8 campat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __attribute__ ((packed));
> +
> +struct ceph_metric_head {
> +	__le32 num;	/* the number of metrics will be sent */

"the number of metrics that will be sent"

> +} __attribute__ ((packed));
> +
>  /* This is the global metrics */
>  struct ceph_client_metric {
>  	atomic64_t total_dentries;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3f4829222528..a91431e9bdf7 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -128,6 +128,7 @@ struct ceph_fs_client {
>  	struct dentry *debugfs_congestion_kb;
>  	struct dentry *debugfs_bdi;
>  	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
> +	struct dentry *debugfs_sending_metrics;
>  	struct dentry *debugfs_metric;
>  	struct dentry *debugfs_mds_sessions;
>  #endif
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index a099f60feb7b..6028d3e865e4 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>  #define CEPH_MSG_CLIENT_REQUEST         24
>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>  #define CEPH_MSG_CLIENT_REPLY           26
> +#define CEPH_MSG_CLIENT_METRICS         29
>  #define CEPH_MSG_CLIENT_CAPS            0x310
>  #define CEPH_MSG_CLIENT_LEASE           0x311
>  #define CEPH_MSG_CLIENT_SNAP            0x312
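To make the first two nits above concrete: `data` is a `void *`, which assigns to any object pointer in C without a cast, and if `sending_metrics` becomes a `bool` then `!!val` normalizes the input with no cast at all. A minimal sketch of the setter along those lines (illustrative only, not code from the series; the bool field is an assumption):

	static int sending_metrics_set(void *data, u64 val)
	{
		struct ceph_fs_client *fsc = data;	/* void * needs no cast */
		struct ceph_mds_client *mdsc = fsc->mdsc;

		if (val > 1)
			return -EINVAL;

		mutex_lock(&mdsc->mutex);
		mdsc->sending_metrics = !!val;	/* assumes the field becomes bool */
		mutex_unlock(&mdsc->mutex);

		return 0;
	}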
On 2020/2/6 5:43, Jeff Layton wrote:
> On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
[...]
>> +/*
>> + * metrics debugfs
>> + */
>> +static int sending_metrics_set(void *data, u64 val)
>> +{
>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>> +
>> +	if (val > 1) {
>> +		pr_err("Invalid sending metrics set value %llu\n", val);
>> +		return -EINVAL;
>> +	}
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	mdsc->sending_metrics = (unsigned int)val;
> Shouldn't that be a bool cast? Do we even need a cast there?

Will switch sending_metrics to a bool type instead.

>> +	mutex_unlock(&mdsc->mutex);
>> +
>> +	return 0;
>> +}
>> +
>> +static int sending_metrics_get(void *data, u64 *val)
>> +{
>> +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
>> +	struct ceph_mds_client *mdsc = fsc->mdsc;
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	*val = (u64)mdsc->sending_metrics;
>> +	mutex_unlock(&mdsc->mutex);
>> +
>> +	return 0;
>> +}
>> +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
>> +			sending_metrics_set, "%llu\n");
>> +
> I'd like to hear more about how we expect users to use this facility.
> This debugfs file doesn't seem consistent with the rest of the UI, and I
> imagine if the box reboots you'd have to (manually) re-enable it after
> mount, right? Maybe this should be a mount option instead?

A mount option means we must do the unmounting to disable it.

I was thinking that with the debugfs file we can do debugging or tuning
even in production setups at any time; usually this should be disabled,
since it will send the metrics every second.

Or we could merge "sending_metrics" into the "metrics" UI: just write
"enable"/"disable" to enable/disable sending the metrics to ceph, just
like "reset" does to clean the metrics.

Then "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be writable
with:

"reset"   --> clean and reset the metrics counters

"enable"  --> enable sending metrics to the ceph cluster

"disable" --> disable sending metrics to the ceph cluster

Will this be better?

[...]
>>  /*
>>   * delayed work -- periodically trim expired leases, renew caps with mds
>>   */
>> +#define CEPH_WORK_DELAY_DEF 5
>>  static void schedule_delayed(struct ceph_mds_client *mdsc)
>>  {
>> -	int delay = 5;
>> -	unsigned hz = round_jiffies_relative(HZ * delay);
>> +	unsigned int hz;
>> +	int delay = CEPH_WORK_DELAY_DEF;
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	if (mdsc->sending_metrics)
>> +		delay = 1;
>> +	mutex_unlock(&mdsc->mutex);
>> +
> The mdsc->mutex is dropped in the callers a little before this is
> called, so this is a little too mutex-thrashy. I think you'd be better
> off changing this function to be called with the mutex still held.

Will fix it.

[...]
>> +/* metric caps header */
>> +struct ceph_metric_cap {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8 ver;
>> +	__u8 campat;
> I think you meant "compat" here.

Will fix it.

[...]
>> +/* metric metadata latency header */
>> +struct ceph_metric_metadata_latency {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8 ver;
>> +	__u8 campat;
>> +
>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>> +	__le32 sec;
>> +	__le32 nsec;
>> +} __attribute__ ((packed));
>> +
>> +struct ceph_metric_head {
>> +	__le32 num;	/* the number of metrics will be sent */
> "the number of metrics that will be sent"

Will fix it.

Thanks,

>> +} __attribute__ ((packed));
>> +
>>  /* This is the global metrics */
>>  struct ceph_client_metric {
>>  	atomic64_t total_dentries;
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 3f4829222528..a91431e9bdf7 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -128,6 +128,7 @@ struct ceph_fs_client {
>>  	struct dentry *debugfs_congestion_kb;
>>  	struct dentry *debugfs_bdi;
>>  	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
>> +	struct dentry *debugfs_sending_metrics;
>>  	struct dentry *debugfs_metric;
>>  	struct dentry *debugfs_mds_sessions;
>>  #endif
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index a099f60feb7b..6028d3e865e4 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>>  #define CEPH_MSG_CLIENT_REQUEST         24
>>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>>  #define CEPH_MSG_CLIENT_REPLY           26
>> +#define CEPH_MSG_CLIENT_METRICS         29
>>  #define CEPH_MSG_CLIENT_CAPS            0x310
>>  #define CEPH_MSG_CLIENT_LEASE           0x311
>>  #define CEPH_MSG_CLIENT_SNAP            0x312
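If the writable "metrics" file proposed above were adopted, its write
handler could be a small keyword parser along these lines (a sketch
only; metrics_file_write and the reset path are hypothetical, the
sending_metrics field follows the patch):

	static ssize_t metrics_file_write(struct file *file, const char __user *ubuf,
					  size_t count, loff_t *ppos)
	{
		struct ceph_fs_client *fsc = file_inode(file)->i_private;
		struct ceph_mds_client *mdsc = fsc->mdsc;
		char cmd[16];

		if (count >= sizeof(cmd))
			return -EINVAL;
		if (copy_from_user(cmd, ubuf, count))
			return -EFAULT;
		cmd[count] = '\0';

		if (sysfs_streq(cmd, "enable")) {
			mutex_lock(&mdsc->mutex);
			mdsc->sending_metrics = true;
			mutex_unlock(&mdsc->mutex);
		} else if (sysfs_streq(cmd, "disable")) {
			mutex_lock(&mdsc->mutex);
			mdsc->sending_metrics = false;
			mutex_unlock(&mdsc->mutex);
		} else if (sysfs_streq(cmd, "reset")) {
			/* zero the hit/mis and latency counters here */
		} else {
			return -EINVAL;
		}

		return count;
	}

(sysfs_streq() ignores a trailing newline, so `echo enable > metrics`
would work as expected.)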
On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
> On 2020/2/6 5:43, Jeff Layton wrote:
> > On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> [...]
> > > +static int sending_metrics_get(void *data, u64 *val)
> > > +{
> > > +	struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
> > > +	struct ceph_mds_client *mdsc = fsc->mdsc;
> > > +
> > > +	mutex_lock(&mdsc->mutex);
> > > +	*val = (u64)mdsc->sending_metrics;
> > > +	mutex_unlock(&mdsc->mutex);
> > > +
> > > +	return 0;
> > > +}
> > > +DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
> > > +			sending_metrics_set, "%llu\n");
> > > +
> > I'd like to hear more about how we expect users to use this facility.
> > This debugfs file doesn't seem consistent with the rest of the UI, and I
> > imagine if the box reboots you'd have to (manually) re-enable it after
> > mount, right? Maybe this should be a mount option instead?
>
> A mount option means we must do the unmounting to disable it.
>

Technically, no. You could wire it up so that you could enable and
disable it via -o remount. For example:

    # mount -o remount,metrics=disabled

Another option might be a module parameter if this is something that you
really want to be global (and not per-mount or per-session).

> I was thinking that with the debugfs file we can do debugging or tuning
> even in production setups at any time; usually this should be disabled,
> since it will send the metrics every second.
>

Meh, one frame per second doesn't seem like it'll add much overhead.

Also, why one update per second? Should that interval be tunable?

> Or we could merge "sending_metrics" into the "metrics" UI: just write
> "enable"/"disable" to enable/disable sending the metrics to ceph, just
> like "reset" does to clean the metrics.
>
> Then "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be writable
> with:
>
> "reset"   --> clean and reset the metrics counters
>
> "enable"  --> enable sending metrics to the ceph cluster
>
> "disable" --> disable sending metrics to the ceph cluster
>
> Will this be better?
>

I guess it's not clear to me how you intend for this to be used.

A debugfs switch means that this is being enabled and disabled on a
per-session basis. Is the user supposed to turn this on for all, or just
one session? How do they know?

Is this something we expect people to just turn on briefly when they are
experiencing a problem, or is this something that we expect to be turned
on and left on for long periods of time?

If it's the latter then setting up a mount in /etc/fstab is not going to
be sufficient for an admin. She'll have to write a script or something
that goes in after the mount and enables this by writing to debugfs
after rebooting. Yuck.
On 2020/2/6 19:31, Jeff Layton wrote:
> On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
>> On 2020/2/6 5:43, Jeff Layton wrote:
>>> On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
>> [...]
>>> I'd like to hear more about how we expect users to use this facility.
>>> This debugfs file doesn't seem consistent with the rest of the UI, and I
>>> imagine if the box reboots you'd have to (manually) re-enable it after
>>> mount, right? Maybe this should be a mount option instead?
>> A mount option means we must do the unmounting to disable it.
>>
> Technically, no. You could wire it up so that you could enable and
> disable it via -o remount. For example:
>
>     # mount -o remount,metrics=disabled

Yeah, this is cool.

> Another option might be a module parameter if this is something that you
> really want to be global (and not per-mount or per-session).
>
>> I was thinking that with the debugfs file we can do debugging or tuning
>> even in production setups at any time; usually this should be disabled,
>> since it will send the metrics every second.
>>
> Meh, one frame per second doesn't seem like it'll add much overhead.

Okay.

> Also, why one update per second? Should that interval be tunable?

Per second just keeps it the same as the fuse client.

>> Or we could merge "sending_metrics" into the "metrics" UI: just write
>> "enable"/"disable" to enable/disable sending the metrics to ceph, just
>> like "reset" does to clean the metrics.
>>
>> Then "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be writable
>> with:
>>
>> "reset"   --> clean and reset the metrics counters
>>
>> "enable"  --> enable sending metrics to the ceph cluster
>>
>> "disable" --> disable sending metrics to the ceph cluster
>>
>> Will this be better?
>>
> I guess it's not clear to me how you intend for this to be used.
>
> A debugfs switch means that this is being enabled and disabled on a
> per-session basis. Is the user supposed to turn this on for all, or just
> one session? How do they know?

Not for all, just per-superblock.

> Is this something we expect people to just turn on briefly when they are
> experiencing a problem, or is this something that we expect to be turned
> on and left on for long periods of time?

If this won't add much overhead even per second, let's keep sending the
metrics to ceph always; then the mount option for this switch is not
needed any more.

And there is already a switch to enable/disable showing the metrics on
the ceph side; if we add another switch per client, it will also be
yucky for admins.

Let's make the update interval tunable, with per second as the default.
Maybe we should make this a global UI for all clients?

Is this okay?

Thanks.

> If it's the latter then setting up a mount in /etc/fstab is not going to
> be sufficient for an admin. She'll have to write a script or something
> that goes in after the mount and enables this by writing to debugfs
> after rebooting. Yuck.
>
On Thu, 2020-02-06 at 20:26 +0800, Xiubo Li wrote:
> On 2020/2/6 19:31, Jeff Layton wrote:
> > On Thu, 2020-02-06 at 10:36 +0800, Xiubo Li wrote:
> > > On 2020/2/6 5:43, Jeff Layton wrote:
> > > > On Wed, 2020-01-29 at 03:27 -0500, xiubli@redhat.com wrote:
> > > [...]
> > > > I'd like to hear more about how we expect users to use this facility.
> > > > This debugfs file doesn't seem consistent with the rest of the UI, and I
> > > > imagine if the box reboots you'd have to (manually) re-enable it after
> > > > mount, right? Maybe this should be a mount option instead?
> > > A mount option means we must do the unmounting to disable it.
> > >
> > Technically, no. You could wire it up so that you could enable and
> > disable it via -o remount. For example:
> >
> >     # mount -o remount,metrics=disabled
>
> Yeah, this is cool.
>
> > Another option might be a module parameter if this is something that you
> > really want to be global (and not per-mount or per-session).
> >
> > > I was thinking that with the debugfs file we can do debugging or tuning
> > > even in production setups at any time; usually this should be disabled,
> > > since it will send the metrics every second.
> > >
> > Meh, one frame per second doesn't seem like it'll add much overhead.
>
> Okay.
>
> > Also, why one update per second? Should that interval be tunable?
>
> Per second just keeps it the same as the fuse client.
>

Ok.

> > > Or we could merge "sending_metrics" into the "metrics" UI: just write
> > > "enable"/"disable" to enable/disable sending the metrics to ceph, just
> > > like "reset" does to clean the metrics.
> > >
> > > Then "/sys/kernel/debug/ceph/XXX.clientYYY/metrics" could be writable
> > > with:
> > >
> > > "reset"   --> clean and reset the metrics counters
> > >
> > > "enable"  --> enable sending metrics to the ceph cluster
> > >
> > > "disable" --> disable sending metrics to the ceph cluster
> > >
> > > Will this be better?
> > >
> > I guess it's not clear to me how you intend for this to be used.
> >
> > A debugfs switch means that this is being enabled and disabled on a
> > per-session basis. Is the user supposed to turn this on for all, or just
> > one session? How do they know?
>
> Not for all, just per-superblock.
>

If it's per-superblock, then a debugfs-based switch seems particularly
ill-suited for this, as that's really a per-session interface.

> > Is this something we expect people to just turn on briefly when they are
> > experiencing a problem, or is this something that we expect to be turned
> > on and left on for long periods of time?
>
> If this won't add much overhead even per second, let's keep sending the
> metrics to ceph always; then the mount option for this switch is not
> needed any more.
>

Note that I don't really _know_ that it won't be a problem, just that it
doesn't sound too bad. I think we probably will want some mechanism to
enable/disable this until we have some experience with it in the field.

> And there is already a switch to enable/disable showing the metrics on
> the ceph side; if we add another switch per client, it will also be
> yucky for admins.
>
> Let's make the update interval tunable, with per second as the default.
> Maybe we should make this a global UI for all clients?
>

If you want a global setting for the interval that would take effect on
all ceph mounts, then maybe a "metric_send_interval" module parameter
would be best. Make it an unsigned int, and allow the admin to set it to
0 to turn off stats transmission in the client.

We have a well-defined interface for setting module parameters on most
distros (via /etc/modprobe.d/), so that would be better than monkeying
around with debugfs here, IMO.

As to the default, it might be best to have this default to 0 initially.
Once we have more experience with it we could make it default to 1 in a
later release.
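For concreteness, the module parameter described above could be declared
roughly like this (a sketch; the exact placement and wording are
assumptions, the parameter name follows the discussion):

	/* 0 = don't send metrics (default); N > 0 = send every N seconds */
	static unsigned int metric_send_interval;
	module_param(metric_send_interval, uint, 0644);
	MODULE_PARM_DESC(metric_send_interval,
			 "Interval (in seconds) for sending perf metrics to the MDS, 0 to disable (default: 0)");

An admin could then persist the setting across reboots with a line such
as `options ceph metric_send_interval=1` in a file under
/etc/modprobe.d/.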
On 2020/2/6 23:21, Jeff Layton wrote:
> On Thu, 2020-02-06 at 20:26 +0800, Xiubo Li wrote:
>> On 2020/2/6 19:31, Jeff Layton wrote:
[...]
>>> Is this something we expect people to just turn on briefly when they are
>>> experiencing a problem, or is this something that we expect to be turned
>>> on and left on for long periods of time?
>> If this won't add much overhead even per second, let's keep sending the
>> metrics to ceph always; then the mount option for this switch is not
>> needed any more.
>>
> Note that I don't really _know_ that it won't be a problem, just that it
> doesn't sound too bad. I think we probably will want some mechanism to
> enable/disable this until we have some experience with it in the field.
>
>> And there is already a switch to enable/disable showing the metrics on
>> the ceph side; if we add another switch per client, it will also be
>> yucky for admins.
>>
>> Let's make the update interval tunable, with per second as the default.
>> Maybe we should make this a global UI for all clients?
>>
> If you want a global setting for the interval that would take effect on
> all ceph mounts, then maybe a "metric_send_interval" module parameter
> would be best. Make it an unsigned int, and allow the admin to set it to
> 0 to turn off stats transmission in the client.
>
> We have a well-defined interface for setting module parameters on most
> distros (via /etc/modprobe.d/), so that would be better than monkeying
> around with debugfs here, IMO.
>
> As to the default, it might be best to have this default to 0 initially.
> Once we have more experience with it we could make it default to 1 in a
> later release.

Yeah, this makes sense. Let's switch to the module parameter
"metric_send_interval"; at the same time this will also act as a switch:
0 means off, >0 will be the interval.

Thanks,
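Putting the pieces together, the rescheduling helper from the patch
might consume such a parameter roughly as follows (sketch only; it also
folds in the earlier review comment that schedule_delayed() should be
called with mdsc->mutex already held):

	#define CEPH_WORK_DELAY_DEF 5

	/* caller holds mdsc->mutex */
	static void schedule_delayed(struct ceph_mds_client *mdsc)
	{
		int delay = CEPH_WORK_DELAY_DEF;

		/* a nonzero metric_send_interval enables sending and sets the pace */
		if (metric_send_interval)
			delay = min_t(int, delay, metric_send_interval);

		schedule_delayed_work(&mdsc->delayed_work,
				      round_jiffies_relative(HZ * delay));
	}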