@@ -124,6 +124,40 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
+/*
+ * metrics debugfs
+ */
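+/*
+ * Writing 1 to the "sending_metrics" debugfs file enables the periodic
+ * metric reports to the MDS, writing 0 (the default) disables them;
+ * any other value is rejected with -EINVAL.
+ */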
+static int sending_metrics_set(void *data, u64 val)
+{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ if (val > 1) {
+ pr_err("Invalid sending metrics set value %llu\n", val);
+ return -EINVAL;
+ }
+
+ mutex_lock(&mdsc->mutex);
+ mdsc->sending_metrics = (unsigned int)val;
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
+static int sending_metrics_get(void *data, u64 *val)
+{
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ mutex_lock(&mdsc->mutex);
+ *val = (u64)mdsc->sending_metrics;
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(sending_metrics_fops, sending_metrics_get,
+ sending_metrics_set, "%llu\n");
+
static int metric_show(struct seq_file *s, void *p)
{
struct ceph_fs_client *fsc = s->private;
@@ -308,11 +342,9 @@ static int congestion_kb_get(void *data, u64 *val)
*val = (u64)fsc->mount_options->congestion_kb;
return 0;
}
-
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
congestion_kb_set, "%llu\n");
-
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
dout("ceph_fs_debugfs_cleanup\n");
@@ -322,6 +354,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
debugfs_remove(fsc->debugfs_mds_sessions);
debugfs_remove(fsc->debugfs_caps);
debugfs_remove(fsc->debugfs_metric);
+ debugfs_remove(fsc->debugfs_sending_metrics);
debugfs_remove(fsc->debugfs_mdsc);
}
@@ -362,6 +395,13 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
fsc,
&mdsc_show_fops);
+ fsc->debugfs_sending_metrics =
+ debugfs_create_file("sending_metrics",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &sending_metrics_fops);
+
fsc->debugfs_metric = debugfs_create_file("metrics",
0400,
fsc->client->debugfs_dir,
@@ -4149,13 +4149,162 @@ void ceph_mdsc_update_metadata_latency(struct ceph_client_metric *m,
spin_unlock(&m->metadata_lock);
}
+/*
+ * Build a CEPH_MSG_CLIENT_METRICS message and send it to the given MDS
+ * session.  Called under s_mutex.
+ */
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s,
+ bool skip_global)
+{
+ struct ceph_metric_head *head;
+ struct ceph_metric_cap *cap;
+ struct ceph_metric_dentry_lease *lease;
+ struct ceph_metric_read_latency *read;
+ struct ceph_metric_write_latency *write;
+ struct ceph_metric_metadata_latency *meta;
+ struct ceph_msg *msg;
+ struct timespec64 ts;
+ s32 len = sizeof(*head) + sizeof(*cap);
+ s64 sum, total, avg;
+ s32 items = 0;
+
+ if (!mdsc || !s)
+ return false;
+
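+ /*
+ * The per-session cap metric is always included; the global metrics
+ * (dentry lease and read/write/metadata latencies) are appended only
+ * when skip_global is false, so they are sent just once per tick.
+ */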
+ if (!skip_global) {
+ len += sizeof(*lease);
+ len += sizeof(*read);
+ len += sizeof(*write);
+ len += sizeof(*meta);
+ }
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+ if (!msg) {
+ pr_err("send metrics to mds%d, failed to allocate message\n",
+ s->s_mds);
+ return false;
+ }
+
+ head = msg->front.iov_base;
+
+ /* encode the cap metric */
+ cap = (struct ceph_metric_cap *)(head + 1);
+ cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+ cap->ver = 1;
+ cap->compat = 1;
+ cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+ cap->hit = cpu_to_le64(percpu_counter_sum(&s->i_caps_hit));
+ cap->mis = cpu_to_le64(percpu_counter_sum(&s->i_caps_mis));
+ cap->total = cpu_to_le64(s->s_nr_caps);
+ items++;
+
+ dout("cap metric hit %lld, mis %lld, total caps %lld",
+ le64_to_cpu(cap->hit), le64_to_cpu(cap->mis),
+ le64_to_cpu(cap->total));
+
+ /* only send the global once */
+ if (skip_global)
+ goto skip_global;
+
+ /* encode the dentry lease metric */
+ lease = (struct ceph_metric_dentry_lease *)(cap + 1);
+ lease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+ lease->ver = 1;
+ lease->compat = 1;
+ lease->data_len = cpu_to_le32(sizeof(*lease) - 10);
+ lease->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_hit));
+ lease->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.d_lease_mis));
+ lease->total = cpu_to_le64(atomic64_read(&mdsc->metric.total_dentries));
+ items++;
+
+ dout("dentry lease metric hit %lld, mis %lld, total dentries %lld",
+ le64_to_cpu(lease->hit), le64_to_cpu(lease->mis),
+ le64_to_cpu(lease->total));
+
+ /* encode the read latency metric */
+ read = (struct ceph_metric_read_latency *)(lease + 1);
+ read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+ read->ver = 1;
+ read->compat = 1;
+ read->data_len = cpu_to_le32(sizeof(*read) - 10);
+ spin_lock(&mdsc->metric.read_lock);
+ total = atomic64_read(&mdsc->metric.total_reads),
+ sum = timespec64_to_ns(&mdsc->metric.read_latency_sum);
+ spin_unlock(&mdsc->metric.read_lock);
+ avg = total ? sum / total : 0;
+ ts = ns_to_timespec64(avg);
+ read->sec = cpu_to_le32(ts.tv_sec);
+ read->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ dout("read latency metric total %lld, sum lat %lld, avg lat %lld",
+ total, sum, avg);
+
+ /* encode the write latency metric */
+ write = (struct ceph_metric_write_latency *)(read + 1);
+ write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+ write->ver = 1;
+ write->compat = 1;
+ write->data_len = cpu_to_le32(sizeof(*write) - 10);
+ spin_lock(&mdsc->metric.write_lock);
+ total = atomic64_read(&mdsc->metric.total_writes),
+ sum = timespec64_to_ns(&mdsc->metric.write_latency_sum);
+ spin_unlock(&mdsc->metric.write_lock);
+ avg = total ? sum / total : 0;
+ ts = ns_to_timespec64(avg);
+ write->sec = cpu_to_le32(ts.tv_sec);
+ write->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ dout("write latency metric total %lld, sum lat %lld, avg lat %lld",
+ total, sum, avg);
+
+ /* encode the metadata latency metric */
+ meta = (struct ceph_metric_metadata_latency *)(write + 1);
+ meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+ meta->ver = 1;
+ meta->compat = 1;
+ meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+ spin_lock(&mdsc->metric.metadata_lock);
+ total = atomic64_read(&mdsc->metric.total_metadatas),
+ sum = timespec64_to_ns(&mdsc->metric.metadata_latency_sum);
+ spin_unlock(&mdsc->metric.metadata_lock);
+ avg = total ? sum / total : 0;
+ ts = ns_to_timespec64(avg);
+ meta->sec = cpu_to_le32(ts.tv_sec);
+ meta->nsec = cpu_to_le32(ts.tv_nsec);
+ items++;
+
+ dout("metadata latency metric total %lld, sum lat %lld, avg lat %lld",
+ total, sum, avg);
+
+skip_global:
+ put_unaligned_le32(items, &head->num);
+ msg->front.iov_len = len;
+ msg->hdr.version = cpu_to_le16(1);
+ msg->hdr.compat_version = cpu_to_le16(1);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send metrics to mds%d %p\n", s->s_mds, msg);
+ ceph_con_send(&s->s_con, msg);
+
+ return true;
+}
+
/*
* delayed work -- periodically trim expired leases, renew caps with mds
*/
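+/*
+ * Default delayed-work period in seconds.  While metric sending is
+ * enabled, the work runs every second instead and only every
+ * CEPH_WORK_DELAY_DEF-th tick does the regular lease/cap maintenance.
+ */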
+#define CEPH_WORK_DELAY_DEF 5
static void schedule_delayed(struct ceph_mds_client *mdsc)
{
- int delay = 5;
- unsigned hz = round_jiffies_relative(HZ * delay);
+ unsigned int hz;
+ int delay = CEPH_WORK_DELAY_DEF;
+
+ mutex_lock(&mdsc->mutex);
+ if (mdsc->sending_metrics)
+ delay = 1;
+ mutex_unlock(&mdsc->mutex);
+
+ hz = round_jiffies_relative(HZ * delay);
schedule_delayed_work(&mdsc->delayed_work, hz);
}
@@ -4166,18 +4315,28 @@ static void delayed_work(struct work_struct *work)
container_of(work, struct ceph_mds_client, delayed_work.work);
int renew_interval;
int renew_caps;
+ bool metric_only;
+ bool sending_metrics;
+ bool g_skip = false;
dout("mdsc delayed_work\n");
mutex_lock(&mdsc->mutex);
- renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
- renew_caps = time_after_eq(jiffies, HZ*renew_interval +
- mdsc->last_renew_caps);
- if (renew_caps)
- mdsc->last_renew_caps = jiffies;
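+ /*
+ * Intermediate ticks are metric-only; the regular cap renewal and
+ * lease trimming below run only when metric_only is false.
+ */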
+ sending_metrics = !!mdsc->sending_metrics;
+ metric_only = mdsc->sending_metrics &&
+ (mdsc->ticks++ % CEPH_WORK_DELAY_DEF);
+
+ if (!metric_only) {
+ renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+ renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+ mdsc->last_renew_caps);
+ if (renew_caps)
+ mdsc->last_renew_caps = jiffies;
+ }
for (i = 0; i < mdsc->max_sessions; i++) {
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+
if (!s)
continue;
if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4203,13 +4362,20 @@ static void delayed_work(struct work_struct *work)
mutex_unlock(&mdsc->mutex);
mutex_lock(&s->s_mutex);
- if (renew_caps)
- send_renew_caps(mdsc, s);
- else
- ceph_con_keepalive(&s->s_con);
- if (s->s_state == CEPH_MDS_SESSION_OPEN ||
- s->s_state == CEPH_MDS_SESSION_HUNG)
- ceph_send_cap_releases(mdsc, s);
+
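+ /* only the first message sent this tick carries the global metrics */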
+ if (sending_metrics)
+ g_skip = ceph_mdsc_send_metrics(mdsc, s, g_skip);
+
+ if (!metric_only) {
+ if (renew_caps)
+ send_renew_caps(mdsc, s);
+ else
+ ceph_con_keepalive(&s->s_con);
+ if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+ s->s_state == CEPH_MDS_SESSION_HUNG)
+ ceph_send_cap_releases(mdsc, s);
+ }
+
mutex_unlock(&s->s_mutex);
ceph_put_mds_session(s);
@@ -4217,6 +4383,9 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
+ if (metric_only)
+ goto delay_work;
+
ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4225,11 +4394,13 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
+delay_work:
schedule_delayed(mdsc);
}
-static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
+static int ceph_mdsc_metric_init(struct ceph_mds_client *mdsc)
{
+ struct ceph_client_metric *metric = &mdsc->metric;
int ret;
if (!metric)
@@ -4257,6 +4428,8 @@ static int ceph_mdsc_metric_init(struct ceph_client_metric *metric)
memset(&metric->metadata_latency_sum, 0, sizeof(struct timespec64));
atomic64_set(&metric->total_metadatas, 0);
+ mdsc->sending_metrics = 0;
+ mdsc->ticks = 0;
return 0;
}
@@ -4313,7 +4486,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
atomic_set(&mdsc->cap_reclaim_pending, 0);
- err = ceph_mdsc_metric_init(&mdsc->metric);
+ err = ceph_mdsc_metric_init(mdsc);
if (err)
goto err_mdsmap;
@@ -468,6 +468,9 @@ struct ceph_mds_client {
struct list_head dentry_leases; /* fifo list */
struct list_head dentry_dir_leases; /* lru list */
+ /* metrics */
+ unsigned int sending_metrics;
+ unsigned int ticks;
struct ceph_client_metric metric;
spinlock_t snapid_map_lock;
@@ -128,6 +128,7 @@ struct ceph_fs_client {
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
struct dentry *debugfs_mdsc, *debugfs_mdsmap;
+ struct dentry *debugfs_sending_metrics;
struct dentry *debugfs_metric;
struct dentry *debugfs_mds_sessions;
#endif
@@ -130,6 +130,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_CLIENT_REQUEST 24
#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_METRICS 29
#define CEPH_MSG_CLIENT_CAPS 0x310
#define CEPH_MSG_CLIENT_LEASE 0x311
#define CEPH_MSG_CLIENT_SNAP 0x312
@@ -761,6 +762,82 @@ struct ceph_mds_lease {
} __attribute__ ((packed));
/* followed by a __le32+string for dname */
+enum ceph_metric_type {
+ CLIENT_METRIC_TYPE_CAP_INFO,
+ CLIENT_METRIC_TYPE_READ_LATENCY,
+ CLIENT_METRIC_TYPE_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/* metric caps header */
+struct ceph_metric_cap {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(hit + mis + total) */
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __attribute__ ((packed));
+
+/* metric dentry lease header */
+struct ceph_metric_dentry_lease {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(hit + mis + total) */
+ __le64 hit;
+ __le64 mis;
+ __le64 total;
+} __attribute__ ((packed));
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __attribute__ ((packed));
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __attribute__ ((packed));
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+ __le32 type; /* ceph metric type */
+
+ __u8 ver;
+ __u8 compat;
+
+ __le32 data_len; /* length of sizeof(sec + nsec) */
+ __le32 sec;
+ __le32 nsec;
+} __attribute__ ((packed));
+
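+/*
+ * A CEPH_MSG_CLIENT_METRICS payload is a ceph_metric_head followed by
+ * "num" metric records, each beginning with the common
+ * type/ver/compat/data_len header above.
+ */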
+struct ceph_metric_head {
+ __le32 num; /* the number of metrics that will be sent */
+} __attribute__ ((packed));
+
/* client reconnect */
struct ceph_mds_cap_reconnect {
__le64 cap_id;