[v2,2/4] trace-cmd library: Extend host-guest time sync with fraction bits

Message ID: 20211014150204.2485499-3-tz.stoyanov@gmail.com (mailing list archive)
State: Accepted
Commit: f0b3ed1222dd92d796dca5e0e46dc03558915abb
Series: trace-cmd: Align guest TSC calculation with the kernel

Commit Message

Tzvetomir Stoyanov (VMware) Oct. 14, 2021, 3:02 p.m. UTC
The formula used in the Linux kernel for the guest TSC clock is:
 guest_tsc = tsc-offset + ((host_tsc * tsc-scaling-ratio) >> tsc-scaling-ratio-frac-bits)
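
For illustration only, the same arithmetic as a minimal user-space C
sketch (the function name is made up for this example; a 128-bit
intermediate avoids overflowing the product before the shift):

 static unsigned long long scale_host_tsc(unsigned long long host_tsc,
					  long long tsc_offset,
					  unsigned long long ratio,
					  unsigned int frac_bits)
 {
	unsigned long long scaled;

	/* (host_tsc * ratio) >> frac_bits, computed in 128 bits */
	scaled = (unsigned long long)(((unsigned __int128)host_tsc * ratio) >> frac_bits);
	return scaled + tsc_offset;
 }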

The KVM hypervisor exports these parameters in debugfs, if clock
scaling is supported:
 /sys/kernel/debug/kvm:
	tsc-offset
	tsc-scaling-ratio
	tsc-scaling-ratio-frac-bits
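
Each of these files holds a single decimal value. Reading one reduces
to something like the hedged sketch below, close in spirit to
read_ll_from_file() in the patch (error handling trimmed, the path is
supplied by the caller):

 #include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>

 /* Read a single decimal value from a KVM debugfs file, 0 on success. */
 static int read_debugfs_ll(const char *path, long long *res)
 {
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	*res = strtoll(buf, NULL, 10);
	return 0;
 }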

The current trace-cmd implementation does not take the fraction bits
into account when calculating the guest clock offset in case of clock
scaling. The TRACECMD_OPTION_TIME_SHIFT option is extended with a
per-CPU array of fraction bits. The KVM and PTP algorithms are extended
to provide the fraction:
 - PTP uses a fixed scaling=1 and fraction=0;
 - KVM reads these from the KVM debugfs, if available.
The kernel formula is applied in the trace-cmd library when calculating
the timestamps of the guest events.
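
From a library caller's point of view, the visible change is the extra
out-parameter of tracecmd_tsync_get_offsets(); a usage sketch (the
helper name is illustrative; passing NULL for an out-parameter skips
that array, as the writer loops below do):

 /* Illustrative only; assumes trace-cmd-private.h and <stdio.h>. */
 static int dump_sync_samples(struct tracecmd_time_sync *tsync, int cpu)
 {
	long long *ts, *offsets, *scalings, *frac;
	int count, i;

	if (tracecmd_tsync_get_offsets(tsync, cpu, &count, &ts,
				       &offsets, &scalings, &frac) < 0)
		return -1;
	for (i = 0; i < count; i++)
		printf("ts %lld offset %lld scaling %lld frac %lld\n",
		       ts[i], offsets[i], scalings[i], frac[i]);
	return 0;
 }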

This change is backward compatible with trace-cmd 2.9, where the
TRACECMD_OPTION_TIME_SHIFT option was introduced.

Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
---
 .../include/private/trace-cmd-private.h       |  4 +-
 lib/trace-cmd/include/trace-tsync-local.h     |  7 ++-
 lib/trace-cmd/trace-timesync-kvm.c            | 56 ++++++++++++++-----
 lib/trace-cmd/trace-timesync-ptp.c            |  4 +-
 lib/trace-cmd/trace-timesync.c                | 55 ++++++++++++++----
 5 files changed, 96 insertions(+), 30 deletions(-)

Patch

diff --git a/lib/trace-cmd/include/private/trace-cmd-private.h b/lib/trace-cmd/include/private/trace-cmd-private.h
index c58b09e9..f40babef 100644
--- a/lib/trace-cmd/include/private/trace-cmd-private.h
+++ b/lib/trace-cmd/include/private/trace-cmd-private.h
@@ -454,8 +454,8 @@  tracecmd_tsync_with_guest(unsigned long long trace_id, int loop_interval,
 			  int guest_cpus, const char *proto_name, const char *clock);
 int tracecmd_tsync_with_guest_stop(struct tracecmd_time_sync *tsync);
 int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu,
-				int *count, long long **ts,
-				long long **offsets, long long **scalings);
+			       int *count, long long **ts,
+			       long long **offsets, long long **scalings, long long **frac);
 int tracecmd_tsync_get_session_params(struct tracecmd_time_sync *tsync,
 				      char **selected_proto,
 				      unsigned int *tsync_port);
diff --git a/lib/trace-cmd/include/trace-tsync-local.h b/lib/trace-cmd/include/trace-tsync-local.h
index 7e56d187..b357662c 100644
--- a/lib/trace-cmd/include/trace-tsync-local.h
+++ b/lib/trace-cmd/include/trace-tsync-local.h
@@ -27,14 +27,15 @@  struct tracecmd_time_sync {
 struct clock_sync_offsets {
 	/* Arrays with calculated time offsets at given time */
 	int				sync_size;	/* Allocated size of sync_ts,
-							 * sync_offsets and sync_scalings
+							 * sync_offsets, sync_scalings and sync_frac
 							 */
 	int				sync_count;	/* Number of elements in sync_ts,
-							 * sync_offsets and sync_scalings
+							 * sync_offsets, sync_scalings and sync_frac
 							 */
 	long long			*sync_ts;
 	long long			*sync_offsets;
 	long long			*sync_scalings;
+	long long			*sync_frac;
 };
 
 struct clock_sync_context {
@@ -60,7 +61,7 @@  int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int role
 				  int (*init)(struct tracecmd_time_sync *),
 				  int (*free)(struct tracecmd_time_sync *),
 				  int (*calc)(struct tracecmd_time_sync *,
-					      long long *, long long *,
+					      long long *, long long *, long long*,
 					      long long *, unsigned int));
 int tracecmd_tsync_proto_unregister(char *proto_name);
 
diff --git a/lib/trace-cmd/trace-timesync-kvm.c b/lib/trace-cmd/trace-timesync-kvm.c
index 6f76fe1f..8d531922 100644
--- a/lib/trace-cmd/trace-timesync-kvm.c
+++ b/lib/trace-cmd/trace-timesync-kvm.c
@@ -19,6 +19,7 @@ 
 #define KVM_DEBUG_FS "/sys/kernel/debug/kvm"
 #define KVM_DEBUG_OFFSET_FILE	"tsc-offset"
 #define KVM_DEBUG_SCALING_FILE	"tsc-scaling-ratio"
+#define KVM_DEBUG_FRACTION_FILE	"tsc-scaling-ratio-frac-bits"
 #define KVM_DEBUG_VCPU_DIR	"vcpu"
 
 /* default KVM scaling values, taken from the Linux kernel */
@@ -37,6 +38,7 @@  struct kvm_clock_sync {
 	int vcpu_count;
 	char **vcpu_offsets;
 	char **vcpu_scalings;
+	char **vcpu_frac;
 	int marker_fd;
 	struct tep_handle *tep;
 	int raw_id;
@@ -47,6 +49,7 @@  struct kvm_clock_offset_msg {
 	s64	ts;
 	s64	offset;
 	s64	scaling;
+	s64	frac;
 };
 
 static int read_ll_from_file(char *file, long long *res)
@@ -72,21 +75,27 @@  static int read_ll_from_file(char *file, long long *res)
 
 static bool kvm_scaling_check_vm_cpu(char *vname, char *cpu)
 {
-	long long scaling;
+	long long scaling, frac;
+	bool has_scaling = false;
+	bool has_frac = false;
 	char *path;
 	int ret;
 
 	if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_SCALING_FILE) < 0)
-		return true;
+		return false;
 	ret = read_ll_from_file(path, &scaling);
 	free(path);
+	if (!ret)
+		has_scaling = true;
+
+	if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_FRACTION_FILE) < 0)
+		return false;
+	ret = read_ll_from_file(path, &frac);
+	free(path);
+	if (!ret)
+		has_frac = true;
 
-	/*
-	 * If there is a scaling, different from the default -
-	 * return false, not supported.
-	 */
-	if (!ret &&
-	    scaling != KVM_SCALING_AMD_DEFAULT && scaling != KVM_SCALING_INTEL_DEFAULT)
+	if (has_scaling != has_frac)
 		return false;
 
 	return true;
@@ -173,6 +182,11 @@  static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
 					 dir_str, entry->d_name);
 				kvm->vcpu_scalings[cpu] = strdup(path);
 			}
+			if (!strcmp(entry->d_name, KVM_DEBUG_FRACTION_FILE)) {
+				snprintf(path, sizeof(path), "%s/%s",
+					 dir_str, entry->d_name);
+				kvm->vcpu_frac[cpu] = strdup(path);
+			}
 		}
 	}
 	if (!kvm->vcpu_offsets[cpu])
@@ -187,6 +201,8 @@  error:
 	kvm->vcpu_offsets[cpu] = NULL;
 	free(kvm->vcpu_scalings[cpu]);
 	kvm->vcpu_scalings[cpu] = NULL;
+	free(kvm->vcpu_frac[cpu]);
+	kvm->vcpu_frac[cpu] = NULL;
 	return -1;
 }
 
@@ -252,7 +268,8 @@  static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync,
 	kvm->vcpu_count = tsync->vcpu_count;
 	kvm->vcpu_offsets = calloc(kvm->vcpu_count, sizeof(char *));
 	kvm->vcpu_scalings = calloc(kvm->vcpu_count, sizeof(char *));
-	if (!kvm->vcpu_offsets || !kvm->vcpu_scalings)
+	kvm->vcpu_frac = calloc(kvm->vcpu_count, sizeof(char *));
+	if (!kvm->vcpu_offsets || !kvm->vcpu_scalings || !kvm->vcpu_frac)
 		goto error;
 	if (kvm_open_debug_files(kvm, tsync->guest_pid) < 0)
 		goto error;
@@ -261,6 +278,7 @@  static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync,
 error:
 	free(kvm->vcpu_offsets);
 	free(kvm->vcpu_scalings);
+	free(kvm->vcpu_frac);
 	return -1;
 }
 
@@ -351,6 +369,8 @@  static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync)
 			kvm->vcpu_offsets[i] = NULL;
 			free(kvm->vcpu_scalings[i]);
 			kvm->vcpu_scalings[i] = NULL;
+			free(kvm->vcpu_frac[i]);
+			kvm->vcpu_frac[i] = NULL;
 		}
 		if (kvm->tep)
 			tep_free(kvm->tep);
@@ -362,7 +382,7 @@  static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync)
 }
 
 static int kvm_clock_host(struct tracecmd_time_sync *tsync,
-			  long long *offset, long long *scaling,
+			  long long *offset, long long *scaling, long long *frac,
 			  long long *timestamp, unsigned int cpu)
 {
 	char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
@@ -372,6 +392,7 @@  static int kvm_clock_host(struct tracecmd_time_sync *tsync,
 	long long kvm_scaling = 1;
 	unsigned int sync_msg;
 	long long kvm_offset;
+	long long kvm_frac = 0;
 	unsigned int size;
 	char *msg;
 	int ret;
@@ -386,12 +407,16 @@  static int kvm_clock_host(struct tracecmd_time_sync *tsync,
 	ret = read_ll_from_file(kvm->vcpu_offsets[cpu], &kvm_offset);
 	if (ret < 0)
 		return -1;
+
 	if (kvm->vcpu_scalings && kvm->vcpu_scalings[cpu]) {
 		read_ll_from_file(kvm->vcpu_scalings[cpu], &kvm_scaling);
 		if (kvm_scaling == KVM_SCALING_AMD_DEFAULT ||
 		    kvm_scaling == KVM_SCALING_INTEL_DEFAULT)
 			kvm_scaling = 1;
 	}
+
+	if (kvm->vcpu_frac && kvm->vcpu_frac[cpu])
+		ret = read_ll_from_file(kvm->vcpu_frac[cpu], &kvm_frac);
 	msg = (char *)&packet;
 	size = sizeof(packet);
 	ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
@@ -403,6 +428,7 @@  static int kvm_clock_host(struct tracecmd_time_sync *tsync,
 
 	packet.offset = -kvm_offset;
 	packet.scaling = kvm_scaling;
+	packet.frac = kvm_frac;
 	ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME,
 					  KVM_SYNC_PKT_RESPONSE, sizeof(packet),
 					  (char *)&packet);
@@ -411,6 +437,7 @@  static int kvm_clock_host(struct tracecmd_time_sync *tsync,
 
 	*scaling = packet.scaling;
 	*offset = packet.offset;
+	*frac = kvm_frac;
 	*timestamp = packet.ts;
 
 	return 0;
@@ -442,10 +469,10 @@  static int kvm_marker_find(struct tep_event *event, struct tep_record *record,
 	return 0;
 }
 
-
 static int kvm_clock_guest(struct tracecmd_time_sync *tsync,
 			   long long *offset,
 			   long long *scaling,
+			   long long *frac,
 			   long long *timestamp)
 {
 	char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
@@ -486,12 +513,13 @@  static int kvm_clock_guest(struct tracecmd_time_sync *tsync,
 
 	*scaling = packet.scaling;
 	*offset = packet.offset;
+	*frac = packet.frac;
 	*timestamp = packet.ts;
 	return 0;
 }
 
 static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync,
-			       long long *offset, long long *scaling,
+			       long long *offset, long long *scaling, long long *frac,
 			       long long *timestamp, unsigned int cpu)
 {
 	struct clock_sync_context *clock_context;
@@ -503,9 +531,9 @@  static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync,
 	clock_context = (struct clock_sync_context *)tsync->context;
 
 	if (clock_context->is_guest)
-		ret = kvm_clock_guest(tsync, offset, scaling, timestamp);
+		ret = kvm_clock_guest(tsync, offset, scaling, frac, timestamp);
 	else
-		ret = kvm_clock_host(tsync, offset, scaling, timestamp, cpu);
+		ret = kvm_clock_host(tsync, offset, scaling, frac, timestamp, cpu);
 	return ret;
 }
 
diff --git a/lib/trace-cmd/trace-timesync-ptp.c b/lib/trace-cmd/trace-timesync-ptp.c
index b05f1cd0..70242ee3 100644
--- a/lib/trace-cmd/trace-timesync-ptp.c
+++ b/lib/trace-cmd/trace-timesync-ptp.c
@@ -663,7 +663,7 @@  static int ptp_clock_server(struct tracecmd_time_sync *tsync,
 }
 
 static int ptp_clock_sync_calc(struct tracecmd_time_sync *tsync,
-			       long long *offset, long long *scaling,
+			       long long *offset, long long *scaling, long long *frac,
 			       long long *timestamp, unsigned int cpu)
 {
 	struct clock_sync_context *clock_context;
@@ -689,6 +689,8 @@  static int ptp_clock_sync_calc(struct tracecmd_time_sync *tsync,
 
 	if (scaling)
 		*scaling = 1;
+	if (frac)
+		*frac = 0;
 	if (clock_context->is_server)
 		ret = ptp_clock_server(tsync, offset, timestamp);
 	else
diff --git a/lib/trace-cmd/trace-timesync.c b/lib/trace-cmd/trace-timesync.c
index 19ca19d7..f8ec15a3 100644
--- a/lib/trace-cmd/trace-timesync.c
+++ b/lib/trace-cmd/trace-timesync.c
@@ -36,7 +36,7 @@  struct tsync_proto {
 	int (*clock_sync_init)(struct tracecmd_time_sync *clock_context);
 	int (*clock_sync_free)(struct tracecmd_time_sync *clock_context);
 	int (*clock_sync_calc)(struct tracecmd_time_sync *clock_context,
-			       long long *offset, long long *scaling,
+			       long long *offset, long long *scaling, long long *frac,
 			       long long *timestamp, unsigned int cpu);
 };
 
@@ -76,7 +76,7 @@  int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int role
 				  int (*init)(struct tracecmd_time_sync *),
 				  int (*free)(struct tracecmd_time_sync *),
 				  int (*calc)(struct tracecmd_time_sync *,
-					      long long *, long long *,
+					      long long *, long long *, long long *,
 					      long long *, unsigned int))
 {
 	struct tsync_proto *proto = NULL;
@@ -137,12 +137,13 @@  bool __hidden tsync_proto_is_supported(const char *proto_name)
  * @ts: Array of size @count containing timestamps of callculated offsets
  * @offsets: array of size @count, containing offsets for each timestamp
  * @scalings: array of size @count, containing scaling ratios for each timestamp
+ * @frac: array of size @count, containing fraction bits for each timestamp
  *
  * Retuns -1 in case of an error, or 0 otherwise
  */
 int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu,
 			       int *count, long long **ts,
-			       long long **offsets, long long **scalings)
+			       long long **offsets, long long **scalings, long long **frac)
 {
 	struct clock_sync_context *tsync_context;
 
@@ -159,6 +160,8 @@  int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu,
 		*offsets = tsync_context->offsets[cpu].sync_offsets;
 	if (scalings)
 		*scalings = tsync_context->offsets[cpu].sync_scalings;
+	if (frac)
+		*frac = tsync_context->offsets[cpu].sync_frac;
 
 	return 0;
 }
@@ -564,9 +567,11 @@  void tracecmd_tsync_free(struct tracecmd_time_sync *tsync)
 				free(tsync_context->offsets[i].sync_ts);
 				free(tsync_context->offsets[i].sync_offsets);
 				free(tsync_context->offsets[i].sync_scalings);
+				free(tsync_context->offsets[i].sync_frac);
 				tsync_context->offsets[i].sync_ts = NULL;
 				tsync_context->offsets[i].sync_offsets = NULL;
 				tsync_context->offsets[i].sync_scalings = NULL;
+				tsync_context->offsets[i].sync_frac = NULL;
 				tsync_context->offsets[i].sync_count = 0;
 				tsync_context->offsets[i].sync_size = 0;
 			}
@@ -643,10 +648,11 @@  static int tsync_send(struct tracecmd_time_sync *tsync,
 	long long timestamp = 0;
 	long long scaling = 0;
 	long long offset = 0;
+	long long frac = 0;
 	int ret;
 
 	old_set = pin_to_cpu(cpu);
-	ret = proto->clock_sync_calc(tsync, &offset, &scaling, &timestamp, cpu);
+	ret = proto->clock_sync_calc(tsync, &offset, &scaling, &frac, &timestamp, cpu);
 	if (old_set)
 		restore_pin_to_cpu(old_set);
 
@@ -685,10 +691,11 @@  static void tsync_with_host(struct tracecmd_time_sync *tsync)
 }
 
 static int record_sync_sample(struct clock_sync_offsets *offsets, int array_step,
-			      long long offset, long long scaling, long long ts)
+			      long long offset, long long scaling, long long frac, long long ts)
 {
 	long long *sync_scalings = NULL;
 	long long *sync_offsets = NULL;
+	long long *sync_frac = NULL;
 	long long *sync_ts = NULL;
 
 	if (offsets->sync_count >= offsets->sync_size) {
@@ -698,22 +705,27 @@  static int record_sync_sample(struct clock_sync_offsets *offsets, int array_step
 				       (offsets->sync_size + array_step) * sizeof(long long));
 		sync_scalings = realloc(offsets->sync_scalings,
 				       (offsets->sync_size + array_step) * sizeof(long long));
+		sync_frac = realloc(offsets->sync_frac,
+				    (offsets->sync_size + array_step) * sizeof(long long));
 
-		if (!sync_ts || !sync_offsets || !sync_scalings) {
+		if (!sync_ts || !sync_offsets || !sync_scalings || !sync_frac) {
 			free(sync_ts);
 			free(sync_offsets);
 			free(sync_scalings);
+			free(sync_frac);
 			return -1;
 		}
 		offsets->sync_size += array_step;
 		offsets->sync_ts = sync_ts;
 		offsets->sync_offsets = sync_offsets;
 		offsets->sync_scalings = sync_scalings;
+		offsets->sync_frac = sync_frac;
 	}
 
 	offsets->sync_ts[offsets->sync_count] = ts;
 	offsets->sync_offsets[offsets->sync_count] = offset;
 	offsets->sync_scalings[offsets->sync_count] = scaling;
+	offsets->sync_frac[offsets->sync_count] = frac;
 	offsets->sync_count++;
 
 	return 0;
@@ -726,9 +738,10 @@  static int tsync_get_sample(struct tracecmd_time_sync *tsync, unsigned int cpu,
 	long long timestamp = 0;
 	long long scaling = 0;
 	long long offset = 0;
+	long long frac = 0;
 	int ret;
 
-	ret = proto->clock_sync_calc(tsync, &offset, &scaling, &timestamp, cpu);
+	ret = proto->clock_sync_calc(tsync, &offset, &scaling, &frac, &timestamp, cpu);
 	if (ret) {
 		tracecmd_warning("Failed to synchronize timestamps with guest");
 		return -1;
@@ -739,7 +752,7 @@  static int tsync_get_sample(struct tracecmd_time_sync *tsync, unsigned int cpu,
 	if (!clock || cpu >= clock->cpu_count || !clock->offsets)
 		return -1;
 	return record_sync_sample(&clock->offsets[cpu], array_step,
-				  offset, scaling, timestamp);
+				  offset, scaling, frac, timestamp);
 }
 
 #define TIMER_SEC_NANO 1000000000LL
@@ -928,6 +941,7 @@  int tracecmd_write_guest_time_shift(struct tracecmd_output *handle,
 	unsigned int flags;
 	long long *scalings = NULL;
 	long long *offsets = NULL;
+	long long *frac = NULL;
 	long long *ts = NULL;
 	int vcount;
 	int count;
@@ -936,7 +950,7 @@  int tracecmd_write_guest_time_shift(struct tracecmd_output *handle,
 
 	if (!tsync->vcpu_count)
 		return -1;
-	vcount = 3 + (4 * tsync->vcpu_count);
+	vcount = 3 + (5 * tsync->vcpu_count);
 	vector = calloc(vcount, sizeof(struct iovec));
 	if (!vector)
 		return -1;
@@ -955,7 +969,7 @@  int tracecmd_write_guest_time_shift(struct tracecmd_output *handle,
 		if (j >= vcount)
 			break;
 		ret = tracecmd_tsync_get_offsets(tsync, i, &count,
-						 &ts, &offsets, &scalings);
+						 &ts, &offsets, &scalings, NULL);
 		if (ret < 0 || !count || !ts || !offsets || !scalings)
 			break;
 		vector[j].iov_len = 4;
@@ -971,6 +985,27 @@  int tracecmd_write_guest_time_shift(struct tracecmd_output *handle,
 		ret = -1;
 		goto out;
 	}
+	/*
+	 * Writing fraction bits into the option is implemented in a separate loop for
+	 * backward compatibility. In the trace-cmd 2.9 release, this option has only offset
+	 * and scaling. That legacy code must work with the new extended option.
+	 *
+	 */
+	for (i = 0; i < tsync->vcpu_count; i++) {
+		if (j >= vcount)
+			break;
+		ret = tracecmd_tsync_get_offsets(tsync, i, NULL,
+						 NULL, NULL, NULL, &frac);
+		if (ret < 0)
+			break;
+		vector[j].iov_len = 8 * count;
+		vector[j++].iov_base = frac;
+	}
+	if (i < tsync->vcpu_count) {
+		ret = -1;
+		goto out;
+	}
+
 	tracecmd_add_option_v(handle, TRACECMD_OPTION_TIME_SHIFT, vector, vcount);
 #ifdef TSYNC_DEBUG
 	if (count > 1)