[V7,vfio,07/10] vfio/mlx5: Create and destroy page tracker object

Message ID 20220908183448.195262-8-yishaih@nvidia.com (mailing list archive)
State New, archived
Series Add device DMA logging support for mlx5 driver

Commit Message

Yishai Hadas Sept. 8, 2022, 6:34 p.m. UTC
Add support for creating and destroying page tracker object.

This object is used to control/report the device dirty pages.

As part of creating the tracker, we need to consider the device capabilities
for the maximum number of ranges and adapt/combine ranges accordingly.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 drivers/vfio/pci/mlx5/cmd.c | 147 ++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/mlx5/cmd.h |   1 +
 2 files changed, 148 insertions(+)
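
To illustrate the combining policy outside the driver, here is a minimal
standalone userspace sketch. It is not the driver code (combine_ranges()
below operates on an interval tree); it only shows the policy: while more
ranges exist than the device supports, merge the two adjacent ranges
separated by the smallest gap. The example values are loosely modelled on
the QEMU traces later in this thread, and a 64-bit unsigned long is assumed.

#include <stdio.h>

struct range { unsigned long start, last; };

/*
 * Merge the pair of adjacent ranges separated by the smallest gap until
 * only req_nodes ranges remain -- the same policy as combine_ranges(),
 * but on a sorted array instead of an interval tree.
 */
static void combine(struct range *r, int *nr, int req_nodes)
{
	while (*nr > req_nodes) {
		unsigned long min_gap = ~0UL;
		int best = 1;
		int i;

		for (i = 1; i < *nr; i++) {
			unsigned long gap = r[i].start - r[i - 1].last;

			if (gap < min_gap) {
				min_gap = gap;
				best = i;
			}
		}
		/* Absorb r[best] into r[best - 1] and close the hole. */
		r[best - 1].last = r[best].last;
		for (i = best; i < *nr - 1; i++)
			r[i] = r[i + 1];
		(*nr)--;
	}
}

int main(void)
{
	struct range r[] = {
		{ 0x0,            0x807fffff },		/* low RAM */
		{ 0x100000000,    0x27fffffff },	/* RAM above 4G */
		{ 0x383800000000, 0x3838000fffff },	/* 64-bit PCI window */
	};
	int nr = 3, i;

	combine(r, &nr, 2);	/* pretend the device supports only 2 ranges */
	for (i = 0; i < nr; i++)
		printf("[%#lx - %#lx]\n", r[i].start, r[i].last);
	return 0;
}

The low RAM and high RAM ranges are the closest pair, so they are merged
into [0x0 - 0x27fffffff] while the distant PCI window stays separate.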

Comments

Cédric Le Goater Sept. 6, 2023, 8:55 a.m. UTC | #1
Hello,

On 9/8/22 20:34, Yishai Hadas wrote:
> Add support for creating and destroying page tracker object.
> 
> This object is used to control/report the device dirty pages.
> 
> As part of creating the tracker, we need to consider the device capabilities
> for the maximum number of ranges and adapt/combine ranges accordingly.
> 
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> ---
>   drivers/vfio/pci/mlx5/cmd.c | 147 ++++++++++++++++++++++++++++++++++++
>   drivers/vfio/pci/mlx5/cmd.h |   1 +
>   2 files changed, 148 insertions(+)
> 
> diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
> index 0a362796d567..f1cad96af6ab 100644
> --- a/drivers/vfio/pci/mlx5/cmd.c
> +++ b/drivers/vfio/pci/mlx5/cmd.c
> @@ -410,6 +410,148 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
>   	return err;
>   }
>   
> +static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
> +			   u32 req_nodes)
> +{
> +	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
> +	unsigned long min_gap;
> +	unsigned long curr_gap;
> +
> +	/* Special shortcut when a single range is required */
> +	if (req_nodes == 1) {
> +		unsigned long last;
> +
> +		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
> +		while (curr) {
> +			last = curr->last;
> +			prev = curr;
> +			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
> +			if (prev != comb_start)
> +				interval_tree_remove(prev, root);
> +		}
> +		comb_start->last = last;
> +		return;
> +	}
> +
> +	/* Combine ranges which have the smallest gap */
> +	while (cur_nodes > req_nodes) {
> +		prev = NULL;
> +		min_gap = ULONG_MAX;
> +		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
> +		while (curr) {
> +			if (prev) {
> +				curr_gap = curr->start - prev->last;
> +				if (curr_gap < min_gap) {
> +					min_gap = curr_gap;
> +					comb_start = prev;
> +					comb_end = curr;
> +				}
> +			}
> +			prev = curr;
> +			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
> +		}
> +		comb_start->last = comb_end->last;
> +		interval_tree_remove(comb_end, root);
> +		cur_nodes--;
> +	}
> +}
> +
> +static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
> +				 struct mlx5vf_pci_core_device *mvdev,
> +				 struct rb_root_cached *ranges, u32 nnodes)
> +{
> +	int max_num_range =
> +		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
> +	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
> +	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
> +	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
> +	struct interval_tree_node *node = NULL;
> +	u64 total_ranges_len = 0;
> +	u32 num_ranges = nnodes;
> +	u8 log_addr_space_size;
> +	void *range_list_ptr;
> +	void *obj_context;
> +	void *cmd_hdr;
> +	int inlen;
> +	void *in;
> +	int err;
> +	int i;
> +
> +	if (num_ranges > max_num_range) {
> +		combine_ranges(ranges, nnodes, max_num_range);
> +		num_ranges = max_num_range;
> +	}
> +
> +	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
> +				 record_size * num_ranges;
> +	in = kzalloc(inlen, GFP_KERNEL);
> +	if (!in)
> +		return -ENOMEM;
> +
> +	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
> +			       general_obj_in_cmd_hdr);
> +	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
> +		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
> +	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
> +		 MLX5_OBJ_TYPE_PAGE_TRACK);
> +	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
> +	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
> +	MLX5_SET(page_track, obj_context, track_type, 1);
> +	MLX5_SET(page_track, obj_context, log_page_size,
> +		 ilog2(tracker->host_qp->tracked_page_size));
> +	MLX5_SET(page_track, obj_context, log_msg_size,
> +		 ilog2(tracker->host_qp->max_msg_size));
> +	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
> +	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
> +
> +	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
> +	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
> +	for (i = 0; i < num_ranges; i++) {
> +		void *addr_range_i_base = range_list_ptr + record_size * i;
> +		unsigned long length = node->last - node->start;
> +
> +		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
> +			   node->start);
> +		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
> +		total_ranges_len += length;
> +		node = interval_tree_iter_next(node, 0, ULONG_MAX);
> +	}
> +
> +	WARN_ON(node);
> +	log_addr_space_size = ilog2(total_ranges_len);
> +	if (log_addr_space_size <
> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
> +	    log_addr_space_size >
> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
> +		err = -EOPNOTSUPP;
> +		goto out;
> +	}


We are seeing an issue with dirty page tracking when doing migration
of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
device complains when dirty page tracking is initialized from QEMU :

   qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation not supported)

The 64-bit computed range is  :

   vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff], 64:[0x100000000 - 0x3838000fffff]

which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
bits address space limitation for dirty tracking (min is 12). Is it a
FW tunable or a strict limitation ?

We should probably introduce more ranges to overcome the issue.

Thanks,

C.


> +	MLX5_SET(page_track, obj_context, log_addr_space_size,
> +		 log_addr_space_size);
> +	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
> +	if (err)
> +		goto out;
> +
> +	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
> +out:
> +	kfree(in);
> +	return err;
> +}
> +
> +static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
> +				      u32 tracker_id)
> +{
> +	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
> +	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
> +
> +	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
> +	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
> +	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
> +
> +	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
> +}
> +
>   static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
>   			     struct mlx5_vhca_cq_buf *buf, int nent,
>   			     int cqe_size)
> @@ -833,6 +975,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
>   
>   	WARN_ON(mvdev->mdev_detach);
>   
> +	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
>   	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
>   	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
>   	mlx5vf_destroy_qp(mdev, tracker->host_qp);
> @@ -941,6 +1084,10 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
>   
>   	tracker->host_qp = host_qp;
>   	tracker->fw_qp = fw_qp;
> +	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
> +	if (err)
> +		goto err_activate;
> +
>   	*page_size = host_qp->tracked_page_size;
>   	mvdev->log_active = true;
>   	mlx5vf_state_mutex_unlock(mvdev);
> diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
> index e71ec017bf04..658925ba5459 100644
> --- a/drivers/vfio/pci/mlx5/cmd.h
> +++ b/drivers/vfio/pci/mlx5/cmd.h
> @@ -80,6 +80,7 @@ struct mlx5_vhca_qp {
>   };
>   
>   struct mlx5_vhca_page_tracker {
> +	u32 id;
>   	u32 pdn;
>   	struct mlx5_uars_page *uar;
>   	struct mlx5_vhca_cq cq;
Yishai Hadas Sept. 6, 2023, 9:48 a.m. UTC | #2
On 06/09/2023 11:55, Cédric Le Goater wrote:
> Hello,
>
> On 9/8/22 20:34, Yishai Hadas wrote:
>> [...]
>> +	WARN_ON(node);
>> +	log_addr_space_size = ilog2(total_ranges_len);
>> +	if (log_addr_space_size <
>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>> +	    log_addr_space_size >
>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>> +		err = -EOPNOTSUPP;
>> +		goto out;
>> +	}
>
>
> We are seeing an issue with dirty page tracking when doing migration
> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
> device complains when dirty page tracking is initialized from QEMU :
>
>   qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 
> (Operation not supported)
>
> The 64-bit computed range is  :
>
>   vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff], 
> 64:[0x100000000 - 0x3838000fffff]
>
> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
> bits address space limitation for dirty tracking (min is 12). Is it a
> FW tunable or a strict limitation ?

It's mainly a FW limitation.

Tracking an address space larger than 2^42 might take a lot of time in FW
to allocate the required resources, which might end up in a command
timeout, etc.

>
> We should probably introduce more ranges to overcome the issue.

More ranges can help only if the total address space of the given ranges 
is < 2^42.

So, if there are some areas that don't require tracking (why?), breaking 
into more ranges with smaller total size can help.

Yishai
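
Putting numbers on the limit with the range from the failing trace (a
back-of-the-envelope sketch; it assumes pg_track_log_max_addr_space reports
42 as discussed above, ignores the ~2 GiB 32-bit range, which does not
change the result, and assumes a 64-bit unsigned long):

#include <stdio.h>

/* Same semantics as the kernel's ilog2(): index of the highest set bit. */
static int ilog2_ul(unsigned long v)
{
	int l = -1;

	while (v) {
		v >>= 1;
		l++;
	}
	return l;
}

int main(void)
{
	unsigned long start = 0x100000000UL;	/* 64-bit range from the trace */
	unsigned long last  = 0x3838000fffffUL;
	unsigned long len   = last - start;	/* the driver sums node->last - node->start */
	int log_max = 42;			/* assumed pg_track_log_max_addr_space */

	printf("len = %#lx, ilog2(len) = %d, limit = %d -> %s\n",
	       len, ilog2_ul(len), log_max,
	       ilog2_ul(len) > log_max ? "-EOPNOTSUPP" : "ok");
	return 0;
}

This prints ilog2(len) = 45, so the tracker create path bails out with
-EOPNOTSUPP before the CREATE_GENERAL_OBJECT command is ever issued.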
Jason Gunthorpe Sept. 6, 2023, 11:51 a.m. UTC | #3
On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:

> > +	WARN_ON(node);
> > +	log_addr_space_size = ilog2(total_ranges_len);
> > +	if (log_addr_space_size <
> > +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
> > +	    log_addr_space_size >
> > +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
> > +		err = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> 
> 
> We are seeing an issue with dirty page tracking when doing migration
> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
> device complains when dirty page tracking is initialized from QEMU :
> 
>   qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation not supported)
> 
> The 64-bit computed range is  :
> 
>   vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff], 64:[0x100000000 - 0x3838000fffff]
> 
> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
> bits address space limitation for dirty tracking (min is 12). Is it a
> FW tunable or a strict limitation ?

It would be good to explain where this is coming from; all devices
need to make some decision on what address space ranges to track, and I
would say 2^42 is already a pretty generous limit.

Can we go the other direction and reduce the ranges QEMU is interested
in?

Jason
Joao Martins Sept. 6, 2023, 12:08 p.m. UTC | #4
On 06/09/2023 12:51, Jason Gunthorpe wrote:
> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
> 
>>> +	WARN_ON(node);
>>> +	log_addr_space_size = ilog2(total_ranges_len);
>>> +	if (log_addr_space_size <
>>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>> +	    log_addr_space_size >
>>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>> +		err = -EOPNOTSUPP;
>>> +		goto out;
>>> +	}
>>
>>
>> We are seeing an issue with dirty page tracking when doing migration
>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>> device complains when dirty page tracking is initialized from QEMU :
>>
>>   qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation not supported)
>>
>> The 64-bit computed range is  :
>>
>>   vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff], 64:[0x100000000 - 0x3838000fffff]
>>
>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>> bits address space limitation for dirty tracking (min is 12). Is it a
>> FW tunable or a strict limitation ?
> 
> It would be good to explain where this is coming from, all devices
> need to make some decision on what address space ranges to track and I
> would say 2^42 is already pretty generous limit..
> 
> Can we go the other direction and reduce the ranges qemu is interested
> in?

There's also a chance that these are those 16x-32x socket Intel machines with
48T-64T of memory (judging from the ranges alone), meaning that these ranges,
even if reduced, wouldn't remove much of the aggregate address space width.
Cédric Le Goater Sept. 7, 2023, 9:56 a.m. UTC | #5
On 9/6/23 13:51, Jason Gunthorpe wrote:
> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
> 
>>> +	WARN_ON(node);
>>> +	log_addr_space_size = ilog2(total_ranges_len);
>>> +	if (log_addr_space_size <
>>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>> +	    log_addr_space_size >
>>> +	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>> +		err = -EOPNOTSUPP;
>>> +		goto out;
>>> +	}
>>
>>
>> We are seeing an issue with dirty page tracking when doing migration
>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>> device complains when dirty page tracking is initialized from QEMU :
>>
>>    qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation not supported)
>>
>> The 64-bit computed range is  :
>>
>>    vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff], 64:[0x100000000 - 0x3838000fffff]
>>
>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>> bits address space limitation for dirty tracking (min is 12). Is it a
>> FW tunable or a strict limitation ?
> 
> It would be good to explain where this is coming from, all devices
> need to make some decision on what address space ranges to track and I
> would say 2^42 is already pretty generous limit..


QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
and 64-bit. In the OVMF case, QEMU includes in the 64-bit range both RAM
(at the lower part) and device RAM regions (at the top of the address
space). The size of that range can be bigger than the 2^42 limit of
the MLX5 HW for dirty tracking. QEMU is not making much effort to be
smart. There is room for improvement.


> Can we go the other direction and reduce the ranges qemu is interested
> in?


We cannot exclude the device RAM regions since we don't know what
the HW could do with them (correct me if I'm wrong). But we can certainly
avoid the huge gap in the 64-bit range by adding support for more
than 2 ranges in QEMU.

Then, if the overall size exceeds what HW supports, the vfio-pci
variant driver will report an error, as of today.

Thanks,

C.
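
A minimal sketch of the two-bucket aggregation described above (not the
actual QEMU listener code, just the policy): every section ending above
4 GiB is folded into a single 64-bit min/max pair, so RAM right above
4 GiB and a device region mapped near the top of the physical address
space end up bracketed by one enormous range.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct bucket { uint64_t min, max; };

/* Fold a section [iova, end] into either the 32-bit or the 64-bit bucket. */
static void update(struct bucket *b32, struct bucket *b64,
		   uint64_t iova, uint64_t end)
{
	struct bucket *b = (end <= UINT32_MAX) ? b32 : b64;

	if (b->min > iova)
		b->min = iova;
	if (b->max < end)
		b->max = end;
}

int main(void)
{
	struct bucket b32 = { UINT64_MAX, 0 }, b64 = { UINT64_MAX, 0 };

	update(&b32, &b64, 0x0,            0x807fffff);	/* low RAM */
	update(&b32, &b64, 0x100000000,    0x27fffffff);	/* RAM above 4G */
	update(&b32, &b64, 0x383800000000, 0x3838000fffff);	/* 64-bit PCI window */

	/* The 64-bit bucket now spans ~56 TiB, far above the 2^42 (4 TiB) cap. */
	printf("32:[0x%" PRIx64 " - 0x%" PRIx64 "] 64:[0x%" PRIx64 " - 0x%" PRIx64 "]\n",
	       b32.min, b32.max, b64.min, b64.max);
	return 0;
}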
Joao Martins Sept. 7, 2023, 10:51 a.m. UTC | #6
On 07/09/2023 10:56, Cédric Le Goater wrote:
> On 9/6/23 13:51, Jason Gunthorpe wrote:
>> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
>>
>>>> +    WARN_ON(node);
>>>> +    log_addr_space_size = ilog2(total_ranges_len);
>>>> +    if (log_addr_space_size <
>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>>> +        log_addr_space_size >
>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>>> +        err = -EOPNOTSUPP;
>>>> +        goto out;
>>>> +    }
>>>
>>>
>>> We are seeing an issue with dirty page tracking when doing migration
>>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>>> device complains when dirty page tracking is initialized from QEMU :
>>>
>>>    qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation
>>> not supported)
>>>
>>> The 64-bit computed range is  :
>>>
>>>    vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff],
>>> 64:[0x100000000 - 0x3838000fffff]
>>>
>>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>>> bits address space limitation for dirty tracking (min is 12). Is it a
>>> FW tunable or a strict limitation ?
>>
>> It would be good to explain where this is coming from, all devices
>> need to make some decision on what address space ranges to track and I
>> would say 2^42 is already pretty generous limit..
> 
> 
> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
> and 64-bit. In the OVMF case, QEMU includes in the 64-bit range, RAM
> (at the lower part) and device RAM regions (at the top of the address
> space). The size of that range can be bigger than the 2^42 limit of
> the MLX5 HW for dirty tracking. QEMU is not making much effort to be
> smart. There is room for improvement.
>

Interesting, we haven't reproduced this in our testing with OVMF multi-TB
configs with these VFs. Could you share the OVMF base version you were using?
Or maybe we didn't trigger it because the total device RAM regions would be
small enough to fit in the 32G PCI hole64 that is advertised, which avoids a
hypothetical relocation.

We could do more than 2 ranges (or go back to sharing all ranges), or add
a set of ranges that represents the device RAM without computing a min/max there
(not sure we can figure that out from within the memory listener that does all
this logic); it would perhaps be a bit too BIOS-specific if we start looking at
specific parts of the address space (e.g. phys-bits-1) to compute these ranges.

	Joao
Cédric Le Goater Sept. 7, 2023, 12:16 p.m. UTC | #7
On 9/7/23 12:51, Joao Martins wrote:
> On 07/09/2023 10:56, Cédric Le Goater wrote:
>> On 9/6/23 13:51, Jason Gunthorpe wrote:
>>> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
>>>
>>>>> +    WARN_ON(node);
>>>>> +    log_addr_space_size = ilog2(total_ranges_len);
>>>>> +    if (log_addr_space_size <
>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>>>> +        log_addr_space_size >
>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>>>> +        err = -EOPNOTSUPP;
>>>>> +        goto out;
>>>>> +    }
>>>>
>>>>
>>>> We are seeing an issue with dirty page tracking when doing migration
>>>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>>>> device complains when dirty page tracking is initialized from QEMU :
>>>>
>>>>     qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation
>>>> not supported)
>>>>
>>>> The 64-bit computed range is  :
>>>>
>>>>     vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff],
>>>> 64:[0x100000000 - 0x3838000fffff]
>>>>
>>>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>>>> bits address space limitation for dirty tracking (min is 12). Is it a
>>>> FW tunable or a strict limitation ?
>>>
>>> It would be good to explain where this is coming from, all devices
>>> need to make some decision on what address space ranges to track and I
>>> would say 2^42 is already pretty generous limit..
>>
>>
>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>> and 64-bit. In the OVMF case, QEMU includes in the 64-bit range, RAM
>> (at the lower part) and device RAM regions (at the top of the address
>> space). The size of that range can be bigger than the 2^42 limit of
>> the MLX5 HW for dirty tracking. QEMU is not making much effort to be
>> smart. There is room for improvement.
>>
> 
> Interesting, we haven't reproduced this in our testing with OVMF multi-TB
> configs with these VFs. Could you share the OVMF base version you were using? 

edk2-ovmf-20230524-3.el9.noarch

host is a :
     
     Architecture:            x86_64
       CPU op-mode(s):        32-bit, 64-bit
       Address sizes:         46 bits physical, 57 bits virtual
       Byte Order:            Little Endian
     CPU(s):                  48
       On-line CPU(s) list:   0-47
     Vendor ID:               GenuineIntel
       Model name:            Intel(R) Xeon(R) Silver 4310 CPU @ 2.10GHz


> or
> maybe we didn't triggered it considering the total device RAM regions would be
> small enough to fit the 32G PCI hole64 that is advertised that avoids a
> hypothetical relocation.

You need RAM above 4G in the guest :
     
     100000000-27fffffff : System RAM
       237800000-2387fffff : Kernel code
       238800000-23932cfff : Kernel rodata
       239400000-239977cff : Kernel data
       23a202000-23b3fffff : Kernel bss
     380000000000-3807ffffffff : PCI Bus 0000:00
       380000000000-3800000fffff : 0000:00:03.0
         380000000000-3800000fffff : mlx5_core

Activating the QEMU trace events shows quickly the issue :

     vfio_device_dirty_tracking_update section 0x0 - 0x9ffff -> update [0x0 - 0x9ffff]
     vfio_device_dirty_tracking_update section 0xa0000 - 0xaffff -> update [0x0 - 0xaffff]
     vfio_device_dirty_tracking_update section 0xc0000 - 0xc3fff -> update [0x0 - 0xc3fff]
     vfio_device_dirty_tracking_update section 0xc4000 - 0xdffff -> update [0x0 - 0xdffff]
     vfio_device_dirty_tracking_update section 0xe0000 - 0xfffff -> update [0x0 - 0xfffff]
     vfio_device_dirty_tracking_update section 0x100000 - 0x7fffffff -> update [0x0 - 0x7fffffff]
     vfio_device_dirty_tracking_update section 0x80000000 - 0x807fffff -> update [0x0 - 0x807fffff]
     vfio_device_dirty_tracking_update section 0x100000000 - 0x27fffffff -> update [0x100000000 - 0x27fffffff]
     vfio_device_dirty_tracking_update section 0x383800000000 - 0x383800001fff -> update [0x100000000 - 0x383800001fff]
     vfio_device_dirty_tracking_update section 0x383800003000 - 0x3838000fffff -> update [0x100000000 - 0x3838000fffff]

So that's nice. And with less RAM in the VM, 2G, migration should work though.

> We could use do more than 2 ranges (or going back to sharing all ranges), or add
> a set of ranges that represents the device RAM without computing a min/max there
> (not sure we can figure that out from within the memory listener does all this
> logic); 

The listener is container based. Maybe we could add one range per device
if we can identify a different owner per memory section.


C.

> it would perhaps a bit too BIOS specific if we start looking at specific
> parts of the address space (e.g. phys-bits-1) to compute these ranges.
> 
> 	Joao
Joao Martins Sept. 7, 2023, 4:33 p.m. UTC | #8
On 07/09/2023 13:16, Cédric Le Goater wrote:
> On 9/7/23 12:51, Joao Martins wrote:
>> On 07/09/2023 10:56, Cédric Le Goater wrote:
>>> On 9/6/23 13:51, Jason Gunthorpe wrote:
>>>> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
>>>>
>>>>>> +    WARN_ON(node);
>>>>>> +    log_addr_space_size = ilog2(total_ranges_len);
>>>>>> +    if (log_addr_space_size <
>>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>>>>> +        log_addr_space_size >
>>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>>>>> +        err = -EOPNOTSUPP;
>>>>>> +        goto out;
>>>>>> +    }
>>>>>
>>>>>
>>>>> We are seeing an issue with dirty page tracking when doing migration
>>>>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>>>>> device complains when dirty page tracking is initialized from QEMU :
>>>>>
>>>>>     qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation
>>>>> not supported)
>>>>>
>>>>> The 64-bit computed range is  :
>>>>>
>>>>>     vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff],
>>>>> 64:[0x100000000 - 0x3838000fffff]
>>>>>
>>>>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>>>>> bits address space limitation for dirty tracking (min is 12). Is it a
>>>>> FW tunable or a strict limitation ?
>>>>
>>>> It would be good to explain where this is coming from, all devices
>>>> need to make some decision on what address space ranges to track and I
>>>> would say 2^42 is already pretty generous limit..
>>>
>>>
>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>> and 64-bit. In the OVMF case, QEMU includes in the 64-bit range, RAM
>>> (at the lower part) and device RAM regions (at the top of the address
>>> space). The size of that range can be bigger than the 2^42 limit of
>>> the MLX5 HW for dirty tracking. QEMU is not making much effort to be
>>> smart. There is room for improvement.
>>>
>>
>> Interesting, we haven't reproduced this in our testing with OVMF multi-TB
>> configs with these VFs. Could you share the OVMF base version you were using? 
> 
> edk2-ovmf-20230524-3.el9.noarch
> 
> host is a :
>         Architecture:            x86_64
>       CPU op-mode(s):        32-bit, 64-bit
>       Address sizes:         46 bits physical, 57 bits virtual
>       Byte Order:            Little Endian
>     CPU(s):                  48
>       On-line CPU(s) list:   0-47
>     Vendor ID:               GenuineIntel
>       Model name:            Intel(R) Xeon(R) Silver 4310 CPU @ 2.10GHz
> 
> 
>> or
>> maybe we didn't triggered it considering the total device RAM regions would be
>> small enough to fit the 32G PCI hole64 that is advertised that avoids a
>> hypothetical relocation.
> 
> You need RAM above 4G in the guest :
>         100000000-27fffffff : System RAM
>       237800000-2387fffff : Kernel code
>       238800000-23932cfff : Kernel rodata
>       239400000-239977cff : Kernel data
>       23a202000-23b3fffff : Kernel bss
>     380000000000-3807ffffffff : PCI Bus 0000:00
>       380000000000-3800000fffff : 0000:00:03.0
>         380000000000-3800000fffff : mlx5_core

Similar machine to yours, but in my 32G guests with older OVMF it's putting the
PCI area after max-ram:

vfio_device_dirty_tracking_update section 0x0 - 0x9ffff -> update [0x0 - 0x9ffff]
vfio_device_dirty_tracking_update section 0xc0000 - 0xcafff -> update [0x0 - 0xcafff]
vfio_device_dirty_tracking_update section 0xcb000 - 0xcdfff -> update [0x0 - 0xcdfff]
vfio_device_dirty_tracking_update section 0xce000 - 0xe7fff -> update [0x0 - 0xe7fff]
vfio_device_dirty_tracking_update section 0xe8000 - 0xeffff -> update [0x0 - 0xeffff]
vfio_device_dirty_tracking_update section 0xf0000 - 0xfffff -> update [0x0 - 0xfffff]
vfio_device_dirty_tracking_update section 0x100000 - 0x7fffffff -> update [0x0 - 0x7fffffff]
vfio_device_dirty_tracking_update section 0xfd000000 - 0xfdffffff -> update [0x0 - 0xfdffffff]
vfio_device_dirty_tracking_update section 0xfffc0000 - 0xffffffff -> update [0x0 - 0xffffffff]
vfio_device_dirty_tracking_update section 0x100000000 - 0x87fffffff -> update [0x100000000 - 0x87fffffff]
vfio_device_dirty_tracking_update section 0x880000000 - 0x880001fff -> update [0x100000000 - 0x880001fff]
vfio_device_dirty_tracking_update section 0x880003000 - 0x8ffffffff -> update [0x100000000 - 0x8ffffffff]


> 
> Activating the QEMU trace events shows quickly the issue :
> 
>     vfio_device_dirty_tracking_update section 0x0 - 0x9ffff -> update [0x0 - 0x9ffff]
>     vfio_device_dirty_tracking_update section 0xa0000 - 0xaffff -> update [0x0 - 0xaffff]
>     vfio_device_dirty_tracking_update section 0xc0000 - 0xc3fff -> update [0x0 - 0xc3fff]
>     vfio_device_dirty_tracking_update section 0xc4000 - 0xdffff -> update [0x0 - 0xdffff]
>     vfio_device_dirty_tracking_update section 0xe0000 - 0xfffff -> update [0x0 - 0xfffff]
>     vfio_device_dirty_tracking_update section 0x100000 - 0x7fffffff -> update [0x0 - 0x7fffffff]
>     vfio_device_dirty_tracking_update section 0x80000000 - 0x807fffff -> update [0x0 - 0x807fffff]
>     vfio_device_dirty_tracking_update section 0x100000000 - 0x27fffffff -> update [0x100000000 - 0x27fffffff]
>     vfio_device_dirty_tracking_update section 0x383800000000 - 0x383800001fff -> update [0x100000000 - 0x383800001fff]
>     vfio_device_dirty_tracking_update section 0x383800003000 - 0x3838000fffff -> update [0x100000000 - 0x3838000fffff]
> 
> So that's nice. And with less RAM in the VM, 2G, migration should work though.
> 
>> We could use do more than 2 ranges (or going back to sharing all ranges), or add
>> a set of ranges that represents the device RAM without computing a min/max there
>> (not sure we can figure that out from within the memory listener does all this
>> logic); 
> 
> The listener is container based. May we could add one range per device
> if we can identify a different owner per memory section.
> 

For brainstorming purposes ... maybe something like this below. It should make
your case work. As mentioned earlier, in my case the PCI window is always placed
at maxram+1, so having a separate "pci" range makes no difference there.

------>8-------
From: Joao Martins <joao.m.martins@oracle.com>
Date: Thu, 7 Sep 2023 09:23:38 -0700
Subject: [PATCH] vfio/common: Separate vfio-pci ranges

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/vfio/common.c     | 48 ++++++++++++++++++++++++++++++++++++++++----
 hw/vfio/trace-events |  2 +-
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f8b20aacc07c..f0b36a98c89a 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -27,6 +27,7 @@

 #include "hw/vfio/vfio-common.h"
 #include "hw/vfio/vfio.h"
+#include "hw/vfio/pci.h"
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
@@ -1424,6 +1425,8 @@ typedef struct VFIODirtyRanges {
     hwaddr max32;
     hwaddr min64;
     hwaddr max64;
+    hwaddr minpci;
+    hwaddr maxpci;
 } VFIODirtyRanges;

 typedef struct VFIODirtyRangesListener {
@@ -1432,6 +1435,31 @@ typedef struct VFIODirtyRangesListener {
     MemoryListener listener;
 } VFIODirtyRangesListener;

+static bool vfio_section_is_vfio_pci(MemoryRegionSection *section,
+                                     VFIOContainer *container)
+{
+    VFIOPCIDevice *pcidev;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    Object *owner;
+
+    owner = memory_region_owner(section->mr);
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
+                continue;
+            }
+            pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+            if (OBJECT(pcidev) == owner) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static void vfio_dirty_tracking_update(MemoryListener *listener,
                                        MemoryRegionSection *section)
 {
@@ -1458,8 +1486,13 @@ static void vfio_dirty_tracking_update(MemoryListener *listener,
      * would be an IOVATree but that has a much bigger runtime overhead and
      * unnecessary complexity.
      */
-    min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
-    max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+    if (!vfio_section_is_vfio_pci(section, dirty->container)) {
+        min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
+        max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
+    } else {
+        min = &range->minpci;
+        max = &range->maxpci;
+    }

     if (*min > iova) {
         *min = iova;
@@ -1485,6 +1518,7 @@ static void vfio_dirty_tracking_init(VFIOContainer *container,
     memset(&dirty, 0, sizeof(dirty));
     dirty.ranges.min32 = UINT32_MAX;
     dirty.ranges.min64 = UINT64_MAX;
+    dirty.ranges.minpci = UINT64_MAX;
     dirty.listener = vfio_dirty_tracking_listener;
     dirty.container = container;

@@ -1555,7 +1589,7 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
      * DMA logging uAPI guarantees to support at least a number of ranges that
      * fits into a single host kernel base page.
      */
-    control->num_ranges = !!tracking->max32 + !!tracking->max64;
+    control->num_ranges = !!tracking->max32 + !!tracking->max64 + !!tracking->maxpci;
     ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
                         control->num_ranges);
     if (!ranges) {
@@ -1574,11 +1608,17 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
     if (tracking->max64) {
         ranges->iova = tracking->min64;
         ranges->length = (tracking->max64 - tracking->min64) + 1;
+        ranges++;
+    }
+    if (tracking->maxpci) {
+        ranges->iova = tracking->minpci;
+        ranges->length = (tracking->maxpci - tracking->minpci) + 1;
     }

     trace_vfio_device_dirty_tracking_start(control->num_ranges,
                                            tracking->min32, tracking->max32,
-                                           tracking->min64, tracking->max64);
+                                           tracking->min64, tracking->max64,
+                                           tracking->minpci, tracking->maxpci);

     return feature;
 }
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 444c15be47ee..ee5a44893334 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,7 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
-vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
+vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64, uint64_t minpci, uint64_t maxpci) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"], pci:[0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
--
2.39.3
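
As a rough sanity check of this split, using the ranges from the earlier
trace (illustrative arithmetic only): with the PCI window reported as its
own range, the three ranges sum to about 8 GiB, so the driver-side ilog2()
lands around 33, comfortably below the 2^42 limit.

#include <stdio.h>

int main(void)
{
	/* Lengths of the three ranges QEMU would report with the patch above,
	 * taken from the trace earlier in the thread. */
	unsigned long len = (0x807fffffUL - 0x0UL) +
			    (0x27fffffffUL - 0x100000000UL) +
			    (0x3838000fffffUL - 0x383800000000UL);

	printf("total %#lx bytes (~%lu GiB), under 2^42: %s\n",
	       len, len >> 30, len < (1UL << 42) ? "yes" : "no");
	return 0;
}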
Cédric Le Goater Sept. 7, 2023, 5:34 p.m. UTC | #9
On 9/7/23 18:33, Joao Martins wrote:
> On 07/09/2023 13:16, Cédric Le Goater wrote:
>> On 9/7/23 12:51, Joao Martins wrote:
>>> On 07/09/2023 10:56, Cédric Le Goater wrote:
>>>> On 9/6/23 13:51, Jason Gunthorpe wrote:
>>>>> On Wed, Sep 06, 2023 at 10:55:26AM +0200, Cédric Le Goater wrote:
>>>>>
>>>>>>> +    WARN_ON(node);
>>>>>>> +    log_addr_space_size = ilog2(total_ranges_len);
>>>>>>> +    if (log_addr_space_size <
>>>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
>>>>>>> +        log_addr_space_size >
>>>>>>> +        (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
>>>>>>> +        err = -EOPNOTSUPP;
>>>>>>> +        goto out;
>>>>>>> +    }
>>>>>>
>>>>>>
>>>>>> We are seeing an issue with dirty page tracking when doing migration
>>>>>> of an OVMF VM guest. The vfio-pci variant driver for the MLX5 VF
>>>>>> device complains when dirty page tracking is initialized from QEMU :
>>>>>>
>>>>>>      qemu-kvm: 0000:b1:00.2: Failed to start DMA logging, err -95 (Operation
>>>>>> not supported)
>>>>>>
>>>>>> The 64-bit computed range is  :
>>>>>>
>>>>>>      vfio_device_dirty_tracking_start nr_ranges 2 32:[0x0 - 0x807fffff],
>>>>>> 64:[0x100000000 - 0x3838000fffff]
>>>>>>
>>>>>> which seems to be too large for the HW. AFAICT, the MLX5 HW has a 42
>>>>>> bits address space limitation for dirty tracking (min is 12). Is it a
>>>>>> FW tunable or a strict limitation ?
>>>>>
>>>>> It would be good to explain where this is coming from, all devices
>>>>> need to make some decision on what address space ranges to track and I
>>>>> would say 2^42 is already pretty generous limit..
>>>>
>>>>
>>>> QEMU computes the DMA logging ranges for two predefined ranges: 32-bit
>>>> and 64-bit. In the OVMF case, QEMU includes in the 64-bit range, RAM
>>>> (at the lower part) and device RAM regions (at the top of the address
>>>> space). The size of that range can be bigger than the 2^42 limit of
>>>> the MLX5 HW for dirty tracking. QEMU is not making much effort to be
>>>> smart. There is room for improvement.
>>>>
>>>
>>> Interesting, we haven't reproduced this in our testing with OVMF multi-TB
>>> configs with these VFs. Could you share the OVMF base version you were using?
>>
>> edk2-ovmf-20230524-3.el9.noarch
>>
>> host is a :
>>          Architecture:            x86_64
>>        CPU op-mode(s):        32-bit, 64-bit
>>        Address sizes:         46 bits physical, 57 bits virtual
>>        Byte Order:            Little Endian
>>      CPU(s):                  48
>>        On-line CPU(s) list:   0-47
>>      Vendor ID:               GenuineIntel
>>        Model name:            Intel(R) Xeon(R) Silver 4310 CPU @ 2.10GHz
>>
>>
>>> or
>>> maybe we didn't triggered it considering the total device RAM regions would be
>>> small enough to fit the 32G PCI hole64 that is advertised that avoids a
>>> hypothetical relocation.
>>
>> You need RAM above 4G in the guest :
>>          100000000-27fffffff : System RAM
>>        237800000-2387fffff : Kernel code
>>        238800000-23932cfff : Kernel rodata
>>        239400000-239977cff : Kernel data
>>        23a202000-23b3fffff : Kernel bss
>>      380000000000-3807ffffffff : PCI Bus 0000:00
>>        380000000000-3800000fffff : 0000:00:03.0
>>          380000000000-3800000fffff : mlx5_core
> 
> Similar machine to yours, but in my 32G guests with older OVMF it's putting the
> PCI area after max-ram:
> 
> vfio_device_dirty_tracking_update section 0x0 - 0x9ffff -> update [0x0 - 0x9ffff]
> vfio_device_dirty_tracking_update section 0xc0000 - 0xcafff -> update [0x0 - 0xcafff]
> vfio_device_dirty_tracking_update section 0xcb000 - 0xcdfff -> update [0x0 - 0xcdfff]
> vfio_device_dirty_tracking_update section 0xce000 - 0xe7fff -> update [0x0 - 0xe7fff]
> vfio_device_dirty_tracking_update section 0xe8000 - 0xeffff -> update [0x0 - 0xeffff]
> vfio_device_dirty_tracking_update section 0xf0000 - 0xfffff -> update [0x0 - 0xfffff]
> vfio_device_dirty_tracking_update section 0x100000 - 0x7fffffff -> update [0x0 - 0x7fffffff]
> vfio_device_dirty_tracking_update section 0xfd000000 - 0xfdffffff -> update [0x0 - 0xfdffffff]
> vfio_device_dirty_tracking_update section 0xfffc0000 - 0xffffffff -> update [0x0 - 0xffffffff]
> vfio_device_dirty_tracking_update section 0x100000000 - 0x87fffffff -> update [0x100000000 - 0x87fffffff]
> vfio_device_dirty_tracking_update section 0x880000000 - 0x880001fff -> update [0x100000000 - 0x880001fff]
> vfio_device_dirty_tracking_update section 0x880003000 - 0x8ffffffff -> update [0x100000000 - 0x8ffffffff]
> 

and so the range is smaller.

The latest version of OVMF enables the dynamic MMIO window, which causes the issue.

>>
>> Activating the QEMU trace events shows quickly the issue :
>>
>>      vfio_device_dirty_tracking_update section 0x0 - 0x9ffff -> update [0x0 - 0x9ffff]
>>      vfio_device_dirty_tracking_update section 0xa0000 - 0xaffff -> update [0x0 - 0xaffff]
>>      vfio_device_dirty_tracking_update section 0xc0000 - 0xc3fff -> update [0x0 - 0xc3fff]
>>      vfio_device_dirty_tracking_update section 0xc4000 - 0xdffff -> update [0x0 - 0xdffff]
>>      vfio_device_dirty_tracking_update section 0xe0000 - 0xfffff -> update [0x0 - 0xfffff]
>>      vfio_device_dirty_tracking_update section 0x100000 - 0x7fffffff -> update [0x0 - 0x7fffffff]
>>      vfio_device_dirty_tracking_update section 0x80000000 - 0x807fffff -> update [0x0 - 0x807fffff]
>>      vfio_device_dirty_tracking_update section 0x100000000 - 0x27fffffff -> update [0x100000000 - 0x27fffffff]
>>      vfio_device_dirty_tracking_update section 0x383800000000 - 0x383800001fff -> update [0x100000000 - 0x383800001fff]
>>      vfio_device_dirty_tracking_update section 0x383800003000 - 0x3838000fffff -> update [0x100000000 - 0x3838000fffff]
>>
>> So that's nice. And with less RAM in the VM, 2G, migration should work though.
>>
>>> We could use do more than 2 ranges (or going back to sharing all ranges), or add
>>> a set of ranges that represents the device RAM without computing a min/max there
>>> (not sure we can figure that out from within the memory listener does all this
>>> logic);
>>
>> The listener is container based. May we could add one range per device
>> if we can identify a different owner per memory section.
>>
> 
> For brainstorm purposes ... Maybe something like this below. Should make your
> case work. As mentioned earlier in my case it's placed always at maxram+1, so
> makes no difference in having the "pci" range

yes.

That said, it looks good and it does handle the useless gap in the 64-bit range.
Thanks for doing it. We should explore the non-OVMF case with this patch as well.

C.


Patch

diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 0a362796d567..f1cad96af6ab 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -410,6 +410,148 @@  int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
 	return err;
 }
 
+static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
+			   u32 req_nodes)
+{
+	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
+	unsigned long min_gap;
+	unsigned long curr_gap;
+
+	/* Special shortcut when a single range is required */
+	if (req_nodes == 1) {
+		unsigned long last;
+
+		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
+		while (curr) {
+			last = curr->last;
+			prev = curr;
+			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
+			if (prev != comb_start)
+				interval_tree_remove(prev, root);
+		}
+		comb_start->last = last;
+		return;
+	}
+
+	/* Combine ranges which have the smallest gap */
+	while (cur_nodes > req_nodes) {
+		prev = NULL;
+		min_gap = ULONG_MAX;
+		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
+		while (curr) {
+			if (prev) {
+				curr_gap = curr->start - prev->last;
+				if (curr_gap < min_gap) {
+					min_gap = curr_gap;
+					comb_start = prev;
+					comb_end = curr;
+				}
+			}
+			prev = curr;
+			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
+		}
+		comb_start->last = comb_end->last;
+		interval_tree_remove(comb_end, root);
+		cur_nodes--;
+	}
+}
+
+static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
+				 struct mlx5vf_pci_core_device *mvdev,
+				 struct rb_root_cached *ranges, u32 nnodes)
+{
+	int max_num_range =
+		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
+	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
+	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+	struct interval_tree_node *node = NULL;
+	u64 total_ranges_len = 0;
+	u32 num_ranges = nnodes;
+	u8 log_addr_space_size;
+	void *range_list_ptr;
+	void *obj_context;
+	void *cmd_hdr;
+	int inlen;
+	void *in;
+	int err;
+	int i;
+
+	if (num_ranges > max_num_range) {
+		combine_ranges(ranges, nnodes, max_num_range);
+		num_ranges = max_num_range;
+	}
+
+	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
+				 record_size * num_ranges;
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
+			       general_obj_in_cmd_hdr);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
+		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
+		 MLX5_OBJ_TYPE_PAGE_TRACK);
+	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
+	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
+	MLX5_SET(page_track, obj_context, track_type, 1);
+	MLX5_SET(page_track, obj_context, log_page_size,
+		 ilog2(tracker->host_qp->tracked_page_size));
+	MLX5_SET(page_track, obj_context, log_msg_size,
+		 ilog2(tracker->host_qp->max_msg_size));
+	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
+	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
+
+	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
+	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+	for (i = 0; i < num_ranges; i++) {
+		void *addr_range_i_base = range_list_ptr + record_size * i;
+		unsigned long length = node->last - node->start;
+
+		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
+			   node->start);
+		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
+		total_ranges_len += length;
+		node = interval_tree_iter_next(node, 0, ULONG_MAX);
+	}
+
+	WARN_ON(node);
+	log_addr_space_size = ilog2(total_ranges_len);
+	if (log_addr_space_size <
+	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
+	    log_addr_space_size >
+	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	MLX5_SET(page_track, obj_context, log_addr_space_size,
+		 log_addr_space_size);
+	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+	if (err)
+		goto out;
+
+	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+out:
+	kfree(in);
+	return err;
+}
+
+static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
+				      u32 tracker_id)
+{
+	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+
+	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
+	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
 static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
 			     struct mlx5_vhca_cq_buf *buf, int nent,
 			     int cqe_size)
@@ -833,6 +975,7 @@  _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
 
 	WARN_ON(mvdev->mdev_detach);
 
+	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
 	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
 	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
 	mlx5vf_destroy_qp(mdev, tracker->host_qp);
@@ -941,6 +1084,10 @@  int mlx5vf_start_page_tracker(struct vfio_device *vdev,
 
 	tracker->host_qp = host_qp;
 	tracker->fw_qp = fw_qp;
+	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
+	if (err)
+		goto err_activate;
+
 	*page_size = host_qp->tracked_page_size;
 	mvdev->log_active = true;
 	mlx5vf_state_mutex_unlock(mvdev);
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index e71ec017bf04..658925ba5459 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -80,6 +80,7 @@  struct mlx5_vhca_qp {
 };
 
 struct mlx5_vhca_page_tracker {
+	u32 id;
 	u32 pdn;
 	struct mlx5_uars_page *uar;
 	struct mlx5_vhca_cq cq;