Message ID | 419f3ba2f732154d8ae079b3deb02d0fdbe3e258.1680038771.git.reinette.chatre@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | vfio/pci: Support dynamic allocation of MSI-X interrupts | expand |
Hi Reinette, I love your patch! Yet something to improve: [auto build test ERROR on 197b6b60ae7bc51dd0814953c562833143b292aa] url: https://github.com/intel-lab-lkp/linux/commits/Reinette-Chatre/vfio-pci-Consolidate-irq-cleanup-on-MSI-MSI-X-disable/20230329-055735 base: 197b6b60ae7bc51dd0814953c562833143b292aa patch link: https://lore.kernel.org/r/419f3ba2f732154d8ae079b3deb02d0fdbe3e258.1680038771.git.reinette.chatre%40intel.com patch subject: [PATCH V2 7/8] vfio/pci: Support dynamic MSI-x config: i386-randconfig-a001-20230327 (https://download.01.org/0day-ci/archive/20230329/202303291000.PWFqGCxH-lkp@intel.com/config) compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/420198d6ba9227a0ef81a2192ca35019fa6439cf git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Reinette-Chatre/vfio-pci-Consolidate-irq-cleanup-on-MSI-MSI-X-disable/20230329-055735 git checkout 420198d6ba9227a0ef81a2192ca35019fa6439cf # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=i386 olddefconfig make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/vfio/pci/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202303291000.PWFqGCxH-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/vfio/pci/vfio_pci_intrs.c: In function 'vfio_msi_set_vector_signal': >> drivers/vfio/pci/vfio_pci_intrs.c:427:21: error: implicit declaration of function 'pci_msix_can_alloc_dyn' [-Werror=implicit-function-declaration] 427 | if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) | ^~~~~~~~~~~~~~~~~~~~~~ cc1: some warnings being treated as errors vim +/pci_msix_can_alloc_dyn +427 drivers/vfio/pci/vfio_pci_intrs.c 413 414 static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, 415 unsigned int vector, int fd, bool msix) 416 { 417 struct pci_dev *pdev = vdev->pdev; 418 struct vfio_pci_irq_ctx *ctx; 419 struct msi_map msix_map = {}; 420 bool allow_dyn_alloc = false; 421 struct eventfd_ctx *trigger; 422 bool new_ctx = false; 423 int irq, ret; 424 u16 cmd; 425 426 /* Only MSI-X allows dynamic allocation. */ > 427 if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) 428 allow_dyn_alloc = true; 429 430 ctx = vfio_irq_ctx_get(vdev, vector); 431 if (!ctx && !allow_dyn_alloc) 432 return -EINVAL; 433 434 irq = pci_irq_vector(pdev, vector); 435 /* Context and interrupt are always allocated together. */ 436 WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); 437 438 if (ctx && ctx->trigger) { 439 irq_bypass_unregister_producer(&ctx->producer); 440 441 cmd = vfio_pci_memory_lock_and_enable(vdev); 442 free_irq(irq, ctx->trigger); 443 if (allow_dyn_alloc) { 444 msix_map.index = vector; 445 msix_map.virq = irq; 446 pci_msix_free_irq(pdev, msix_map); 447 irq = -EINVAL; 448 } 449 vfio_pci_memory_unlock_and_restore(vdev, cmd); 450 kfree(ctx->name); 451 eventfd_ctx_put(ctx->trigger); 452 ctx->trigger = NULL; 453 if (allow_dyn_alloc) { 454 vfio_irq_ctx_free(vdev, ctx, vector); 455 ctx = NULL; 456 } 457 } 458 459 if (fd < 0) 460 return 0; 461 462 if (!ctx) { 463 ctx = vfio_irq_ctx_alloc_single(vdev, vector); 464 if (!ctx) 465 return -ENOMEM; 466 new_ctx = true; 467 } 468 469 ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", 470 msix ? "x" : "", vector, pci_name(pdev)); 471 if (!ctx->name) { 472 ret = -ENOMEM; 473 goto out_free_ctx; 474 } 475 476 trigger = eventfd_ctx_fdget(fd); 477 if (IS_ERR(trigger)) { 478 ret = PTR_ERR(trigger); 479 goto out_free_name; 480 } 481 482 cmd = vfio_pci_memory_lock_and_enable(vdev); 483 if (msix) { 484 if (irq == -EINVAL) { 485 msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); 486 if (msix_map.index < 0) { 487 vfio_pci_memory_unlock_and_restore(vdev, cmd); 488 ret = msix_map.index; 489 goto out_put_eventfd_ctx; 490 } 491 irq = msix_map.virq; 492 } else { 493 /* 494 * The MSIx vector table resides in device memory which 495 * may be cleared via backdoor resets. We don't allow 496 * direct access to the vector table so even if a 497 * userspace driver attempts to save/restore around 498 * such a reset it would be unsuccessful. To avoid 499 * this, restore the cached value of the message prior 500 * to enabling. 501 */ 502 struct msi_msg msg; 503 504 get_cached_msi_msg(irq, &msg); 505 pci_write_msi_msg(irq, &msg); 506 } 507 } 508 509 ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); 510 if (ret) 511 goto out_free_irq_locked; 512 513 vfio_pci_memory_unlock_and_restore(vdev, cmd); 514 515 ctx->producer.token = trigger; 516 ctx->producer.irq = irq; 517 ret = irq_bypass_register_producer(&ctx->producer); 518 if (unlikely(ret)) { 519 dev_info(&pdev->dev, 520 "irq bypass producer (token %p) registration fails: %d\n", 521 ctx->producer.token, ret); 522 523 ctx->producer.token = NULL; 524 } 525 ctx->trigger = trigger; 526 527 return 0; 528 529 out_free_irq_locked: 530 if (allow_dyn_alloc && new_ctx) { 531 msix_map.index = vector; 532 msix_map.virq = irq; 533 pci_msix_free_irq(pdev, msix_map); 534 } 535 vfio_pci_memory_unlock_and_restore(vdev, cmd); 536 out_put_eventfd_ctx: 537 eventfd_ctx_put(trigger); 538 out_free_name: 539 kfree(ctx->name); 540 ctx->name = NULL; 541 out_free_ctx: 542 if (allow_dyn_alloc && new_ctx) 543 vfio_irq_ctx_free(vdev, ctx, vector); 544 return ret; 545 } 546
Hi Reinette, I love your patch! Yet something to improve: [auto build test ERROR on 197b6b60ae7bc51dd0814953c562833143b292aa] url: https://github.com/intel-lab-lkp/linux/commits/Reinette-Chatre/vfio-pci-Consolidate-irq-cleanup-on-MSI-MSI-X-disable/20230329-055735 base: 197b6b60ae7bc51dd0814953c562833143b292aa patch link: https://lore.kernel.org/r/419f3ba2f732154d8ae079b3deb02d0fdbe3e258.1680038771.git.reinette.chatre%40intel.com patch subject: [PATCH V2 7/8] vfio/pci: Support dynamic MSI-x config: i386-randconfig-a014-20230327 (https://download.01.org/0day-ci/archive/20230329/202303291023.2Kc7CcsV-lkp@intel.com/config) compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/420198d6ba9227a0ef81a2192ca35019fa6439cf git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Reinette-Chatre/vfio-pci-Consolidate-irq-cleanup-on-MSI-MSI-X-disable/20230329-055735 git checkout 420198d6ba9227a0ef81a2192ca35019fa6439cf # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/vfio/pci/ If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202303291023.2Kc7CcsV-lkp@intel.com/ All errors (new ones prefixed by >>): >> drivers/vfio/pci/vfio_pci_intrs.c:427:14: error: implicit declaration of function 'pci_msix_can_alloc_dyn' is invalid in C99 [-Werror,-Wimplicit-function-declaration] if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) ^ 1 error generated. vim +/pci_msix_can_alloc_dyn +427 drivers/vfio/pci/vfio_pci_intrs.c 413 414 static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, 415 unsigned int vector, int fd, bool msix) 416 { 417 struct pci_dev *pdev = vdev->pdev; 418 struct vfio_pci_irq_ctx *ctx; 419 struct msi_map msix_map = {}; 420 bool allow_dyn_alloc = false; 421 struct eventfd_ctx *trigger; 422 bool new_ctx = false; 423 int irq, ret; 424 u16 cmd; 425 426 /* Only MSI-X allows dynamic allocation. */ > 427 if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) 428 allow_dyn_alloc = true; 429 430 ctx = vfio_irq_ctx_get(vdev, vector); 431 if (!ctx && !allow_dyn_alloc) 432 return -EINVAL; 433 434 irq = pci_irq_vector(pdev, vector); 435 /* Context and interrupt are always allocated together. */ 436 WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); 437 438 if (ctx && ctx->trigger) { 439 irq_bypass_unregister_producer(&ctx->producer); 440 441 cmd = vfio_pci_memory_lock_and_enable(vdev); 442 free_irq(irq, ctx->trigger); 443 if (allow_dyn_alloc) { 444 msix_map.index = vector; 445 msix_map.virq = irq; 446 pci_msix_free_irq(pdev, msix_map); 447 irq = -EINVAL; 448 } 449 vfio_pci_memory_unlock_and_restore(vdev, cmd); 450 kfree(ctx->name); 451 eventfd_ctx_put(ctx->trigger); 452 ctx->trigger = NULL; 453 if (allow_dyn_alloc) { 454 vfio_irq_ctx_free(vdev, ctx, vector); 455 ctx = NULL; 456 } 457 } 458 459 if (fd < 0) 460 return 0; 461 462 if (!ctx) { 463 ctx = vfio_irq_ctx_alloc_single(vdev, vector); 464 if (!ctx) 465 return -ENOMEM; 466 new_ctx = true; 467 } 468 469 ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", 470 msix ? "x" : "", vector, pci_name(pdev)); 471 if (!ctx->name) { 472 ret = -ENOMEM; 473 goto out_free_ctx; 474 } 475 476 trigger = eventfd_ctx_fdget(fd); 477 if (IS_ERR(trigger)) { 478 ret = PTR_ERR(trigger); 479 goto out_free_name; 480 } 481 482 cmd = vfio_pci_memory_lock_and_enable(vdev); 483 if (msix) { 484 if (irq == -EINVAL) { 485 msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); 486 if (msix_map.index < 0) { 487 vfio_pci_memory_unlock_and_restore(vdev, cmd); 488 ret = msix_map.index; 489 goto out_put_eventfd_ctx; 490 } 491 irq = msix_map.virq; 492 } else { 493 /* 494 * The MSIx vector table resides in device memory which 495 * may be cleared via backdoor resets. We don't allow 496 * direct access to the vector table so even if a 497 * userspace driver attempts to save/restore around 498 * such a reset it would be unsuccessful. To avoid 499 * this, restore the cached value of the message prior 500 * to enabling. 501 */ 502 struct msi_msg msg; 503 504 get_cached_msi_msg(irq, &msg); 505 pci_write_msi_msg(irq, &msg); 506 } 507 } 508 509 ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); 510 if (ret) 511 goto out_free_irq_locked; 512 513 vfio_pci_memory_unlock_and_restore(vdev, cmd); 514 515 ctx->producer.token = trigger; 516 ctx->producer.irq = irq; 517 ret = irq_bypass_register_producer(&ctx->producer); 518 if (unlikely(ret)) { 519 dev_info(&pdev->dev, 520 "irq bypass producer (token %p) registration fails: %d\n", 521 ctx->producer.token, ret); 522 523 ctx->producer.token = NULL; 524 } 525 ctx->trigger = trigger; 526 527 return 0; 528 529 out_free_irq_locked: 530 if (allow_dyn_alloc && new_ctx) { 531 msix_map.index = vector; 532 msix_map.virq = irq; 533 pci_msix_free_irq(pdev, msix_map); 534 } 535 vfio_pci_memory_unlock_and_restore(vdev, cmd); 536 out_put_eventfd_ctx: 537 eventfd_ctx_put(trigger); 538 out_free_name: 539 kfree(ctx->name); 540 ctx->name = NULL; 541 out_free_ctx: 542 if (allow_dyn_alloc && new_ctx) 543 vfio_irq_ctx_free(vdev, ctx, vector); 544 return ret; 545 } 546
Thank you very much for the report. On 3/28/2023 7:48 PM, kernel test robot wrote: > config: i386-randconfig-a001-20230327 (https://download.01.org/0day-ci/archive/20230329/202303291000.PWFqGCxH-lkp@intel.com/config) This config (and all the other configs in the reports about implicit declaration of function 'pci_msix_can_alloc_dyn') has: # CONFIG_PCI_MSI is not set Resulting in: > drivers/vfio/pci/vfio_pci_intrs.c: In function 'vfio_msi_set_vector_signal': >>> drivers/vfio/pci/vfio_pci_intrs.c:427:21: error: implicit declaration of function 'pci_msix_can_alloc_dyn' [-Werror=implicit-function-declaration] > 427 | if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > | ^~~~~~~~~~~~~~~~~~~~~~ > cc1: some warnings being treated as errors > because include/linux/pci.h is missing a stub for pci_msix_can_alloc_dyn() when CONFIG_PCI_MSI is not set. I'll send a separate fix for that. Reinette
On 3/29/2023 7:42 AM, Reinette Chatre wrote:
> I'll send a separate fix for that.
https://lore.kernel.org/lkml/310ecc4815dae4174031062f525245f0755c70e2.1680119924.git.reinette.chatre@intel.com/
Reinette
On Tue, 28 Mar 2023 14:53:34 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Recently introduced pci_msix_alloc_irq_at() and pci_msix_free_irq() > enables an individual MSI-X index to be allocated and freed after > MSI-X enabling. > > Support dynamic MSI-X if supported by the device. Keep the association > between allocated interrupt and vfio interrupt context. Allocate new > context together with the new interrupt if no interrupt context exist > for an MSI-X interrupt. Similarly, release an interrupt with its > context. > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > --- > Changes since RFC V1: > - Add pointer to interrupt context as function parameter to > vfio_irq_ctx_free(). (Alex) > - Initialize new_ctx to false. (Dan Carpenter) > - Only support dynamic allocation if device supports it. (Alex) > > drivers/vfio/pci/vfio_pci_intrs.c | 93 +++++++++++++++++++++++++------ > 1 file changed, 76 insertions(+), 17 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > index b3a258e58625..755b752ca17e 100644 > --- a/drivers/vfio/pci/vfio_pci_intrs.c > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > @@ -55,6 +55,13 @@ struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, > return xa_load(&vdev->ctx, index); > } > > +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, > + struct vfio_pci_irq_ctx *ctx, unsigned long index) > +{ > + xa_erase(&vdev->ctx, index); > + kfree(ctx); > +} > + > static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) > { > struct vfio_pci_irq_ctx *ctx; > @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > { > struct pci_dev *pdev = vdev->pdev; > struct vfio_pci_irq_ctx *ctx; > + struct msi_map msix_map = {}; > + bool allow_dyn_alloc = false; > struct eventfd_ctx *trigger; > + bool new_ctx = false; > int irq, ret; > u16 cmd; > > + /* Only MSI-X allows dynamic allocation. */ > + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > + allow_dyn_alloc = true; Should vfio-pci-core probe this and store it in a field on vfio_pci_core_device so that we can simply use something like vdev->has_dyn_msix throughout? > + > ctx = vfio_irq_ctx_get(vdev, vector); > - if (!ctx) > + if (!ctx && !allow_dyn_alloc) > return -EINVAL; > + > irq = pci_irq_vector(pdev, vector); > + /* Context and interrupt are always allocated together. */ > + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > > - if (ctx->trigger) { > + if (ctx && ctx->trigger) { > irq_bypass_unregister_producer(&ctx->producer); > > cmd = vfio_pci_memory_lock_and_enable(vdev); > free_irq(irq, ctx->trigger); > + if (allow_dyn_alloc) { It almost seems easier to define msix_map in each scope that it's used: struct msi_map map = { .index = vector, .virq = irq }; > + msix_map.index = vector; > + msix_map.virq = irq; > + pci_msix_free_irq(pdev, msix_map); > + irq = -EINVAL; > + } > vfio_pci_memory_unlock_and_restore(vdev, cmd); > kfree(ctx->name); > eventfd_ctx_put(ctx->trigger); > ctx->trigger = NULL; > + if (allow_dyn_alloc) { > + vfio_irq_ctx_free(vdev, ctx, vector); > + ctx = NULL; > + } > } > > if (fd < 0) > return 0; > > + if (!ctx) { > + ctx = vfio_irq_ctx_alloc_single(vdev, vector); > + if (!ctx) > + return -ENOMEM; > + new_ctx = true; > + } > + > ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", > msix ? "x" : "", vector, pci_name(pdev)); > - if (!ctx->name) > - return -ENOMEM; > + if (!ctx->name) { > + ret = -ENOMEM; > + goto out_free_ctx; > + } > > trigger = eventfd_ctx_fdget(fd); > if (IS_ERR(trigger)) { > @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > goto out_free_name; > } > > - /* > - * The MSIx vector table resides in device memory which may be cleared > - * via backdoor resets. We don't allow direct access to the vector > - * table so even if a userspace driver attempts to save/restore around > - * such a reset it would be unsuccessful. To avoid this, restore the > - * cached value of the message prior to enabling. > - */ > cmd = vfio_pci_memory_lock_and_enable(vdev); > if (msix) { > - struct msi_msg msg; > - > - get_cached_msi_msg(irq, &msg); > - pci_write_msi_msg(irq, &msg); > + if (irq == -EINVAL) { > + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); struct msi_map map = pci_msix_alloc_irq_at(pdev, vector, NULL); > + if (msix_map.index < 0) { > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > + ret = msix_map.index; > + goto out_put_eventfd_ctx; > + } > + irq = msix_map.virq; > + } else { > + /* > + * The MSIx vector table resides in device memory which > + * may be cleared via backdoor resets. We don't allow > + * direct access to the vector table so even if a > + * userspace driver attempts to save/restore around > + * such a reset it would be unsuccessful. To avoid > + * this, restore the cached value of the message prior > + * to enabling. > + */ You've only just copied this comment down to here, but I think it's a bit stale. Maybe we should update it to something that helps explain this split better, maybe: /* * If the vector was previously allocated, refresh the * on-device message data before enabling in case it had * been cleared or corrupted since writing. */ IIRC, that was the purpose of writing it back to the device and the blocking of direct access is no longer accurate anyway. > + struct msi_msg msg; > + > + get_cached_msi_msg(irq, &msg); > + pci_write_msi_msg(irq, &msg); > + } > } > > ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); > - vfio_pci_memory_unlock_and_restore(vdev, cmd); > if (ret) > - goto out_put_eventfd_ctx; > + goto out_free_irq_locked; > + > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > > ctx->producer.token = trigger; > ctx->producer.irq = irq; > @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > return 0; > > +out_free_irq_locked: > + if (allow_dyn_alloc && new_ctx) { struct msi_map map = { .index = vector, .virq = irq }; > + msix_map.index = vector; > + msix_map.virq = irq; > + pci_msix_free_irq(pdev, msix_map); > + } > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > out_put_eventfd_ctx: > eventfd_ctx_put(trigger); > out_free_name: > kfree(ctx->name); > ctx->name = NULL; > +out_free_ctx: > + if (allow_dyn_alloc && new_ctx) > + vfio_irq_ctx_free(vdev, ctx, vector); > return ret; > } > Do we really need the new_ctx test in the above cases? Thanks, Alex
On Thu, 30 Mar 2023 16:40:50 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Tue, 28 Mar 2023 14:53:34 -0700 > Reinette Chatre <reinette.chatre@intel.com> wrote: > > > Recently introduced pci_msix_alloc_irq_at() and pci_msix_free_irq() > > enables an individual MSI-X index to be allocated and freed after > > MSI-X enabling. > > > > Support dynamic MSI-X if supported by the device. Keep the association > > between allocated interrupt and vfio interrupt context. Allocate new > > context together with the new interrupt if no interrupt context exist > > for an MSI-X interrupt. Similarly, release an interrupt with its > > context. > > > > Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> > > --- > > Changes since RFC V1: > > - Add pointer to interrupt context as function parameter to > > vfio_irq_ctx_free(). (Alex) > > - Initialize new_ctx to false. (Dan Carpenter) > > - Only support dynamic allocation if device supports it. (Alex) > > > > drivers/vfio/pci/vfio_pci_intrs.c | 93 +++++++++++++++++++++++++------ > > 1 file changed, 76 insertions(+), 17 deletions(-) > > > > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > > index b3a258e58625..755b752ca17e 100644 > > --- a/drivers/vfio/pci/vfio_pci_intrs.c > > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > > @@ -55,6 +55,13 @@ struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, > > return xa_load(&vdev->ctx, index); > > } > > > > +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, > > + struct vfio_pci_irq_ctx *ctx, unsigned long index) > > +{ > > + xa_erase(&vdev->ctx, index); > > + kfree(ctx); > > +} Also, the function below should use this rather than open coding the same now. Thanks, Alex > > + > > static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) > > { > > struct vfio_pci_irq_ctx *ctx; > > @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > { > > struct pci_dev *pdev = vdev->pdev; > > struct vfio_pci_irq_ctx *ctx; > > + struct msi_map msix_map = {}; > > + bool allow_dyn_alloc = false; > > struct eventfd_ctx *trigger; > > + bool new_ctx = false; > > int irq, ret; > > u16 cmd; > > > > + /* Only MSI-X allows dynamic allocation. */ > > + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > > + allow_dyn_alloc = true; > > Should vfio-pci-core probe this and store it in a field on > vfio_pci_core_device so that we can simply use something like > vdev->has_dyn_msix throughout? > > > + > > ctx = vfio_irq_ctx_get(vdev, vector); > > - if (!ctx) > > + if (!ctx && !allow_dyn_alloc) > > return -EINVAL; > > + > > irq = pci_irq_vector(pdev, vector); > > + /* Context and interrupt are always allocated together. */ > > + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > > > > - if (ctx->trigger) { > > + if (ctx && ctx->trigger) { > > irq_bypass_unregister_producer(&ctx->producer); > > > > cmd = vfio_pci_memory_lock_and_enable(vdev); > > free_irq(irq, ctx->trigger); > > + if (allow_dyn_alloc) { > > It almost seems easier to define msix_map in each scope that it's used: > > struct msi_map map = { .index = vector, > .virq = irq }; > > > + msix_map.index = vector; > > + msix_map.virq = irq; > > + pci_msix_free_irq(pdev, msix_map); > > + irq = -EINVAL; > > + } > > vfio_pci_memory_unlock_and_restore(vdev, cmd); > > kfree(ctx->name); > > eventfd_ctx_put(ctx->trigger); > > ctx->trigger = NULL; > > + if (allow_dyn_alloc) { > > + vfio_irq_ctx_free(vdev, ctx, vector); > > + ctx = NULL; > > + } > > } > > > > if (fd < 0) > > return 0; > > > > + if (!ctx) { > > + ctx = vfio_irq_ctx_alloc_single(vdev, vector); > > + if (!ctx) > > + return -ENOMEM; > > + new_ctx = true; > > + } > > + > > ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", > > msix ? "x" : "", vector, pci_name(pdev)); > > - if (!ctx->name) > > - return -ENOMEM; > > + if (!ctx->name) { > > + ret = -ENOMEM; > > + goto out_free_ctx; > > + } > > > > trigger = eventfd_ctx_fdget(fd); > > if (IS_ERR(trigger)) { > > @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > goto out_free_name; > > } > > > > - /* > > - * The MSIx vector table resides in device memory which may be cleared > > - * via backdoor resets. We don't allow direct access to the vector > > - * table so even if a userspace driver attempts to save/restore around > > - * such a reset it would be unsuccessful. To avoid this, restore the > > - * cached value of the message prior to enabling. > > - */ > > cmd = vfio_pci_memory_lock_and_enable(vdev); > > if (msix) { > > - struct msi_msg msg; > > - > > - get_cached_msi_msg(irq, &msg); > > - pci_write_msi_msg(irq, &msg); > > + if (irq == -EINVAL) { > > + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); > > struct msi_map map = pci_msix_alloc_irq_at(pdev, > vector, NULL); > > + if (msix_map.index < 0) { > > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > > + ret = msix_map.index; > > + goto out_put_eventfd_ctx; > > + } > > + irq = msix_map.virq; > > + } else { > > + /* > > + * The MSIx vector table resides in device memory which > > + * may be cleared via backdoor resets. We don't allow > > + * direct access to the vector table so even if a > > + * userspace driver attempts to save/restore around > > + * such a reset it would be unsuccessful. To avoid > > + * this, restore the cached value of the message prior > > + * to enabling. > > + */ > > You've only just copied this comment down to here, but I think it's a > bit stale. Maybe we should update it to something that helps explain > this split better, maybe: > > /* > * If the vector was previously allocated, refresh the > * on-device message data before enabling in case it had > * been cleared or corrupted since writing. > */ > > IIRC, that was the purpose of writing it back to the device and the > blocking of direct access is no longer accurate anyway. > > > + struct msi_msg msg; > > + > > + get_cached_msi_msg(irq, &msg); > > + pci_write_msi_msg(irq, &msg); > > + } > > } > > > > ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); > > - vfio_pci_memory_unlock_and_restore(vdev, cmd); > > if (ret) > > - goto out_put_eventfd_ctx; > > + goto out_free_irq_locked; > > + > > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > > > > ctx->producer.token = trigger; > > ctx->producer.irq = irq; > > @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > > > > return 0; > > > > +out_free_irq_locked: > > + if (allow_dyn_alloc && new_ctx) { > > struct msi_map map = { .index = vector, > .virq = irq }; > > > + msix_map.index = vector; > > + msix_map.virq = irq; > > + pci_msix_free_irq(pdev, msix_map); > > + } > > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > > out_put_eventfd_ctx: > > eventfd_ctx_put(trigger); > > out_free_name: > > kfree(ctx->name); > > ctx->name = NULL; > > +out_free_ctx: > > + if (allow_dyn_alloc && new_ctx) > > + vfio_irq_ctx_free(vdev, ctx, vector); > > return ret; > > } > > > > Do we really need the new_ctx test in the above cases? Thanks, > > Alex
Hi Reinette, > @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct > vfio_pci_core_device *vdev, { > struct pci_dev *pdev = vdev->pdev; > struct vfio_pci_irq_ctx *ctx; > + struct msi_map msix_map = {}; > + bool allow_dyn_alloc = false; > struct eventfd_ctx *trigger; > + bool new_ctx = false; > int irq, ret; > u16 cmd; > > + /* Only MSI-X allows dynamic allocation. */ > + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > + allow_dyn_alloc = true; > + > ctx = vfio_irq_ctx_get(vdev, vector); > - if (!ctx) > + if (!ctx && !allow_dyn_alloc) > return -EINVAL; > + > irq = pci_irq_vector(pdev, vector); > + /* Context and interrupt are always allocated together. */ > + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > > - if (ctx->trigger) { > + if (ctx && ctx->trigger) { > irq_bypass_unregister_producer(&ctx->producer); > > cmd = vfio_pci_memory_lock_and_enable(vdev); > free_irq(irq, ctx->trigger); > + if (allow_dyn_alloc) { > + msix_map.index = vector; > + msix_map.virq = irq; > + pci_msix_free_irq(pdev, msix_map); > + irq = -EINVAL; > + } > vfio_pci_memory_unlock_and_restore(vdev, cmd); > kfree(ctx->name); > eventfd_ctx_put(ctx->trigger); > ctx->trigger = NULL; > + if (allow_dyn_alloc) { > + vfio_irq_ctx_free(vdev, ctx, vector); > + ctx = NULL; > + } > } > > if (fd < 0) > return 0; > While looking at this piece of code, one thought comes to me: It might be possible that userspace comes twice with the same valid fd for a specific vector, this vfio code will free the resource in if(ctx && ctx->trigger) for the second time, and then re-alloc again for the same fd given by userspace. Would that help if we firstly check e.g. ctx->trigger with the given valid fd, to see if the trigger is really changed or not? Thanks, Jing > + if (!ctx) { > + ctx = vfio_irq_ctx_alloc_single(vdev, vector); > + if (!ctx) > + return -ENOMEM; > + new_ctx = true; > + } > + > ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", > msix ? "x" : "", vector, pci_name(pdev)); > - if (!ctx->name) > - return -ENOMEM; > + if (!ctx->name) { > + ret = -ENOMEM; > + goto out_free_ctx; > + } > > trigger = eventfd_ctx_fdget(fd); > if (IS_ERR(trigger)) { > @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct > vfio_pci_core_device *vdev, > goto out_free_name; > } > > - /* > - * The MSIx vector table resides in device memory which may be cleared > - * via backdoor resets. We don't allow direct access to the vector > - * table so even if a userspace driver attempts to save/restore around > - * such a reset it would be unsuccessful. To avoid this, restore the > - * cached value of the message prior to enabling. > - */ > cmd = vfio_pci_memory_lock_and_enable(vdev); > if (msix) { > - struct msi_msg msg; > - > - get_cached_msi_msg(irq, &msg); > - pci_write_msi_msg(irq, &msg); > + if (irq == -EINVAL) { > + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); > + if (msix_map.index < 0) { > + vfio_pci_memory_unlock_and_restore(vdev, > cmd); > + ret = msix_map.index; > + goto out_put_eventfd_ctx; > + } > + irq = msix_map.virq; > + } else { > + /* > + * The MSIx vector table resides in device memory > which > + * may be cleared via backdoor resets. We don't allow > + * direct access to the vector table so even if a > + * userspace driver attempts to save/restore around > + * such a reset it would be unsuccessful. To avoid > + * this, restore the cached value of the message prior > + * to enabling. > + */ > + struct msi_msg msg; > + > + get_cached_msi_msg(irq, &msg); > + pci_write_msi_msg(irq, &msg); > + } > } > > ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); > - vfio_pci_memory_unlock_and_restore(vdev, cmd); > if (ret) > - goto out_put_eventfd_ctx; > + goto out_free_irq_locked; > + > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > > ctx->producer.token = trigger; > ctx->producer.irq = irq; > @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct > vfio_pci_core_device *vdev, > > return 0; > > +out_free_irq_locked: > + if (allow_dyn_alloc && new_ctx) { > + msix_map.index = vector; > + msix_map.virq = irq; > + pci_msix_free_irq(pdev, msix_map); > + } > + vfio_pci_memory_unlock_and_restore(vdev, cmd); > out_put_eventfd_ctx: > eventfd_ctx_put(trigger); > out_free_name: > kfree(ctx->name); > ctx->name = NULL; > +out_free_ctx: > + if (allow_dyn_alloc && new_ctx) > + vfio_irq_ctx_free(vdev, ctx, vector); > return ret; > } > > -- > 2.34.1
On Fri, 31 Mar 2023 10:02:32 +0000 "Liu, Jing2" <jing2.liu@intel.com> wrote: > Hi Reinette, > > > @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct > > vfio_pci_core_device *vdev, { > > struct pci_dev *pdev = vdev->pdev; > > struct vfio_pci_irq_ctx *ctx; > > + struct msi_map msix_map = {}; > > + bool allow_dyn_alloc = false; > > struct eventfd_ctx *trigger; > > + bool new_ctx = false; > > int irq, ret; > > u16 cmd; > > > > + /* Only MSI-X allows dynamic allocation. */ > > + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > > + allow_dyn_alloc = true; > > + > > ctx = vfio_irq_ctx_get(vdev, vector); > > - if (!ctx) > > + if (!ctx && !allow_dyn_alloc) > > return -EINVAL; > > + > > irq = pci_irq_vector(pdev, vector); > > + /* Context and interrupt are always allocated together. */ > > + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > > > > - if (ctx->trigger) { > > + if (ctx && ctx->trigger) { > > irq_bypass_unregister_producer(&ctx->producer); > > > > cmd = vfio_pci_memory_lock_and_enable(vdev); > > free_irq(irq, ctx->trigger); > > + if (allow_dyn_alloc) { > > + msix_map.index = vector; > > + msix_map.virq = irq; > > + pci_msix_free_irq(pdev, msix_map); > > + irq = -EINVAL; > > + } > > vfio_pci_memory_unlock_and_restore(vdev, cmd); > > kfree(ctx->name); > > eventfd_ctx_put(ctx->trigger); > > ctx->trigger = NULL; > > + if (allow_dyn_alloc) { > > + vfio_irq_ctx_free(vdev, ctx, vector); > > + ctx = NULL; > > + } > > } > > > > if (fd < 0) > > return 0; > > > > While looking at this piece of code, one thought comes to me: > It might be possible that userspace comes twice with the same valid fd for a specific > vector, this vfio code will free the resource in if(ctx && ctx->trigger) for the second > time, and then re-alloc again for the same fd given by userspace. > > Would that help if we firstly check e.g. ctx->trigger with the given valid fd, to see if > the trigger is really changed or not? It's rather a made-up situation, if userspace wants to avoid bouncing the vector when the eventfd hasn't changed they can simply test this before calling the ioctl. Thanks, Alex
Hi Alex, On 3/30/2023 3:42 PM, Alex Williamson wrote: > On Thu, 30 Mar 2023 16:40:50 -0600 > Alex Williamson <alex.williamson@redhat.com> wrote: > >> On Tue, 28 Mar 2023 14:53:34 -0700 >> Reinette Chatre <reinette.chatre@intel.com> wrote: >> ... >>> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c >>> index b3a258e58625..755b752ca17e 100644 >>> --- a/drivers/vfio/pci/vfio_pci_intrs.c >>> +++ b/drivers/vfio/pci/vfio_pci_intrs.c >>> @@ -55,6 +55,13 @@ struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, >>> return xa_load(&vdev->ctx, index); >>> } >>> >>> +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, >>> + struct vfio_pci_irq_ctx *ctx, unsigned long index) >>> +{ >>> + xa_erase(&vdev->ctx, index); >>> + kfree(ctx); >>> +} > > Also, the function below should use this rather than open coding the > same now. Thanks, It should, yes. Thank you. Will do. >>> static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) >>> { >>> struct vfio_pci_irq_ctx *ctx; >>> @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, >>> { >>> struct pci_dev *pdev = vdev->pdev; >>> struct vfio_pci_irq_ctx *ctx; >>> + struct msi_map msix_map = {}; >>> + bool allow_dyn_alloc = false; >>> struct eventfd_ctx *trigger; >>> + bool new_ctx = false; >>> int irq, ret; >>> u16 cmd; >>> >>> + /* Only MSI-X allows dynamic allocation. */ >>> + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) >>> + allow_dyn_alloc = true; >> >> Should vfio-pci-core probe this and store it in a field on >> vfio_pci_core_device so that we can simply use something like >> vdev->has_dyn_msix throughout? It is not obvious to me if you mean this with vfio-pci-core probe, but it looks like a change to vfio_pci_core_enable() may be appropriate with a snippet like below: diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a743b98ba29a..a474ce80a555 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -533,6 +533,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } else vdev->msix_bar = 0xFF; + vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev); + if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; Please do note that I placed it outside of the earlier "if (msix_pos)" since pci_msix_can_alloc_dyn() has its own "if (!dev->msix_cap)". If you prefer to keep all the vdev->*msix* together I can move it into the if statement. With vdev->has_dyn_msix available "allow_dyn_alloc" can be dropped as you stated. >> >>> + >>> ctx = vfio_irq_ctx_get(vdev, vector); >>> - if (!ctx) >>> + if (!ctx && !allow_dyn_alloc) >>> return -EINVAL; >>> + >>> irq = pci_irq_vector(pdev, vector); >>> + /* Context and interrupt are always allocated together. */ >>> + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); >>> >>> - if (ctx->trigger) { >>> + if (ctx && ctx->trigger) { >>> irq_bypass_unregister_producer(&ctx->producer); >>> >>> cmd = vfio_pci_memory_lock_and_enable(vdev); >>> free_irq(irq, ctx->trigger); >>> + if (allow_dyn_alloc) { >> >> It almost seems easier to define msix_map in each scope that it's used: >> >> struct msi_map map = { .index = vector, >> .virq = irq }; >> Sure. Will do. >>> + msix_map.index = vector; >>> + msix_map.virq = irq; >>> + pci_msix_free_irq(pdev, msix_map); >>> + irq = -EINVAL; >>> + } >>> vfio_pci_memory_unlock_and_restore(vdev, cmd); >>> kfree(ctx->name); >>> eventfd_ctx_put(ctx->trigger); >>> ctx->trigger = NULL; >>> + if (allow_dyn_alloc) { >>> + vfio_irq_ctx_free(vdev, ctx, vector); >>> + ctx = NULL; >>> + } >>> } >>> >>> if (fd < 0) >>> return 0; >>> >>> + if (!ctx) { >>> + ctx = vfio_irq_ctx_alloc_single(vdev, vector); >>> + if (!ctx) >>> + return -ENOMEM; >>> + new_ctx = true; >>> + } >>> + >>> ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", >>> msix ? "x" : "", vector, pci_name(pdev)); >>> - if (!ctx->name) >>> - return -ENOMEM; >>> + if (!ctx->name) { >>> + ret = -ENOMEM; >>> + goto out_free_ctx; >>> + } >>> >>> trigger = eventfd_ctx_fdget(fd); >>> if (IS_ERR(trigger)) { >>> @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, >>> goto out_free_name; >>> } >>> >>> - /* >>> - * The MSIx vector table resides in device memory which may be cleared >>> - * via backdoor resets. We don't allow direct access to the vector >>> - * table so even if a userspace driver attempts to save/restore around >>> - * such a reset it would be unsuccessful. To avoid this, restore the >>> - * cached value of the message prior to enabling. >>> - */ >>> cmd = vfio_pci_memory_lock_and_enable(vdev); >>> if (msix) { >>> - struct msi_msg msg; >>> - >>> - get_cached_msi_msg(irq, &msg); >>> - pci_write_msi_msg(irq, &msg); >>> + if (irq == -EINVAL) { >>> + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); >> >> struct msi_map map = pci_msix_alloc_irq_at(pdev, >> vector, NULL); Will do. >>> + if (msix_map.index < 0) { >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); >>> + ret = msix_map.index; >>> + goto out_put_eventfd_ctx; >>> + } >>> + irq = msix_map.virq; >>> + } else { >>> + /* >>> + * The MSIx vector table resides in device memory which >>> + * may be cleared via backdoor resets. We don't allow >>> + * direct access to the vector table so even if a >>> + * userspace driver attempts to save/restore around >>> + * such a reset it would be unsuccessful. To avoid >>> + * this, restore the cached value of the message prior >>> + * to enabling. >>> + */ >> >> You've only just copied this comment down to here, but I think it's a >> bit stale. Maybe we should update it to something that helps explain >> this split better, maybe: >> >> /* >> * If the vector was previously allocated, refresh the >> * on-device message data before enabling in case it had >> * been cleared or corrupted since writing. >> */ >> >> IIRC, that was the purpose of writing it back to the device and the >> blocking of direct access is no longer accurate anyway. Thank you. Will do. To keep this patch focused I plan to separate this change into a new prep patch that will be placed earlier in this series. >> >>> + struct msi_msg msg; >>> + >>> + get_cached_msi_msg(irq, &msg); >>> + pci_write_msi_msg(irq, &msg); >>> + } >>> } >>> >>> ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); >>> - vfio_pci_memory_unlock_and_restore(vdev, cmd); >>> if (ret) >>> - goto out_put_eventfd_ctx; >>> + goto out_free_irq_locked; >>> + >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); >>> >>> ctx->producer.token = trigger; >>> ctx->producer.irq = irq; >>> @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, >>> >>> return 0; >>> >>> +out_free_irq_locked: >>> + if (allow_dyn_alloc && new_ctx) { >> >> struct msi_map map = { .index = vector, >> .virq = irq }; >> Will do. >>> + msix_map.index = vector; >>> + msix_map.virq = irq; >>> + pci_msix_free_irq(pdev, msix_map); >>> + } >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); >>> out_put_eventfd_ctx: >>> eventfd_ctx_put(trigger); >>> out_free_name: >>> kfree(ctx->name); >>> ctx->name = NULL; >>> +out_free_ctx: >>> + if (allow_dyn_alloc && new_ctx) >>> + vfio_irq_ctx_free(vdev, ctx, vector); >>> return ret; >>> } >>> >> >> Do we really need the new_ctx test in the above cases? Thanks, new_ctx is not required for correctness but instead is used to keep the code symmetric. Specifically, if the user enables MSI-X without providing triggers and then later assign triggers then an error path without new_ctx would unwind more than done in this function, it would free the context that was allocated within vfio_msi_enable(). Reinette
On Fri, 31 Mar 2023 10:49:16 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Hi Alex, > > On 3/30/2023 3:42 PM, Alex Williamson wrote: > > On Thu, 30 Mar 2023 16:40:50 -0600 > > Alex Williamson <alex.williamson@redhat.com> wrote: > > > >> On Tue, 28 Mar 2023 14:53:34 -0700 > >> Reinette Chatre <reinette.chatre@intel.com> wrote: > >> > > ... > > >>> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > >>> index b3a258e58625..755b752ca17e 100644 > >>> --- a/drivers/vfio/pci/vfio_pci_intrs.c > >>> +++ b/drivers/vfio/pci/vfio_pci_intrs.c > >>> @@ -55,6 +55,13 @@ struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, > >>> return xa_load(&vdev->ctx, index); > >>> } > >>> > >>> +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, > >>> + struct vfio_pci_irq_ctx *ctx, unsigned long index) > >>> +{ > >>> + xa_erase(&vdev->ctx, index); > >>> + kfree(ctx); > >>> +} > > > > Also, the function below should use this rather than open coding the > > same now. Thanks, > > It should, yes. Thank you. Will do. > > > >>> static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) > >>> { > >>> struct vfio_pci_irq_ctx *ctx; > >>> @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > >>> { > >>> struct pci_dev *pdev = vdev->pdev; > >>> struct vfio_pci_irq_ctx *ctx; > >>> + struct msi_map msix_map = {}; > >>> + bool allow_dyn_alloc = false; > >>> struct eventfd_ctx *trigger; > >>> + bool new_ctx = false; > >>> int irq, ret; > >>> u16 cmd; > >>> > >>> + /* Only MSI-X allows dynamic allocation. */ > >>> + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > >>> + allow_dyn_alloc = true; > >> > >> Should vfio-pci-core probe this and store it in a field on > >> vfio_pci_core_device so that we can simply use something like > >> vdev->has_dyn_msix throughout? > > It is not obvious to me if you mean this with vfio-pci-core probe, > but it looks like a change to vfio_pci_core_enable() may be > appropriate with a snippet like below: > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index a743b98ba29a..a474ce80a555 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -533,6 +533,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) > } else > vdev->msix_bar = 0xFF; > > + vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev); > + > if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) > vdev->has_vga = true; > > Please do note that I placed it outside of the earlier "if (msix_pos)" since > pci_msix_can_alloc_dyn() has its own "if (!dev->msix_cap)". If you prefer > to keep all the vdev->*msix* together I can move it into the if statement. Sure, just for common grouping I'd probably put it within the existing msix_cap branch. > With vdev->has_dyn_msix available "allow_dyn_alloc" can be dropped as you > stated. > > >> > >>> + > >>> ctx = vfio_irq_ctx_get(vdev, vector); > >>> - if (!ctx) > >>> + if (!ctx && !allow_dyn_alloc) > >>> return -EINVAL; > >>> + > >>> irq = pci_irq_vector(pdev, vector); > >>> + /* Context and interrupt are always allocated together. */ > >>> + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > >>> > >>> - if (ctx->trigger) { > >>> + if (ctx && ctx->trigger) { > >>> irq_bypass_unregister_producer(&ctx->producer); > >>> > >>> cmd = vfio_pci_memory_lock_and_enable(vdev); > >>> free_irq(irq, ctx->trigger); > >>> + if (allow_dyn_alloc) { > >> > >> It almost seems easier to define msix_map in each scope that it's used: > >> > >> struct msi_map map = { .index = vector, > >> .virq = irq }; > >> > > Sure. Will do. > > >>> + msix_map.index = vector; > >>> + msix_map.virq = irq; > >>> + pci_msix_free_irq(pdev, msix_map); > >>> + irq = -EINVAL; > >>> + } > >>> vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>> kfree(ctx->name); > >>> eventfd_ctx_put(ctx->trigger); > >>> ctx->trigger = NULL; > >>> + if (allow_dyn_alloc) { > >>> + vfio_irq_ctx_free(vdev, ctx, vector); > >>> + ctx = NULL; > >>> + } > >>> } > >>> > >>> if (fd < 0) > >>> return 0; > >>> > >>> + if (!ctx) { > >>> + ctx = vfio_irq_ctx_alloc_single(vdev, vector); > >>> + if (!ctx) > >>> + return -ENOMEM; > >>> + new_ctx = true; > >>> + } > >>> + > >>> ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", > >>> msix ? "x" : "", vector, pci_name(pdev)); > >>> - if (!ctx->name) > >>> - return -ENOMEM; > >>> + if (!ctx->name) { > >>> + ret = -ENOMEM; > >>> + goto out_free_ctx; > >>> + } > >>> > >>> trigger = eventfd_ctx_fdget(fd); > >>> if (IS_ERR(trigger)) { > >>> @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > >>> goto out_free_name; > >>> } > >>> > >>> - /* > >>> - * The MSIx vector table resides in device memory which may be cleared > >>> - * via backdoor resets. We don't allow direct access to the vector > >>> - * table so even if a userspace driver attempts to save/restore around > >>> - * such a reset it would be unsuccessful. To avoid this, restore the > >>> - * cached value of the message prior to enabling. > >>> - */ > >>> cmd = vfio_pci_memory_lock_and_enable(vdev); > >>> if (msix) { > >>> - struct msi_msg msg; > >>> - > >>> - get_cached_msi_msg(irq, &msg); > >>> - pci_write_msi_msg(irq, &msg); > >>> + if (irq == -EINVAL) { > >>> + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); > >> > >> struct msi_map map = pci_msix_alloc_irq_at(pdev, > >> vector, NULL); > > Will do. > > >>> + if (msix_map.index < 0) { > >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>> + ret = msix_map.index; > >>> + goto out_put_eventfd_ctx; > >>> + } > >>> + irq = msix_map.virq; > >>> + } else { > >>> + /* > >>> + * The MSIx vector table resides in device memory which > >>> + * may be cleared via backdoor resets. We don't allow > >>> + * direct access to the vector table so even if a > >>> + * userspace driver attempts to save/restore around > >>> + * such a reset it would be unsuccessful. To avoid > >>> + * this, restore the cached value of the message prior > >>> + * to enabling. > >>> + */ > >> > >> You've only just copied this comment down to here, but I think it's a > >> bit stale. Maybe we should update it to something that helps explain > >> this split better, maybe: > >> > >> /* > >> * If the vector was previously allocated, refresh the > >> * on-device message data before enabling in case it had > >> * been cleared or corrupted since writing. > >> */ > >> > >> IIRC, that was the purpose of writing it back to the device and the > >> blocking of direct access is no longer accurate anyway. > > Thank you. Will do. To keep this patch focused I plan to separate > this change into a new prep patch that will be placed earlier in > this series. Ok. > >> > >>> + struct msi_msg msg; > >>> + > >>> + get_cached_msi_msg(irq, &msg); > >>> + pci_write_msi_msg(irq, &msg); > >>> + } > >>> } > >>> > >>> ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); > >>> - vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>> if (ret) > >>> - goto out_put_eventfd_ctx; > >>> + goto out_free_irq_locked; > >>> + > >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>> > >>> ctx->producer.token = trigger; > >>> ctx->producer.irq = irq; > >>> @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, > >>> > >>> return 0; > >>> > >>> +out_free_irq_locked: > >>> + if (allow_dyn_alloc && new_ctx) { > >> > >> struct msi_map map = { .index = vector, > >> .virq = irq }; > >> > > Will do. > > >>> + msix_map.index = vector; > >>> + msix_map.virq = irq; > >>> + pci_msix_free_irq(pdev, msix_map); > >>> + } > >>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>> out_put_eventfd_ctx: > >>> eventfd_ctx_put(trigger); > >>> out_free_name: > >>> kfree(ctx->name); > >>> ctx->name = NULL; > >>> +out_free_ctx: > >>> + if (allow_dyn_alloc && new_ctx) > >>> + vfio_irq_ctx_free(vdev, ctx, vector); > >>> return ret; > >>> } > >>> > >> > >> Do we really need the new_ctx test in the above cases? Thanks, > > new_ctx is not required for correctness but instead is used to keep > the code symmetric. > Specifically, if the user enables MSI-X without providing triggers and > then later assign triggers then an error path without new_ctx would unwind > more than done in this function, it would free the context that > was allocated within vfio_msi_enable(). Seems like we already have that asymmetry, if a trigger is unset we'll free the ctx allocated by vfio_msi_enable(). Tracking which are allocated where is unnecessarily complex, how about a policy that devices supporting vdev->has_dyn_msix only ever have active contexts allocated? Thanks, Alex
Hi Alex, On 3/31/2023 3:24 PM, Alex Williamson wrote: > On Fri, 31 Mar 2023 10:49:16 -0700 > Reinette Chatre <reinette.chatre@intel.com> wrote: >> On 3/30/2023 3:42 PM, Alex Williamson wrote: >>> On Thu, 30 Mar 2023 16:40:50 -0600 >>> Alex Williamson <alex.williamson@redhat.com> wrote: >>> >>>> On Tue, 28 Mar 2023 14:53:34 -0700 >>>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>> ... >>>>> + msix_map.index = vector; >>>>> + msix_map.virq = irq; >>>>> + pci_msix_free_irq(pdev, msix_map); >>>>> + } >>>>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); >>>>> out_put_eventfd_ctx: >>>>> eventfd_ctx_put(trigger); >>>>> out_free_name: >>>>> kfree(ctx->name); >>>>> ctx->name = NULL; >>>>> +out_free_ctx: >>>>> + if (allow_dyn_alloc && new_ctx) >>>>> + vfio_irq_ctx_free(vdev, ctx, vector); >>>>> return ret; >>>>> } >>>>> >>>> >>>> Do we really need the new_ctx test in the above cases? Thanks, >> >> new_ctx is not required for correctness but instead is used to keep >> the code symmetric. >> Specifically, if the user enables MSI-X without providing triggers and >> then later assign triggers then an error path without new_ctx would unwind >> more than done in this function, it would free the context that >> was allocated within vfio_msi_enable(). > > Seems like we already have that asymmetry, if a trigger is unset we'll > free the ctx allocated by vfio_msi_enable(). Tracking which are Apologies, but could you please elaborate on where the asymmetry is? I am not able to see a flow in this solution where the ctx allocated by vfio_msi_enable() is freed if the trigger is unset. > allocated where is unnecessarily complex, how about a policy that I do not see this as tracking where allocations are made. Instead I see it as containing/compartmentalizing state changes with the goal of making the code easier to understand and maintain. Specifically, new_ctx is used so that if vfio_msi_set_vector_signal() fails, the state before and after vfio_msi_set_vector_signal() will be the same. I do agree that it makes vfio_msi_set_vector_signal() more complex and I can remove new_ctx if you find that this is unnecessary after considering the motivations behind its use. > devices supporting vdev->has_dyn_msix only ever have active contexts > allocated? Thanks, What do you see as an "active context"? A policy that is currently enforced is that an allocated context always has an allocated interrupt associated with it. I do not see how this could be expanded to also require an enabled interrupt because interrupt enabling requires a trigger that may not be available. Reinette
On Mon, 3 Apr 2023 10:31:23 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Hi Alex, > > On 3/31/2023 3:24 PM, Alex Williamson wrote: > > On Fri, 31 Mar 2023 10:49:16 -0700 > > Reinette Chatre <reinette.chatre@intel.com> wrote: > >> On 3/30/2023 3:42 PM, Alex Williamson wrote: > >>> On Thu, 30 Mar 2023 16:40:50 -0600 > >>> Alex Williamson <alex.williamson@redhat.com> wrote: > >>> > >>>> On Tue, 28 Mar 2023 14:53:34 -0700 > >>>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>> > > ... > > >>>>> + msix_map.index = vector; > >>>>> + msix_map.virq = irq; > >>>>> + pci_msix_free_irq(pdev, msix_map); > >>>>> + } > >>>>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>>>> out_put_eventfd_ctx: > >>>>> eventfd_ctx_put(trigger); > >>>>> out_free_name: > >>>>> kfree(ctx->name); > >>>>> ctx->name = NULL; > >>>>> +out_free_ctx: > >>>>> + if (allow_dyn_alloc && new_ctx) > >>>>> + vfio_irq_ctx_free(vdev, ctx, vector); > >>>>> return ret; > >>>>> } > >>>>> > >>>> > >>>> Do we really need the new_ctx test in the above cases? Thanks, > >> > >> new_ctx is not required for correctness but instead is used to keep > >> the code symmetric. > >> Specifically, if the user enables MSI-X without providing triggers and > >> then later assign triggers then an error path without new_ctx would unwind > >> more than done in this function, it would free the context that > >> was allocated within vfio_msi_enable(). > > > > Seems like we already have that asymmetry, if a trigger is unset we'll > > free the ctx allocated by vfio_msi_enable(). Tracking which are > > Apologies, but could you please elaborate on where the asymmetry is? I am > not able to see a flow in this solution where the ctx allocated by > vfio_msi_enable() is freed if the trigger is unset. The user first calls SET_IRQS to enable MSI-X with some number of vectors with (potentially) an eventfd for each vector. The user later calls SET_IRQS passing a -1 eventfd for one or more of the vectors with an eventfd initialized in the prior step. Given that we find the ctx, the ctx has a trigger, and assuming dynamic allocation is supported, the ctx is freed and vfio_msi_set_vector_signal() returns w/o allocating a new ctx. We've de-allocated both the irq and context initialized from vfio_msi_enable(). > > allocated where is unnecessarily complex, how about a policy that > > I do not see this as tracking where allocations are made. Instead I > see it as containing/compartmentalizing state changes with the goal of > making the code easier to understand and maintain. Specifically, new_ctx > is used so that if vfio_msi_set_vector_signal() fails, the state > before and after vfio_msi_set_vector_signal() will be the same. That's not really possible given how we teardown the existing ctx before configuring the new one and unwind to disable contexts in vfio_msi_set_block() > I do agree that it makes vfio_msi_set_vector_signal() more complex > and I can remove new_ctx if you find that this is unnecessary after > considering the motivations behind its use. If the goal is to allow the user to swap one eventfd for another, where the result will always be the new eventfd on success or the old eventfd on error, I don't see that this code does that, or that we've ever attempted to make such a guarantee. If the ioctl errors, I think the eventfds are generally deconfigured. We certainly have the unwind code that we discussed earlier that deconfigures all the vectors previously touched in the loop (which seems to be another path where we could de-allocate from the set of initial ctxs). > > devices supporting vdev->has_dyn_msix only ever have active contexts > > allocated? Thanks, > > What do you see as an "active context"? A policy that is currently enforced > is that an allocated context always has an allocated interrupt associated > with it. I do not see how this could be expanded to also require an > enabled interrupt because interrupt enabling requires a trigger that > may not be available. A context is essentially meant to track a trigger, ie. an eventfd provided by the user. In the static case all the irqs are necessarily pre-allocated, therefore we had no reason to consider a dynamic array for the contexts. However, a given context is really only "active" if it has a trigger, otherwise it's just a placeholder. When the placeholder is filled by an eventfd, the pre-allocated irq is enabled. This proposal seems to be a hybrid approach, pre-allocating some initial set of irqs and contexts and expecting the differentiation to occur only when new vectors are added, though we have some disagreement about this per above. Unfortunately I don't see an API to enable MSI-X without some vectors, so some pre-allocation of irqs seems to be required regardless. But if non-active contexts were only placeholders in the pre-dynamic world and we now manage them via a dynamic array, why is there any pre-allocation of contexts without knowing the nature of the eventfd to fill it? We could have more commonality between cases if contexts are always dynamically allocated, which might simplify differentiation of the has_dyn_msix cases largely to wrappers allocating and freeing irqs. Thanks, Alex
Hi Alex, On 4/3/2023 1:22 PM, Alex Williamson wrote: > On Mon, 3 Apr 2023 10:31:23 -0700 > Reinette Chatre <reinette.chatre@intel.com> wrote: > >> Hi Alex, >> >> On 3/31/2023 3:24 PM, Alex Williamson wrote: >>> On Fri, 31 Mar 2023 10:49:16 -0700 >>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>> On 3/30/2023 3:42 PM, Alex Williamson wrote: >>>>> On Thu, 30 Mar 2023 16:40:50 -0600 >>>>> Alex Williamson <alex.williamson@redhat.com> wrote: >>>>> >>>>>> On Tue, 28 Mar 2023 14:53:34 -0700 >>>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>>>> >> >> ... >> >>>>>>> + msix_map.index = vector; >>>>>>> + msix_map.virq = irq; >>>>>>> + pci_msix_free_irq(pdev, msix_map); >>>>>>> + } >>>>>>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); >>>>>>> out_put_eventfd_ctx: >>>>>>> eventfd_ctx_put(trigger); >>>>>>> out_free_name: >>>>>>> kfree(ctx->name); >>>>>>> ctx->name = NULL; >>>>>>> +out_free_ctx: >>>>>>> + if (allow_dyn_alloc && new_ctx) >>>>>>> + vfio_irq_ctx_free(vdev, ctx, vector); >>>>>>> return ret; >>>>>>> } >>>>>>> >>>>>> >>>>>> Do we really need the new_ctx test in the above cases? Thanks, >>>> >>>> new_ctx is not required for correctness but instead is used to keep >>>> the code symmetric. >>>> Specifically, if the user enables MSI-X without providing triggers and >>>> then later assign triggers then an error path without new_ctx would unwind >>>> more than done in this function, it would free the context that >>>> was allocated within vfio_msi_enable(). >>> >>> Seems like we already have that asymmetry, if a trigger is unset we'll >>> free the ctx allocated by vfio_msi_enable(). Tracking which are >> >> Apologies, but could you please elaborate on where the asymmetry is? I am >> not able to see a flow in this solution where the ctx allocated by >> vfio_msi_enable() is freed if the trigger is unset. > > The user first calls SET_IRQS to enable MSI-X with some number of > vectors with (potentially) an eventfd for each vector. The user later > calls SET_IRQS passing a -1 eventfd for one or more of the vectors with > an eventfd initialized in the prior step. Given that we find the ctx, > the ctx has a trigger, and assuming dynamic allocation is supported, the > ctx is freed and vfio_msi_set_vector_signal() returns w/o allocating a > new ctx. We've de-allocated both the irq and context initialized from > vfio_msi_enable(). This is correct. The comment I responded to was in regards to an unset trigger. The flow you describe is when a trigger is set. Not that it changes your point though, which is that vfio_msi_set_vector_signal() frees memory allocated by vfio_msi_enable(). This is clear to me. This is intended behavior. My concern is/was with the error path where a function failing may not be expected to change state, you address that concern below. >>> allocated where is unnecessarily complex, how about a policy that >> >> I do not see this as tracking where allocations are made. Instead I >> see it as containing/compartmentalizing state changes with the goal of >> making the code easier to understand and maintain. Specifically, new_ctx >> is used so that if vfio_msi_set_vector_signal() fails, the state >> before and after vfio_msi_set_vector_signal() will be the same. > > That's not really possible given how we teardown the existing ctx > before configuring the new one and unwind to disable contexts in > vfio_msi_set_block() Very unlikely indeed. I agree. >> I do agree that it makes vfio_msi_set_vector_signal() more complex >> and I can remove new_ctx if you find that this is unnecessary after >> considering the motivations behind its use. > > If the goal is to allow the user to swap one eventfd for another, where > the result will always be the new eventfd on success or the old eventfd > on error, I don't see that this code does that, or that we've ever > attempted to make such a guarantee. If the ioctl errors, I think the > eventfds are generally deconfigured. We certainly have the unwind code > that we discussed earlier that deconfigures all the vectors previously > touched in the loop (which seems to be another path where we could > de-allocate from the set of initial ctxs). Thank you for your patience in hearing and addressing my concerns. I plan to remove new_ctx in the next version. >>> devices supporting vdev->has_dyn_msix only ever have active contexts >>> allocated? Thanks, >> >> What do you see as an "active context"? A policy that is currently enforced >> is that an allocated context always has an allocated interrupt associated >> with it. I do not see how this could be expanded to also require an >> enabled interrupt because interrupt enabling requires a trigger that >> may not be available. > > A context is essentially meant to track a trigger, ie. an eventfd > provided by the user. In the static case all the irqs are necessarily > pre-allocated, therefore we had no reason to consider a dynamic array > for the contexts. However, a given context is really only "active" if > it has a trigger, otherwise it's just a placeholder. When the > placeholder is filled by an eventfd, the pre-allocated irq is enabled. I see. > > This proposal seems to be a hybrid approach, pre-allocating some > initial set of irqs and contexts and expecting the differentiation to > occur only when new vectors are added, though we have some disagreement > about this per above. Unfortunately I don't see an API to enable MSI-X > without some vectors, so some pre-allocation of irqs seems to be > required regardless. Right. pci_alloc_irq_vectors() or equivalent continues to be needed to enable MSI-X. Even so, it does seem possible (within vfio_msi_enable()) to just allocate one vector using pci_alloc_irq_vectors() and then immediately free it using pci_msix_free_irq(). What do you think? If I understand correctly this can be done without allocating any context and leave MSI-X enabled without any interrupts allocated. This could be a way to accomplish the "active context" policy for dynamic allocation. This is not a policy that can be applied broadly to interrupt contexts though because MSI and non-dynamic MSI-X could still have contexts with allocated interrupts without eventfd. > But if non-active contexts were only placeholders in the pre-dynamic > world and we now manage them via a dynamic array, why is there any > pre-allocation of contexts without knowing the nature of the eventfd to > fill it? We could have more commonality between cases if contexts are > always dynamically allocated, which might simplify differentiation of > the has_dyn_msix cases largely to wrappers allocating and freeing irqs. > Thanks, Thank you very much for your guidance. I will digest this some more and see how wrappers could be used. In the mean time while trying to think how to unify this code I do think there is an issue in this patch in that the get_cached_msi_msg()/pci_write_msi_msg() should not be in an else branch. Specifically, I think it needs to be: if (msix) { if (irq == -EINVAL) { /* dynamically allocate interrupt */ } get_cached_msi_msg(irq, &msg); pci_write_msi_msg(irq, &msg); } Thank you very much Reinette
On Mon, 3 Apr 2023 15:50:54 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Hi Alex, > > On 4/3/2023 1:22 PM, Alex Williamson wrote: > > On Mon, 3 Apr 2023 10:31:23 -0700 > > Reinette Chatre <reinette.chatre@intel.com> wrote: > > > >> Hi Alex, > >> > >> On 3/31/2023 3:24 PM, Alex Williamson wrote: > >>> On Fri, 31 Mar 2023 10:49:16 -0700 > >>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>> On 3/30/2023 3:42 PM, Alex Williamson wrote: > >>>>> On Thu, 30 Mar 2023 16:40:50 -0600 > >>>>> Alex Williamson <alex.williamson@redhat.com> wrote: > >>>>> > >>>>>> On Tue, 28 Mar 2023 14:53:34 -0700 > >>>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>>>> > >> > >> ... > >> > >>>>>>> + msix_map.index = vector; > >>>>>>> + msix_map.virq = irq; > >>>>>>> + pci_msix_free_irq(pdev, msix_map); > >>>>>>> + } > >>>>>>> + vfio_pci_memory_unlock_and_restore(vdev, cmd); > >>>>>>> out_put_eventfd_ctx: > >>>>>>> eventfd_ctx_put(trigger); > >>>>>>> out_free_name: > >>>>>>> kfree(ctx->name); > >>>>>>> ctx->name = NULL; > >>>>>>> +out_free_ctx: > >>>>>>> + if (allow_dyn_alloc && new_ctx) > >>>>>>> + vfio_irq_ctx_free(vdev, ctx, vector); > >>>>>>> return ret; > >>>>>>> } > >>>>>>> > >>>>>> > >>>>>> Do we really need the new_ctx test in the above cases? Thanks, > >>>> > >>>> new_ctx is not required for correctness but instead is used to keep > >>>> the code symmetric. > >>>> Specifically, if the user enables MSI-X without providing triggers and > >>>> then later assign triggers then an error path without new_ctx would unwind > >>>> more than done in this function, it would free the context that > >>>> was allocated within vfio_msi_enable(). > >>> > >>> Seems like we already have that asymmetry, if a trigger is unset we'll > >>> free the ctx allocated by vfio_msi_enable(). Tracking which are > >> > >> Apologies, but could you please elaborate on where the asymmetry is? I am > >> not able to see a flow in this solution where the ctx allocated by > >> vfio_msi_enable() is freed if the trigger is unset. > > > > The user first calls SET_IRQS to enable MSI-X with some number of > > vectors with (potentially) an eventfd for each vector. The user later > > calls SET_IRQS passing a -1 eventfd for one or more of the vectors with > > an eventfd initialized in the prior step. Given that we find the ctx, > > the ctx has a trigger, and assuming dynamic allocation is supported, the > > ctx is freed and vfio_msi_set_vector_signal() returns w/o allocating a > > new ctx. We've de-allocated both the irq and context initialized from > > vfio_msi_enable(). > > This is correct. The comment I responded to was in regards to an unset > trigger. The flow you describe is when a trigger is set. Not that > it changes your point though, which is that vfio_msi_set_vector_signal() > frees memory allocated by vfio_msi_enable(). This is clear to me. This > is intended behavior. My concern is/was with the error path where a function > failing may not be expected to change state, you address that concern below. > > >>> allocated where is unnecessarily complex, how about a policy that > >> > >> I do not see this as tracking where allocations are made. Instead I > >> see it as containing/compartmentalizing state changes with the goal of > >> making the code easier to understand and maintain. Specifically, new_ctx > >> is used so that if vfio_msi_set_vector_signal() fails, the state > >> before and after vfio_msi_set_vector_signal() will be the same. > > > > That's not really possible given how we teardown the existing ctx > > before configuring the new one and unwind to disable contexts in > > vfio_msi_set_block() > > Very unlikely indeed. I agree. > > >> I do agree that it makes vfio_msi_set_vector_signal() more complex > >> and I can remove new_ctx if you find that this is unnecessary after > >> considering the motivations behind its use. > > > > If the goal is to allow the user to swap one eventfd for another, where > > the result will always be the new eventfd on success or the old eventfd > > on error, I don't see that this code does that, or that we've ever > > attempted to make such a guarantee. If the ioctl errors, I think the > > eventfds are generally deconfigured. We certainly have the unwind code > > that we discussed earlier that deconfigures all the vectors previously > > touched in the loop (which seems to be another path where we could > > de-allocate from the set of initial ctxs). > > Thank you for your patience in hearing and addressing my concerns. I plan > to remove new_ctx in the next version. > > >>> devices supporting vdev->has_dyn_msix only ever have active contexts > >>> allocated? Thanks, > >> > >> What do you see as an "active context"? A policy that is currently enforced > >> is that an allocated context always has an allocated interrupt associated > >> with it. I do not see how this could be expanded to also require an > >> enabled interrupt because interrupt enabling requires a trigger that > >> may not be available. > > > > A context is essentially meant to track a trigger, ie. an eventfd > > provided by the user. In the static case all the irqs are necessarily > > pre-allocated, therefore we had no reason to consider a dynamic array > > for the contexts. However, a given context is really only "active" if > > it has a trigger, otherwise it's just a placeholder. When the > > placeholder is filled by an eventfd, the pre-allocated irq is enabled. > > I see. > > > > > This proposal seems to be a hybrid approach, pre-allocating some > > initial set of irqs and contexts and expecting the differentiation to > > occur only when new vectors are added, though we have some disagreement > > about this per above. Unfortunately I don't see an API to enable MSI-X > > without some vectors, so some pre-allocation of irqs seems to be > > required regardless. > > Right. pci_alloc_irq_vectors() or equivalent continues to be needed to > enable MSI-X. Even so, it does seem possible (within vfio_msi_enable()) > to just allocate one vector using pci_alloc_irq_vectors() > and then immediately free it using pci_msix_free_irq(). What do you think? QEMU does something similar but I think it can really only be described as a hack. In this case I think we can work with them being allocated since that's essentially the static path. > If I understand correctly this can be done without allocating any context > and leave MSI-X enabled without any interrupts allocated. This could be a > way to accomplish the "active context" policy for dynamic allocation. > This is not a policy that can be applied broadly to interrupt contexts though > because MSI and non-dynamic MSI-X could still have contexts with allocated > interrupts without eventfd. I think we could come up with wrappers that handle all cases, for example: int vfio_pci_alloc_irq(struct vfio_pci_core_device *vdev, unsigned int vector, int irq_type) { struct pci_dev *pdev = vdev->pdev; struct msi_map map; int irq; if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) return pdev->irq ?: -EINVAL; irq = pci_irq_vector(pdev, vector); if (irq > 0 || irq_type == VFIO_PCI_MSI_IRQ_INDEX || !vdev->has_dyn_msix) return irq; map = pci_msix_alloc_irq_at(pdev, vector, NULL); return map.index; } void vfio_pci_free_irq(struct vfio_pci_core_device *vdev, unsigned in vector, int irq_type) { struct msi_map map; int irq; if (irq_type != VFIO_PCI_INTX_MSIX_INDEX || !vdev->has_dyn_msix) return; irq = pci_irq_vector(pdev, vector); map = { .index = vector, .virq = irq }; if (WARN_ON(irq < 0)) return; pci_msix_free_irq(pdev, msix_map); } At that point, maybe we'd check whether it makes sense to embed the irq alloc/free within the ctx alloc/free. > > But if non-active contexts were only placeholders in the pre-dynamic > > world and we now manage them via a dynamic array, why is there any > > pre-allocation of contexts without knowing the nature of the eventfd to > > fill it? We could have more commonality between cases if contexts are > > always dynamically allocated, which might simplify differentiation of > > the has_dyn_msix cases largely to wrappers allocating and freeing irqs. > > Thanks, > > Thank you very much for your guidance. I will digest this some more and > see how wrappers could be used. In the mean time while trying to think how > to unify this code I do think there is an issue in this patch in that > the get_cached_msi_msg()/pci_write_msi_msg() > should not be in an else branch. > > Specifically, I think it needs to be: > if (msix) { > if (irq == -EINVAL) { > /* dynamically allocate interrupt */ > } > get_cached_msi_msg(irq, &msg); > pci_write_msi_msg(irq, &msg); > } Yes, that's looked wrong to me all along, I think that resolves it. Thanks, Alex
> From: Alex Williamson <alex.williamson@redhat.com> > On Fri, 31 Mar 2023 10:02:32 +0000 > "Liu, Jing2" <jing2.liu@intel.com> wrote: > > > Hi Reinette, > > > > > @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct > > > vfio_pci_core_device *vdev, { > > > struct pci_dev *pdev = vdev->pdev; > > > struct vfio_pci_irq_ctx *ctx; > > > + struct msi_map msix_map = {}; > > > + bool allow_dyn_alloc = false; > > > struct eventfd_ctx *trigger; > > > + bool new_ctx = false; > > > int irq, ret; > > > u16 cmd; > > > > > > + /* Only MSI-X allows dynamic allocation. */ > > > + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) > > > + allow_dyn_alloc = true; > > > + > > > ctx = vfio_irq_ctx_get(vdev, vector); > > > - if (!ctx) > > > + if (!ctx && !allow_dyn_alloc) > > > return -EINVAL; > > > + > > > irq = pci_irq_vector(pdev, vector); > > > + /* Context and interrupt are always allocated together. */ > > > + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); > > > > > > - if (ctx->trigger) { > > > + if (ctx && ctx->trigger) { > > > irq_bypass_unregister_producer(&ctx->producer); > > > > > > cmd = vfio_pci_memory_lock_and_enable(vdev); > > > free_irq(irq, ctx->trigger); > > > + if (allow_dyn_alloc) { > > > + msix_map.index = vector; > > > + msix_map.virq = irq; > > > + pci_msix_free_irq(pdev, msix_map); > > > + irq = -EINVAL; > > > + } > > > vfio_pci_memory_unlock_and_restore(vdev, cmd); > > > kfree(ctx->name); > > > eventfd_ctx_put(ctx->trigger); > > > ctx->trigger = NULL; > > > + if (allow_dyn_alloc) { > > > + vfio_irq_ctx_free(vdev, ctx, vector); > > > + ctx = NULL; > > > + } > > > } > > > > > > if (fd < 0) > > > return 0; > > > > > > > While looking at this piece of code, one thought comes to me: > > It might be possible that userspace comes twice with the same valid fd > > for a specific vector, this vfio code will free the resource in if(ctx > > && ctx->trigger) for the second time, and then re-alloc again for the same fd > given by userspace. > > > > Would that help if we firstly check e.g. ctx->trigger with the given > > valid fd, to see if the trigger is really changed or not? > > It's rather a made-up situation, if userspace wants to avoid bouncing the vector > when the eventfd hasn't changed they can simply test this before calling the ioctl. > Thanks, > > Alex Thank you very much for your answer. The reason is I see Qemu has such behaviors sending the same valid fd for a specific vector twice, when it wants to disable the MSIx (switching fd from kvm bypass to non-bypass). So the right consideration is, userspace should be responsible for avoiding doing the same thing several times, if it doesn't want that. Thanks, Jing
> From: Alex Williamson <alex.williamson@redhat.com> > Sent: Tuesday, April 4, 2023 11:19 AM > > > > Thank you very much for your guidance. I will digest this some more and > > see how wrappers could be used. In the mean time while trying to think > how > > to unify this code I do think there is an issue in this patch in that > > the get_cached_msi_msg()/pci_write_msi_msg() > > should not be in an else branch. > > > > Specifically, I think it needs to be: > > if (msix) { > > if (irq == -EINVAL) { > > /* dynamically allocate interrupt */ > > } > > get_cached_msi_msg(irq, &msg); > > pci_write_msi_msg(irq, &msg); > > } > > Yes, that's looked wrong to me all along, I think that resolves it. > Thanks, > Do you mind elaborating why this change is required? I thought pci_msix_alloc_irq_at() will compose a new msi message to write hence no need to get cached value again in that case...
Hi Alex, On 4/3/2023 8:18 PM, Alex Williamson wrote: > On Mon, 3 Apr 2023 15:50:54 -0700 > Reinette Chatre <reinette.chatre@intel.com> wrote: >> On 4/3/2023 1:22 PM, Alex Williamson wrote: >>> On Mon, 3 Apr 2023 10:31:23 -0700 >>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>> On 3/31/2023 3:24 PM, Alex Williamson wrote: >>>>> On Fri, 31 Mar 2023 10:49:16 -0700 >>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>>>> On 3/30/2023 3:42 PM, Alex Williamson wrote: >>>>>>> On Thu, 30 Mar 2023 16:40:50 -0600 >>>>>>> Alex Williamson <alex.williamson@redhat.com> wrote: >>>>>>> >>>>>>>> On Tue, 28 Mar 2023 14:53:34 -0700 >>>>>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: >>>>>>>> ... >>> If the goal is to allow the user to swap one eventfd for another, where >>> the result will always be the new eventfd on success or the old eventfd >>> on error, I don't see that this code does that, or that we've ever >>> attempted to make such a guarantee. If the ioctl errors, I think the >>> eventfds are generally deconfigured. We certainly have the unwind code >>> that we discussed earlier that deconfigures all the vectors previously >>> touched in the loop (which seems to be another path where we could >>> de-allocate from the set of initial ctxs). >> >> Thank you for your patience in hearing and addressing my concerns. I plan >> to remove new_ctx in the next version. >> >>>>> devices supporting vdev->has_dyn_msix only ever have active contexts >>>>> allocated? Thanks, >>>> >>>> What do you see as an "active context"? A policy that is currently enforced >>>> is that an allocated context always has an allocated interrupt associated >>>> with it. I do not see how this could be expanded to also require an >>>> enabled interrupt because interrupt enabling requires a trigger that >>>> may not be available. >>> >>> A context is essentially meant to track a trigger, ie. an eventfd >>> provided by the user. In the static case all the irqs are necessarily >>> pre-allocated, therefore we had no reason to consider a dynamic array >>> for the contexts. However, a given context is really only "active" if >>> it has a trigger, otherwise it's just a placeholder. When the >>> placeholder is filled by an eventfd, the pre-allocated irq is enabled. >> >> I see. >> >>> >>> This proposal seems to be a hybrid approach, pre-allocating some >>> initial set of irqs and contexts and expecting the differentiation to >>> occur only when new vectors are added, though we have some disagreement >>> about this per above. Unfortunately I don't see an API to enable MSI-X >>> without some vectors, so some pre-allocation of irqs seems to be >>> required regardless. >> >> Right. pci_alloc_irq_vectors() or equivalent continues to be needed to >> enable MSI-X. Even so, it does seem possible (within vfio_msi_enable()) >> to just allocate one vector using pci_alloc_irq_vectors() >> and then immediately free it using pci_msix_free_irq(). What do you think? > > QEMU does something similar but I think it can really only be described > as a hack. In this case I think we can work with them being allocated > since that's essentially the static path. ok. In this case I understand the hybrid approach to be required. Without something (a hack) like this I am not able to see how an "active context" policy can be enforced though. Interrupts allocated during MSI-X enabling may not have eventfd associated and thus cannot adhere to an "active context" policy. I understand from earlier comments that we do not want to track where contexts are allocated so I can only see a way to enforce a policy that a context has an allocated interrupt, but not an enabled interrupt. >> If I understand correctly this can be done without allocating any context >> and leave MSI-X enabled without any interrupts allocated. This could be a >> way to accomplish the "active context" policy for dynamic allocation. >> This is not a policy that can be applied broadly to interrupt contexts though >> because MSI and non-dynamic MSI-X could still have contexts with allocated >> interrupts without eventfd. > > I think we could come up with wrappers that handle all cases, for > example: > > int vfio_pci_alloc_irq(struct vfio_pci_core_device *vdev, > unsigned int vector, int irq_type) > { > struct pci_dev *pdev = vdev->pdev; > struct msi_map map; > int irq; > > if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) > return pdev->irq ?: -EINVAL; > > irq = pci_irq_vector(pdev, vector); > if (irq > 0 || irq_type == VFIO_PCI_MSI_IRQ_INDEX || > !vdev->has_dyn_msix) > return irq; > > map = pci_msix_alloc_irq_at(pdev, vector, NULL); > > return map.index; > } > > void vfio_pci_free_irq(struct vfio_pci_core_device *vdev, > unsigned in vector, int irq_type) > { > struct msi_map map; > int irq; > > if (irq_type != VFIO_PCI_INTX_MSIX_INDEX || > !vdev->has_dyn_msix) > return; > > irq = pci_irq_vector(pdev, vector); > map = { .index = vector, .virq = irq }; > > if (WARN_ON(irq < 0)) > return; > > pci_msix_free_irq(pdev, msix_map); > } Thank you very much for taking the time to write this out. I am not able to see where vfio_pci_alloc_irq()/vfio_pci_free_irq() would be called for an INTx interrupt. Is the INTx handling there for robustness or am I missing how it should be used for INTx interrupts? > At that point, maybe we'd check whether it makes sense to embed the irq > alloc/free within the ctx alloc/free. I think doing so would be the right thing to do since it helps to enforce the policy that interrupts and contexts are allocated together. I think this can be done when switching around the initialization within vfio_msi_set_vector_signal(). I need to look into this more. >>> But if non-active contexts were only placeholders in the pre-dynamic >>> world and we now manage them via a dynamic array, why is there any >>> pre-allocation of contexts without knowing the nature of the eventfd to >>> fill it? We could have more commonality between cases if contexts are >>> always dynamically allocated, which might simplify differentiation of >>> the has_dyn_msix cases largely to wrappers allocating and freeing irqs. >>> Thanks, >> >> Thank you very much for your guidance. I will digest this some more and >> see how wrappers could be used. In the mean time while trying to think how >> to unify this code I do think there is an issue in this patch in that >> the get_cached_msi_msg()/pci_write_msi_msg() >> should not be in an else branch. >> >> Specifically, I think it needs to be: >> if (msix) { >> if (irq == -EINVAL) { >> /* dynamically allocate interrupt */ >> } >> get_cached_msi_msg(irq, &msg); >> pci_write_msi_msg(irq, &msg); >> } > > Yes, that's looked wrong to me all along, I think that resolves it. Thank you very much. Reinette
Hi Kevin, On 4/3/2023 8:51 PM, Tian, Kevin wrote: >> From: Alex Williamson <alex.williamson@redhat.com> >> Sent: Tuesday, April 4, 2023 11:19 AM >>> >>> Thank you very much for your guidance. I will digest this some more and >>> see how wrappers could be used. In the mean time while trying to think >> how >>> to unify this code I do think there is an issue in this patch in that >>> the get_cached_msi_msg()/pci_write_msi_msg() >>> should not be in an else branch. >>> >>> Specifically, I think it needs to be: >>> if (msix) { >>> if (irq == -EINVAL) { >>> /* dynamically allocate interrupt */ >>> } >>> get_cached_msi_msg(irq, &msg); >>> pci_write_msi_msg(irq, &msg); >>> } >> >> Yes, that's looked wrong to me all along, I think that resolves it. >> Thanks, >> > > Do you mind elaborating why this change is required? I thought > pci_msix_alloc_irq_at() will compose a new msi message to write > hence no need to get cached value again in that case... With this change an interrupt allocated via pci_msix_alloc_irq_at() is treated the same as an interrupt allocated via pci_alloc_irq_vectors(). get_cached_msi_msg()/pci_write_msi_msg() is currently called for every allocated interrupt and this snippet intends to maintain this behavior. One flow I considered that made me think this is fixing a bug is as follows: Scenario A (current behavior): - host/user enables vectors 0, 1, 2 ,3 ,4 - kernel allocates all interrupts via pci_alloc_irq_vectors() - get_cached_msi_msg()/pci_write_msi_msg() is called for each interrupt Scenario B (this series): - host/user enables vector 0 - kernel allocates interrupt 0 via pci_alloc_irq_vectors() - get_cached_msi_msg()/pci_write_msi_msg() is called for interrupt 0 - host/user enables vector 1 - kernel allocates interrupt 1 via pci_msix_alloc_irq_at() - get_cached_msi_msg()/pci_write_msi_msg() is NOT called for interrupt 1 /* This seems a bug since host may expect same outcome as in scenario A */ I am not familiar with how the MSI messages are composed though and I surely could have gotten this wrong. I would like to learn more after you considered the motivation for this change. Reinette
On Tue, 4 Apr 2023 09:54:46 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Hi Alex, > > On 4/3/2023 8:18 PM, Alex Williamson wrote: > > On Mon, 3 Apr 2023 15:50:54 -0700 > > Reinette Chatre <reinette.chatre@intel.com> wrote: > >> On 4/3/2023 1:22 PM, Alex Williamson wrote: > >>> On Mon, 3 Apr 2023 10:31:23 -0700 > >>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>> On 3/31/2023 3:24 PM, Alex Williamson wrote: > >>>>> On Fri, 31 Mar 2023 10:49:16 -0700 > >>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>>>> On 3/30/2023 3:42 PM, Alex Williamson wrote: > >>>>>>> On Thu, 30 Mar 2023 16:40:50 -0600 > >>>>>>> Alex Williamson <alex.williamson@redhat.com> wrote: > >>>>>>> > >>>>>>>> On Tue, 28 Mar 2023 14:53:34 -0700 > >>>>>>>> Reinette Chatre <reinette.chatre@intel.com> wrote: > >>>>>>>> > > > ... > > >>> If the goal is to allow the user to swap one eventfd for another, where > >>> the result will always be the new eventfd on success or the old eventfd > >>> on error, I don't see that this code does that, or that we've ever > >>> attempted to make such a guarantee. If the ioctl errors, I think the > >>> eventfds are generally deconfigured. We certainly have the unwind code > >>> that we discussed earlier that deconfigures all the vectors previously > >>> touched in the loop (which seems to be another path where we could > >>> de-allocate from the set of initial ctxs). > >> > >> Thank you for your patience in hearing and addressing my concerns. I plan > >> to remove new_ctx in the next version. > >> > >>>>> devices supporting vdev->has_dyn_msix only ever have active contexts > >>>>> allocated? Thanks, > >>>> > >>>> What do you see as an "active context"? A policy that is currently enforced > >>>> is that an allocated context always has an allocated interrupt associated > >>>> with it. I do not see how this could be expanded to also require an > >>>> enabled interrupt because interrupt enabling requires a trigger that > >>>> may not be available. > >>> > >>> A context is essentially meant to track a trigger, ie. an eventfd > >>> provided by the user. In the static case all the irqs are necessarily > >>> pre-allocated, therefore we had no reason to consider a dynamic array > >>> for the contexts. However, a given context is really only "active" if > >>> it has a trigger, otherwise it's just a placeholder. When the > >>> placeholder is filled by an eventfd, the pre-allocated irq is enabled. > >> > >> I see. > >> > >>> > >>> This proposal seems to be a hybrid approach, pre-allocating some > >>> initial set of irqs and contexts and expecting the differentiation to > >>> occur only when new vectors are added, though we have some disagreement > >>> about this per above. Unfortunately I don't see an API to enable MSI-X > >>> without some vectors, so some pre-allocation of irqs seems to be > >>> required regardless. > >> > >> Right. pci_alloc_irq_vectors() or equivalent continues to be needed to > >> enable MSI-X. Even so, it does seem possible (within vfio_msi_enable()) > >> to just allocate one vector using pci_alloc_irq_vectors() > >> and then immediately free it using pci_msix_free_irq(). What do you think? > > > > QEMU does something similar but I think it can really only be described > > as a hack. In this case I think we can work with them being allocated > > since that's essentially the static path. > > ok. In this case I understand the hybrid approach to be required. Without > something (a hack) like this I am not able to see how an "active context" > policy can be enforced though. Interrupts allocated during MSI-X enabling may > not have eventfd associated and thus cannot adhere to an "active context" policy. I > understand from earlier comments that we do not want to track where contexts > are allocated so I can only see a way to enforce a policy that a context has > an allocated interrupt, but not an enabled interrupt. We're talking about the contexts that we now allocate in the xarray to store the eventfd linkage, right? We need to pre-allocate some irqs both to satisfy the API and to support non-dynamic MSI-X devices, but we don't need to pre-allocate contexts. The logic that I propose below supports lookup of the pre-allocated irqs for all cases and falls back to allocating a new irq only for cases that support it. irqs and contexts aren't exactly 1:1 for the dynamic case due to the artifacts of the API, but the model supports only allocating contexts as they're used, or "active". > >> If I understand correctly this can be done without allocating any context > >> and leave MSI-X enabled without any interrupts allocated. This could be a > >> way to accomplish the "active context" policy for dynamic allocation. > >> This is not a policy that can be applied broadly to interrupt contexts though > >> because MSI and non-dynamic MSI-X could still have contexts with allocated > >> interrupts without eventfd. > > > > I think we could come up with wrappers that handle all cases, for > > example: > > > > int vfio_pci_alloc_irq(struct vfio_pci_core_device *vdev, > > unsigned int vector, int irq_type) > > { > > struct pci_dev *pdev = vdev->pdev; > > struct msi_map map; > > int irq; > > > > if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) > > return pdev->irq ?: -EINVAL; > > > > irq = pci_irq_vector(pdev, vector); > > if (irq > 0 || irq_type == VFIO_PCI_MSI_IRQ_INDEX || > > !vdev->has_dyn_msix) > > return irq; > > > > map = pci_msix_alloc_irq_at(pdev, vector, NULL); > > > > return map.index; > > } > > > > void vfio_pci_free_irq(struct vfio_pci_core_device *vdev, > > unsigned in vector, int irq_type) > > { > > struct msi_map map; > > int irq; > > > > if (irq_type != VFIO_PCI_INTX_MSIX_INDEX || > > !vdev->has_dyn_msix) > > return; > > > > irq = pci_irq_vector(pdev, vector); > > map = { .index = vector, .virq = irq }; > > > > if (WARN_ON(irq < 0)) > > return; > > > > pci_msix_free_irq(pdev, msix_map); > > } > > Thank you very much for taking the time to write this out. I am not able to > see where vfio_pci_alloc_irq()/vfio_pci_free_irq() would be called for > an INTx interrupt. Is the INTx handling there for robustness or am I > missing how it should be used for INTx interrupts? Mostly just trying to illustrate that all interrupt types could be supported, if it doesn't make sense for INTx, drop it. > > At that point, maybe we'd check whether it makes sense to embed the irq > > alloc/free within the ctx alloc/free. > > I think doing so would be the right thing to do since it helps > to enforce the policy that interrupts and contexts are allocated together. > I think this can be done when switching around the initialization within > vfio_msi_set_vector_signal(). I need to look into this more. Interrupts and contexts allocated together would be ideal, but I think given the API it's a reasonable and simple compromise given the non-dynamic support to draw from the initial allocation where we can. Actually, there could be a latency and reliability advantage to hang on to the irq when an eventfd is unset, maybe we should only free irqs on MSI-X teardown and otherwise use the allocated irqs as a cache. Maybe worth thinking about. Thanks, Alex
On Tue, 4 Apr 2023 10:29:14 -0700 Reinette Chatre <reinette.chatre@intel.com> wrote: > Hi Kevin, > > On 4/3/2023 8:51 PM, Tian, Kevin wrote: > >> From: Alex Williamson <alex.williamson@redhat.com> > >> Sent: Tuesday, April 4, 2023 11:19 AM > >>> > >>> Thank you very much for your guidance. I will digest this some more and > >>> see how wrappers could be used. In the mean time while trying to think > >> how > >>> to unify this code I do think there is an issue in this patch in that > >>> the get_cached_msi_msg()/pci_write_msi_msg() > >>> should not be in an else branch. > >>> > >>> Specifically, I think it needs to be: > >>> if (msix) { > >>> if (irq == -EINVAL) { > >>> /* dynamically allocate interrupt */ > >>> } > >>> get_cached_msi_msg(irq, &msg); > >>> pci_write_msi_msg(irq, &msg); > >>> } > >> > >> Yes, that's looked wrong to me all along, I think that resolves it. > >> Thanks, > >> > > > > Do you mind elaborating why this change is required? I thought > > pci_msix_alloc_irq_at() will compose a new msi message to write > > hence no need to get cached value again in that case... > > With this change an interrupt allocated via pci_msix_alloc_irq_at() > is treated the same as an interrupt allocated via pci_alloc_irq_vectors(). > > get_cached_msi_msg()/pci_write_msi_msg() is currently called for > every allocated interrupt and this snippet intends to maintain > this behavior. > > One flow I considered that made me think this is fixing a bug is > as follows: > Scenario A (current behavior): > - host/user enables vectors 0, 1, 2 ,3 ,4 > - kernel allocates all interrupts via pci_alloc_irq_vectors() > - get_cached_msi_msg()/pci_write_msi_msg() is called for each interrupt In this scenario, I think the intention is that there's non-zero time since pci_alloc_irq_vectors() such that a device reset or other manipulation of the vector table may have occurred, therefore we're potentially restoring the programming of the vector table with this get/write. > Scenario B (this series): > - host/user enables vector 0 > - kernel allocates interrupt 0 via pci_alloc_irq_vectors() > - get_cached_msi_msg()/pci_write_msi_msg() is called for interrupt 0 > - host/user enables vector 1 > - kernel allocates interrupt 1 via pci_msix_alloc_irq_at() > - get_cached_msi_msg()/pci_write_msi_msg() is NOT called for interrupt 1 > /* This seems a bug since host may expect same outcome as in scenario A */ > > I am not familiar with how the MSI messages are composed though and I surely > could have gotten this wrong. I would like to learn more after you considered > the motivation for this change. I think Kevin has a point, if it's correct that we do this get/write in order to account for manipulation of the device since we wrote into the vector table via either pci_alloc_irq_vectors() or pci_msix_alloc_irq_at(), then it really only makes sense to do that restore if we haven't allocated the irq and written the vector table immediately prior. Thanks, Alex
Hi Alex, On 4/4/2023 11:43 AM, Alex Williamson wrote: > > I think Kevin has a point, if it's correct that we do this get/write in > order to account for manipulation of the device since we wrote into the > vector table via either pci_alloc_irq_vectors() or > pci_msix_alloc_irq_at(), then it really only makes sense to do that > restore if we haven't allocated the irq and written the vector table > immediately prior. Thanks, > I see. Even so, could it be acceptable to call get_cached_msi_msg()/pci_write_msi_msg() unconditionally (for MSI-X) or should the new [1] vfio_pci_alloc_irq() indicate if a new IRQ was allocated to determine if get_cached_msi_msg()/pci_write_msi_msg() should be run? Reinette [1] https://lore.kernel.org/lkml/20230403211841.0e206b67.alex.williamson@redhat.com/
Hi Alex, On 4/4/2023 11:24 AM, Alex Williamson wrote: > We're talking about the contexts that we now allocate in the xarray to > store the eventfd linkage, right? We need to pre-allocate some irqs > both to satisfy the API and to support non-dynamic MSI-X devices, but > we don't need to pre-allocate contexts. The logic that I propose below > supports lookup of the pre-allocated irqs for all cases and falls back > to allocating a new irq only for cases that support it. irqs and > contexts aren't exactly 1:1 for the dynamic case due to the artifacts > of the API, but the model supports only allocating contexts as they're > used, or "active". Now I understand. Thank you very much for your patience. ... > Interrupts and contexts allocated together would be ideal, but I think > given the API it's a reasonable and simple compromise given the > non-dynamic support to draw from the initial allocation where we can. > Actually, there could be a latency and reliability advantage to hang on > to the irq when an eventfd is unset, maybe we should only free irqs on > MSI-X teardown and otherwise use the allocated irqs as a cache. Maybe > worth thinking about. Thanks, I implemented this change and I think it looks good. Enabling of dynamic MSI-X ended up consisting out of vfio_pci_alloc_irq() you suggested and one more line that uses it. This is because I also made the change to defer freeing irqs to MSI-X teardown and doing so is surely more efficient in the scenario that Jing pointed out. I did not transition the INTx code to "active" contexts - meaning that the interrupt context continues to be allocated at the time INTx is enabled. From what I understand the additional support for mask/unmask requires a context but it does not need to be active. Reinette
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index b3a258e58625..755b752ca17e 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -55,6 +55,13 @@ struct vfio_pci_irq_ctx *vfio_irq_ctx_get(struct vfio_pci_core_device *vdev, return xa_load(&vdev->ctx, index); } +static void vfio_irq_ctx_free(struct vfio_pci_core_device *vdev, + struct vfio_pci_irq_ctx *ctx, unsigned long index) +{ + xa_erase(&vdev->ctx, index); + kfree(ctx); +} + static void vfio_irq_ctx_free_all(struct vfio_pci_core_device *vdev) { struct vfio_pci_irq_ctx *ctx; @@ -409,33 +416,62 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, { struct pci_dev *pdev = vdev->pdev; struct vfio_pci_irq_ctx *ctx; + struct msi_map msix_map = {}; + bool allow_dyn_alloc = false; struct eventfd_ctx *trigger; + bool new_ctx = false; int irq, ret; u16 cmd; + /* Only MSI-X allows dynamic allocation. */ + if (msix && pci_msix_can_alloc_dyn(vdev->pdev)) + allow_dyn_alloc = true; + ctx = vfio_irq_ctx_get(vdev, vector); - if (!ctx) + if (!ctx && !allow_dyn_alloc) return -EINVAL; + irq = pci_irq_vector(pdev, vector); + /* Context and interrupt are always allocated together. */ + WARN_ON((ctx && irq == -EINVAL) || (!ctx && irq != -EINVAL)); - if (ctx->trigger) { + if (ctx && ctx->trigger) { irq_bypass_unregister_producer(&ctx->producer); cmd = vfio_pci_memory_lock_and_enable(vdev); free_irq(irq, ctx->trigger); + if (allow_dyn_alloc) { + msix_map.index = vector; + msix_map.virq = irq; + pci_msix_free_irq(pdev, msix_map); + irq = -EINVAL; + } vfio_pci_memory_unlock_and_restore(vdev, cmd); kfree(ctx->name); eventfd_ctx_put(ctx->trigger); ctx->trigger = NULL; + if (allow_dyn_alloc) { + vfio_irq_ctx_free(vdev, ctx, vector); + ctx = NULL; + } } if (fd < 0) return 0; + if (!ctx) { + ctx = vfio_irq_ctx_alloc_single(vdev, vector); + if (!ctx) + return -ENOMEM; + new_ctx = true; + } + ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-msi%s[%d](%s)", msix ? "x" : "", vector, pci_name(pdev)); - if (!ctx->name) - return -ENOMEM; + if (!ctx->name) { + ret = -ENOMEM; + goto out_free_ctx; + } trigger = eventfd_ctx_fdget(fd); if (IS_ERR(trigger)) { @@ -443,25 +479,38 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, goto out_free_name; } - /* - * The MSIx vector table resides in device memory which may be cleared - * via backdoor resets. We don't allow direct access to the vector - * table so even if a userspace driver attempts to save/restore around - * such a reset it would be unsuccessful. To avoid this, restore the - * cached value of the message prior to enabling. - */ cmd = vfio_pci_memory_lock_and_enable(vdev); if (msix) { - struct msi_msg msg; - - get_cached_msi_msg(irq, &msg); - pci_write_msi_msg(irq, &msg); + if (irq == -EINVAL) { + msix_map = pci_msix_alloc_irq_at(pdev, vector, NULL); + if (msix_map.index < 0) { + vfio_pci_memory_unlock_and_restore(vdev, cmd); + ret = msix_map.index; + goto out_put_eventfd_ctx; + } + irq = msix_map.virq; + } else { + /* + * The MSIx vector table resides in device memory which + * may be cleared via backdoor resets. We don't allow + * direct access to the vector table so even if a + * userspace driver attempts to save/restore around + * such a reset it would be unsuccessful. To avoid + * this, restore the cached value of the message prior + * to enabling. + */ + struct msi_msg msg; + + get_cached_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); + } } ret = request_irq(irq, vfio_msihandler, 0, ctx->name, trigger); - vfio_pci_memory_unlock_and_restore(vdev, cmd); if (ret) - goto out_put_eventfd_ctx; + goto out_free_irq_locked; + + vfio_pci_memory_unlock_and_restore(vdev, cmd); ctx->producer.token = trigger; ctx->producer.irq = irq; @@ -477,11 +526,21 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, return 0; +out_free_irq_locked: + if (allow_dyn_alloc && new_ctx) { + msix_map.index = vector; + msix_map.virq = irq; + pci_msix_free_irq(pdev, msix_map); + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); out_put_eventfd_ctx: eventfd_ctx_put(trigger); out_free_name: kfree(ctx->name); ctx->name = NULL; +out_free_ctx: + if (allow_dyn_alloc && new_ctx) + vfio_irq_ctx_free(vdev, ctx, vector); return ret; }
Recently introduced pci_msix_alloc_irq_at() and pci_msix_free_irq() enables an individual MSI-X index to be allocated and freed after MSI-X enabling. Support dynamic MSI-X if supported by the device. Keep the association between allocated interrupt and vfio interrupt context. Allocate new context together with the new interrupt if no interrupt context exist for an MSI-X interrupt. Similarly, release an interrupt with its context. Signed-off-by: Reinette Chatre <reinette.chatre@intel.com> --- Changes since RFC V1: - Add pointer to interrupt context as function parameter to vfio_irq_ctx_free(). (Alex) - Initialize new_ctx to false. (Dan Carpenter) - Only support dynamic allocation if device supports it. (Alex) drivers/vfio/pci/vfio_pci_intrs.c | 93 +++++++++++++++++++++++++------ 1 file changed, 76 insertions(+), 17 deletions(-)