Message ID | 20231205222026.2108094-1-Felix.Kuehling@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | drm/amdkfd: Fix sparse __rcu annotation warnings | expand |
On 2023-12-05 17:20, Felix Kuehling wrote: > Properly mark kfd_process->ef as __rcu and consistently access it with > rcu_dereference_protected. > > Reported-by: kernel test robot <lkp@intel.com> > Closes: https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ > Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> ping. Christian, would you review this patch, please? Thanks, Felix > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- > 4 files changed, 8 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index f2e920734c98..20cb266dcedd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -314,7 +314,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); > int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo); > > int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, > - struct dma_fence **ef); > + struct dma_fence __rcu **ef); > int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, > struct kfd_vm_fault_info *info); > int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > index 7d91f99acb59..8ba6f6c8363d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > @@ -2806,7 +2806,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) > put_task_struct(usertask); > } > > -static void replace_eviction_fence(struct dma_fence **ef, > +static void replace_eviction_fence(struct dma_fence __rcu **ef, > struct dma_fence *new_ef) > { > struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true > @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct dma_fence **ef, > * 7. Add fence to all PD and PT BOs. > * 8. Unreserve all BOs > */ > -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) > +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef) > { > struct amdkfd_process_info *process_info = info; > struct amdgpu_vm *peer_vm; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 45366b4ca976..5a24097a9f28 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -917,7 +917,7 @@ struct kfd_process { > * fence will be triggered during eviction and new one will be created > * during restore > */ > - struct dma_fence *ef; > + struct dma_fence __rcu *ef; > > /* Work items for evicting and restoring BOs */ > struct delayed_work eviction_work; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 71df51fcc1b0..14b11d61f8dd 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct work_struct *work) > { > struct kfd_process *p = container_of(work, struct kfd_process, > release_work); > + struct dma_fence *ef = rcu_dereference_protected(p->ef, > + kref_read(&p->ref) == 0); > > kfd_process_dequeue_from_all_devices(p); > pqm_uninit(&p->pqm); > @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct work_struct *work) > * destroyed. This allows any BOs to be freed without > * triggering pointless evictions or waiting for fences. > */ > - dma_fence_signal(p->ef); > + dma_fence_signal(ef); > > kfd_process_remove_sysfs(p); > > @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct work_struct *work) > svm_range_list_fini(p); > > kfd_process_destroy_pdds(p); > - dma_fence_put(p->ef); > + dma_fence_put(ef); > > kfd_event_free_process(p); >
Am 07.12.23 um 20:14 schrieb Felix Kuehling: > > On 2023-12-05 17:20, Felix Kuehling wrote: >> Properly mark kfd_process->ef as __rcu and consistently access it with >> rcu_dereference_protected. >> >> Reported-by: kernel test robot <lkp@intel.com> >> Closes: >> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ >> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> > > ping. > > Christian, would you review this patch, please? Looks a bit suspicious, especially the rcu_dereference_protected() use. What is the static checker complaining about in the first place? Regards, Christian. > > Thanks, > Felix > > > >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- >> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- >> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- >> 4 files changed, 8 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> index f2e920734c98..20cb266dcedd 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> @@ -314,7 +314,7 @@ void >> amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); >> int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, >> struct amdgpu_bo *bo); >> int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, >> - struct dma_fence **ef); >> + struct dma_fence __rcu **ef); >> int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, >> struct kfd_vm_fault_info *info); >> int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device >> *adev, int fd, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >> index 7d91f99acb59..8ba6f6c8363d 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >> @@ -2806,7 +2806,7 @@ static void >> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) >> put_task_struct(usertask); >> } >> -static void replace_eviction_fence(struct dma_fence **ef, >> +static void replace_eviction_fence(struct dma_fence __rcu **ef, >> struct dma_fence *new_ef) >> { >> struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true >> @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct >> dma_fence **ef, >> * 7. Add fence to all PD and PT BOs. >> * 8. Unreserve all BOs >> */ >> -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >> dma_fence **ef) >> +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >> dma_fence __rcu **ef) >> { >> struct amdkfd_process_info *process_info = info; >> struct amdgpu_vm *peer_vm; >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> index 45366b4ca976..5a24097a9f28 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> @@ -917,7 +917,7 @@ struct kfd_process { >> * fence will be triggered during eviction and new one will be >> created >> * during restore >> */ >> - struct dma_fence *ef; >> + struct dma_fence __rcu *ef; >> /* Work items for evicting and restoring BOs */ >> struct delayed_work eviction_work; >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> index 71df51fcc1b0..14b11d61f8dd 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct >> work_struct *work) >> { >> struct kfd_process *p = container_of(work, struct kfd_process, >> release_work); >> + struct dma_fence *ef = rcu_dereference_protected(p->ef, >> + kref_read(&p->ref) == 0); >> kfd_process_dequeue_from_all_devices(p); >> pqm_uninit(&p->pqm); >> @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct >> work_struct *work) >> * destroyed. This allows any BOs to be freed without >> * triggering pointless evictions or waiting for fences. >> */ >> - dma_fence_signal(p->ef); >> + dma_fence_signal(ef); >> kfd_process_remove_sysfs(p); >> @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct >> work_struct *work) >> svm_range_list_fini(p); >> kfd_process_destroy_pdds(p); >> - dma_fence_put(p->ef); >> + dma_fence_put(ef); >> kfd_event_free_process(p);
On 2023-12-08 05:11, Christian König wrote: > Am 07.12.23 um 20:14 schrieb Felix Kuehling: >> >> On 2023-12-05 17:20, Felix Kuehling wrote: >>> Properly mark kfd_process->ef as __rcu and consistently access it with >>> rcu_dereference_protected. >>> >>> Reported-by: kernel test robot <lkp@intel.com> >>> Closes: >>> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ >>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> >> >> ping. >> >> Christian, would you review this patch, please? > > Looks a bit suspicious, especially the rcu_dereference_protected() use. > > What is the static checker complaining about in the first place? From https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/: >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: sparse: incompatible types in comparison expression (different address spaces): >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: struct dma_fence [noderef] __rcu * >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: struct dma_fence * ... >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: sparse: incompatible types in comparison expression (different address spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: struct dma_fence [noderef] __rcu * >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: struct dma_fence * >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: sparse: incompatible types in comparison expression (different address spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: struct dma_fence [noderef] __rcu * >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: struct dma_fence * As far as I can tell, the reason is, that I'm using dma_fence_get_rcu_safe and rcu_replace_pointer to get and update kfd_process->ef, without annotating the fence pointers with __rcu. This patch fixes that by marking kfd_process->ef as an __rcu pointer. The only place that was dereferencing it directly was kfd_process_wq_release, where I added rcu_dereference_protected. The condition I'm using here is, that the process ref count is 0 at that point, which means nobody else is referencing the process or this fence pointer at the time. Regards, Felix > > Regards, > Christian. > >> >> Thanks, >> Felix >> >> >> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- >>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- >>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- >>> 4 files changed, 8 insertions(+), 6 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>> index f2e920734c98..20cb266dcedd 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>> @@ -314,7 +314,7 @@ void >>> amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); >>> int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, >>> struct amdgpu_bo *bo); >>> int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, >>> - struct dma_fence **ef); >>> + struct dma_fence __rcu **ef); >>> int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, >>> struct kfd_vm_fault_info *info); >>> int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device >>> *adev, int fd, >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>> index 7d91f99acb59..8ba6f6c8363d 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>> @@ -2806,7 +2806,7 @@ static void >>> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) >>> put_task_struct(usertask); >>> } >>> -static void replace_eviction_fence(struct dma_fence **ef, >>> +static void replace_eviction_fence(struct dma_fence __rcu **ef, >>> struct dma_fence *new_ef) >>> { >>> struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true >>> @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct >>> dma_fence **ef, >>> * 7. Add fence to all PD and PT BOs. >>> * 8. Unreserve all BOs >>> */ >>> -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>> dma_fence **ef) >>> +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>> dma_fence __rcu **ef) >>> { >>> struct amdkfd_process_info *process_info = info; >>> struct amdgpu_vm *peer_vm; >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> index 45366b4ca976..5a24097a9f28 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> @@ -917,7 +917,7 @@ struct kfd_process { >>> * fence will be triggered during eviction and new one will be >>> created >>> * during restore >>> */ >>> - struct dma_fence *ef; >>> + struct dma_fence __rcu *ef; >>> /* Work items for evicting and restoring BOs */ >>> struct delayed_work eviction_work; >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>> index 71df51fcc1b0..14b11d61f8dd 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>> @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct >>> work_struct *work) >>> { >>> struct kfd_process *p = container_of(work, struct kfd_process, >>> release_work); >>> + struct dma_fence *ef = rcu_dereference_protected(p->ef, >>> + kref_read(&p->ref) == 0); >>> kfd_process_dequeue_from_all_devices(p); >>> pqm_uninit(&p->pqm); >>> @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct >>> work_struct *work) >>> * destroyed. This allows any BOs to be freed without >>> * triggering pointless evictions or waiting for fences. >>> */ >>> - dma_fence_signal(p->ef); >>> + dma_fence_signal(ef); >>> kfd_process_remove_sysfs(p); >>> @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct >>> work_struct *work) >>> svm_range_list_fini(p); >>> kfd_process_destroy_pdds(p); >>> - dma_fence_put(p->ef); >>> + dma_fence_put(ef); >>> kfd_event_free_process(p); >
On 2023-12-11 10:56, Felix Kuehling wrote: > > On 2023-12-08 05:11, Christian König wrote: >> Am 07.12.23 um 20:14 schrieb Felix Kuehling: >>> >>> On 2023-12-05 17:20, Felix Kuehling wrote: >>>> Properly mark kfd_process->ef as __rcu and consistently access it with >>>> rcu_dereference_protected. >>>> >>>> Reported-by: kernel test robot <lkp@intel.com> >>>> Closes: >>>> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ >>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> >>> >>> ping. >>> >>> Christian, would you review this patch, please? >> >> Looks a bit suspicious, especially the rcu_dereference_protected() use. >> >> What is the static checker complaining about in the first place? > From > https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/: > >>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: >>> sparse: incompatible types in comparison expression (different >>> address spaces): >> >>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: > struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: > struct dma_fence * ... >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > sparse: incompatible types in comparison expression (different address > spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: > sparse: struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > struct dma_fence * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > sparse: incompatible types in comparison expression (different address > spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: > sparse: struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > struct dma_fence * > > As far as I can tell, the reason is, that I'm using > dma_fence_get_rcu_safe and rcu_replace_pointer to get and update > kfd_process->ef, without annotating the fence pointers with __rcu. > This patch fixes that by marking kfd_process->ef as an __rcu pointer. > The only place that was dereferencing it directly was > kfd_process_wq_release, where I added rcu_dereference_protected. The > condition I'm using here is, that the process ref count is 0 at that > point, which means nobody else is referencing the process or this > fence pointer at the time. Hi Christian, We discussed offline that you think rcu_dereference_protected is not needed in the teardown function. After reading over rcupdate.h, I think a simpler alternative would be to use rcu_access_pointer, after a grace period to ensure there can be no more readers. Based on this comment in rcupdate.h: * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. The last sentence sounds like rcu_dereference_protected should also be OK, though. Either way, it sounds like I need to add a synchronize_rcu call in any case, before freeing the fence. Do you agree with this proposal? Regards, Felix > > Regards, > Felix > > >> >> Regards, >> Christian. >> >>> >>> Thanks, >>> Felix >>> >>> >>> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- >>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- >>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- >>>> 4 files changed, 8 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> index f2e920734c98..20cb266dcedd 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> @@ -314,7 +314,7 @@ void >>>> amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); >>>> int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, >>>> struct amdgpu_bo *bo); >>>> int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, >>>> - struct dma_fence **ef); >>>> + struct dma_fence __rcu **ef); >>>> int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device >>>> *adev, >>>> struct kfd_vm_fault_info *info); >>>> int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device >>>> *adev, int fd, >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> index 7d91f99acb59..8ba6f6c8363d 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> @@ -2806,7 +2806,7 @@ static void >>>> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) >>>> put_task_struct(usertask); >>>> } >>>> -static void replace_eviction_fence(struct dma_fence **ef, >>>> +static void replace_eviction_fence(struct dma_fence __rcu **ef, >>>> struct dma_fence *new_ef) >>>> { >>>> struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true >>>> @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct >>>> dma_fence **ef, >>>> * 7. Add fence to all PD and PT BOs. >>>> * 8. Unreserve all BOs >>>> */ >>>> -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>> dma_fence **ef) >>>> +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>> dma_fence __rcu **ef) >>>> { >>>> struct amdkfd_process_info *process_info = info; >>>> struct amdgpu_vm *peer_vm; >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> index 45366b4ca976..5a24097a9f28 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> @@ -917,7 +917,7 @@ struct kfd_process { >>>> * fence will be triggered during eviction and new one will >>>> be created >>>> * during restore >>>> */ >>>> - struct dma_fence *ef; >>>> + struct dma_fence __rcu *ef; >>>> /* Work items for evicting and restoring BOs */ >>>> struct delayed_work eviction_work; >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> index 71df51fcc1b0..14b11d61f8dd 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>> { >>>> struct kfd_process *p = container_of(work, struct kfd_process, >>>> release_work); >>>> + struct dma_fence *ef = rcu_dereference_protected(p->ef, >>>> + kref_read(&p->ref) == 0); >>>> kfd_process_dequeue_from_all_devices(p); >>>> pqm_uninit(&p->pqm); >>>> @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>> * destroyed. This allows any BOs to be freed without >>>> * triggering pointless evictions or waiting for fences. >>>> */ >>>> - dma_fence_signal(p->ef); >>>> + dma_fence_signal(ef); >>>> kfd_process_remove_sysfs(p); >>>> @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>> svm_range_list_fini(p); >>>> kfd_process_destroy_pdds(p); >>>> - dma_fence_put(p->ef); >>>> + dma_fence_put(ef); >>>> kfd_event_free_process(p); >> >
Am 20.12.23 um 17:58 schrieb Felix Kuehling: > On 2023-12-11 10:56, Felix Kuehling wrote: >> >> On 2023-12-08 05:11, Christian König wrote: >>> Am 07.12.23 um 20:14 schrieb Felix Kuehling: >>>> >>>> On 2023-12-05 17:20, Felix Kuehling wrote: >>>>> Properly mark kfd_process->ef as __rcu and consistently access it >>>>> with >>>>> rcu_dereference_protected. >>>>> >>>>> Reported-by: kernel test robot <lkp@intel.com> >>>>> Closes: >>>>> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ >>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> >>>> >>>> ping. >>>> >>>> Christian, would you review this patch, please? >>> >>> Looks a bit suspicious, especially the rcu_dereference_protected() use. >>> >>> What is the static checker complaining about in the first place? >> From >> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/: >> >>>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: >>>> sparse: incompatible types in comparison expression (different >>>> address spaces): >> >>>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: >> struct dma_fence [noderef] __rcu * >> >> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: >> struct dma_fence * ... >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> sparse: incompatible types in comparison expression (different >> address spaces): >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> struct dma_fence [noderef] __rcu * >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> struct dma_fence * >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> sparse: incompatible types in comparison expression (different >> address spaces): >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> struct dma_fence [noderef] __rcu * >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: >> struct dma_fence * >> >> As far as I can tell, the reason is, that I'm using >> dma_fence_get_rcu_safe and rcu_replace_pointer to get and update >> kfd_process->ef, without annotating the fence pointers with __rcu. >> This patch fixes that by marking kfd_process->ef as an __rcu pointer. >> The only place that was dereferencing it directly was >> kfd_process_wq_release, where I added rcu_dereference_protected. The >> condition I'm using here is, that the process ref count is 0 at that >> point, which means nobody else is referencing the process or this >> fence pointer at the time. > > Hi Christian, > > We discussed offline that you think rcu_dereference_protected is not > needed in the teardown function. After reading over rcupdate.h, I > think a simpler alternative would be to use rcu_access_pointer, after > a grace period to ensure there can be no more readers. Based on this > comment in rcupdate.h: > > * It is also permissible to use rcu_access_pointer() when read-side > * access to the pointer was removed at least one grace period ago, as is > * the case in the context of the RCU callback that is freeing up the data, > * or after a synchronize_rcu() returns. This can be useful when tearing > * down multi-linked structures after a grace period has elapsed. However, > * rcu_dereference_protected() is normally preferred for this use case. > > The last sentence sounds like rcu_dereference_protected should also be > OK, though. Either way, it sounds like I need to add a synchronize_rcu > call in any case, before freeing the fence. Do you agree with this > proposal? > Yeah, completely agree. I think the reference to using rcu_dereference_protected() is when you protect the pointer with some lock, which isn't the case here. And the question is who is accessing this pointer? If you can guarantee that there are no more readers you don't need an synchronize_rcu(), if you can't then yes a grace period is necessary here. Regards, Christian. > Regards, > Felix > > >> >> Regards, >> Felix >> >> >>> >>> Regards, >>> Christian. >>> >>>> >>>> Thanks, >>>> Felix >>>> >>>> >>>> >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- >>>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- >>>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- >>>>> 4 files changed, 8 insertions(+), 6 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>>> index f2e920734c98..20cb266dcedd 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>>> @@ -314,7 +314,7 @@ void >>>>> amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); >>>>> int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, >>>>> struct amdgpu_bo *bo); >>>>> int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, >>>>> - struct dma_fence **ef); >>>>> + struct dma_fence __rcu **ef); >>>>> int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device >>>>> *adev, >>>>> struct kfd_vm_fault_info *info); >>>>> int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device >>>>> *adev, int fd, >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>>> index 7d91f99acb59..8ba6f6c8363d 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>>> @@ -2806,7 +2806,7 @@ static void >>>>> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) >>>>> put_task_struct(usertask); >>>>> } >>>>> -static void replace_eviction_fence(struct dma_fence **ef, >>>>> +static void replace_eviction_fence(struct dma_fence __rcu **ef, >>>>> struct dma_fence *new_ef) >>>>> { >>>>> struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, >>>>> true >>>>> @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct >>>>> dma_fence **ef, >>>>> * 7. Add fence to all PD and PT BOs. >>>>> * 8. Unreserve all BOs >>>>> */ >>>>> -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>>> dma_fence **ef) >>>>> +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>>> dma_fence __rcu **ef) >>>>> { >>>>> struct amdkfd_process_info *process_info = info; >>>>> struct amdgpu_vm *peer_vm; >>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>> index 45366b4ca976..5a24097a9f28 100644 >>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>> @@ -917,7 +917,7 @@ struct kfd_process { >>>>> * fence will be triggered during eviction and new one will >>>>> be created >>>>> * during restore >>>>> */ >>>>> - struct dma_fence *ef; >>>>> + struct dma_fence __rcu *ef; >>>>> /* Work items for evicting and restoring BOs */ >>>>> struct delayed_work eviction_work; >>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>> index 71df51fcc1b0..14b11d61f8dd 100644 >>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>> @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct >>>>> work_struct *work) >>>>> { >>>>> struct kfd_process *p = container_of(work, struct kfd_process, >>>>> release_work); >>>>> + struct dma_fence *ef = rcu_dereference_protected(p->ef, >>>>> + kref_read(&p->ref) == 0); >>>>> kfd_process_dequeue_from_all_devices(p); >>>>> pqm_uninit(&p->pqm); >>>>> @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct >>>>> work_struct *work) >>>>> * destroyed. This allows any BOs to be freed without >>>>> * triggering pointless evictions or waiting for fences. >>>>> */ >>>>> - dma_fence_signal(p->ef); >>>>> + dma_fence_signal(ef); >>>>> kfd_process_remove_sysfs(p); >>>>> @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct >>>>> work_struct *work) >>>>> svm_range_list_fini(p); >>>>> kfd_process_destroy_pdds(p); >>>>> - dma_fence_put(p->ef); >>>>> + dma_fence_put(ef); >>>>> kfd_event_free_process(p); >>> >>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index f2e920734c98..20cb266dcedd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -314,7 +314,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo); int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, - struct dma_fence **ef); + struct dma_fence __rcu **ef); int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev, struct kfd_vm_fault_info *info); int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7d91f99acb59..8ba6f6c8363d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2806,7 +2806,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) put_task_struct(usertask); } -static void replace_eviction_fence(struct dma_fence **ef, +static void replace_eviction_fence(struct dma_fence __rcu **ef, struct dma_fence *new_ef) { struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct dma_fence **ef, * 7. Add fence to all PD and PT BOs. * 8. Unreserve all BOs */ -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef) { struct amdkfd_process_info *process_info = info; struct amdgpu_vm *peer_vm; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 45366b4ca976..5a24097a9f28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -917,7 +917,7 @@ struct kfd_process { * fence will be triggered during eviction and new one will be created * during restore */ - struct dma_fence *ef; + struct dma_fence __rcu *ef; /* Work items for evicting and restoring BOs */ struct delayed_work eviction_work; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 71df51fcc1b0..14b11d61f8dd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct work_struct *work) { struct kfd_process *p = container_of(work, struct kfd_process, release_work); + struct dma_fence *ef = rcu_dereference_protected(p->ef, + kref_read(&p->ref) == 0); kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct work_struct *work) * destroyed. This allows any BOs to be freed without * triggering pointless evictions or waiting for fences. */ - dma_fence_signal(p->ef); + dma_fence_signal(ef); kfd_process_remove_sysfs(p); @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct work_struct *work) svm_range_list_fini(p); kfd_process_destroy_pdds(p); - dma_fence_put(p->ef); + dma_fence_put(ef); kfd_event_free_process(p);
Properly mark kfd_process->ef as __rcu and consistently access it with rcu_dereference_protected. Reported-by: kernel test robot <lkp@intel.com> Closes: https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-)