diff mbox series

[v3] arm64/mm: avoid fixmap race condition when create pud mapping

Message ID 20211216082812.165387-1-jianyong.wu@arm.com (mailing list archive)
State New, archived
Headers show
Series [v3] arm64/mm: avoid fixmap race condition when create pud mapping | expand

Commit Message

Jianyong Wu Dec. 16, 2021, 8:28 a.m. UTC
The 'fixmap' is a global resource and is used recursively by
create pud mapping(), leading to a potential race condition in the
presence of a concurrent call to alloc_init_pud():

kernel_init thread                          virtio-mem workqueue thread
==================                          ===========================

  alloc_init_pud(...)                       alloc_init_pud(...)
  pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
  READ_ONCE(*pudp)
  pud_clear_fixmap(...)
                                            READ_ONCE(*pudp) // CRASH!

As kernel may sleep during creating pud mapping, introduce a mutex lock to
serialise use of the fixmap entries by alloc_init_pud().

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---

Change log:

from v2 to v3:
     change spin lock to mutex lock as kernel may sleep when create pud
map.

 arch/arm64/mm/mmu.c | 7 +++++++
 1 file changed, 7 insertions(+)

Comments

David Hildenbrand Dec. 16, 2021, 3:19 p.m. UTC | #1
On 16.12.21 09:28, Jianyong Wu wrote:
> The 'fixmap' is a global resource and is used recursively by
> create pud mapping(), leading to a potential race condition in the
> presence of a concurrent call to alloc_init_pud():
> 
> kernel_init thread                          virtio-mem workqueue thread
> ==================                          ===========================
> 
>   alloc_init_pud(...)                       alloc_init_pud(...)
>   pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
>   READ_ONCE(*pudp)
>   pud_clear_fixmap(...)
>                                             READ_ONCE(*pudp) // CRASH!
> 
> As kernel may sleep during creating pud mapping, introduce a mutex lock to
> serialise use of the fixmap entries by alloc_init_pud().
> 
> Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
> ---
> 
> Change log:
> 
> from v2 to v3:
>      change spin lock to mutex lock as kernel may sleep when create pud
> map.
> 
>  arch/arm64/mm/mmu.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index acfae9b41cc8..e680a6a8ca40 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -63,6 +63,7 @@ static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
>  static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
>  
>  static DEFINE_SPINLOCK(swapper_pgdir_lock);
> +static DEFINE_MUTEX(fixmap_lock);
>  
>  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
>  {
> @@ -329,6 +330,11 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	}
>  	BUG_ON(p4d_bad(p4d));
>  
> +	/*
> +	 * We only have one fixmap entry per page-table level, so take
> +	 * the fixmap lock until we're done.
> +	 */
> +	mutex_lock(&fixmap_lock);
>  	pudp = pud_set_fixmap_offset(p4dp, addr);
>  	do {
>  		pud_t old_pud = READ_ONCE(*pudp);
> @@ -359,6 +365,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	} while (pudp++, addr = next, addr != end);
>  
>  	pud_clear_fixmap();
> +	mutex_unlock(&fixmap_lock);
>  }
>  
>  static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> 

Hopefully now we get it right and there are not any "special" callpaths :)

Acked-by: David Hildenbrand <david@redhat.com>
Mark Rutland Dec. 17, 2021, 9:30 a.m. UTC | #2
On Thu, Dec 16, 2021 at 04:28:12PM +0800, Jianyong Wu wrote:
> The 'fixmap' is a global resource and is used recursively by
> create pud mapping(), leading to a potential race condition in the
> presence of a concurrent call to alloc_init_pud():
> 
> kernel_init thread                          virtio-mem workqueue thread
> ==================                          ===========================
> 
>   alloc_init_pud(...)                       alloc_init_pud(...)
>   pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
>   READ_ONCE(*pudp)
>   pud_clear_fixmap(...)
>                                             READ_ONCE(*pudp) // CRASH!
> 
> As kernel may sleep during creating pud mapping, introduce a mutex lock to
> serialise use of the fixmap entries by alloc_init_pud().
> 
> Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>

Since there were deadlock issues with the last version, it would be very nice
if we could check this with at least:

* CONFIG_DEBUG_ATOMIC_SLEEP
* CONFIG_PROVE_LOCKING

... so that we can be reasonably certain that we're not introducing some
livelock/deadlock scenario.

Are you able to reproduce the problem for testing, or was this found by
inspection? Do you have any instructions for reproducing the problem? e.g. can
this easily be tested with QEMU?

If you're able to reproduce the issue, it would be nice to have an example
backtrace of when this goes wrong.

Thanks,
Mark.

> ---
> 
> Change log:
> 
> from v2 to v3:
>      change spin lock to mutex lock as kernel may sleep when create pud
> map.
> 
>  arch/arm64/mm/mmu.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index acfae9b41cc8..e680a6a8ca40 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -63,6 +63,7 @@ static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
>  static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
>  
>  static DEFINE_SPINLOCK(swapper_pgdir_lock);
> +static DEFINE_MUTEX(fixmap_lock);
>  
>  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
>  {
> @@ -329,6 +330,11 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	}
>  	BUG_ON(p4d_bad(p4d));
>  
> +	/*
> +	 * We only have one fixmap entry per page-table level, so take
> +	 * the fixmap lock until we're done.
> +	 */
> +	mutex_lock(&fixmap_lock);
>  	pudp = pud_set_fixmap_offset(p4dp, addr);
>  	do {
>  		pud_t old_pud = READ_ONCE(*pudp);
> @@ -359,6 +365,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	} while (pudp++, addr = next, addr != end);
>  
>  	pud_clear_fixmap();
> +	mutex_unlock(&fixmap_lock);
>  }
>  
>  static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> -- 
> 2.17.1
>
Jianyong Wu Dec. 17, 2021, 10:09 a.m. UTC | #3
Hi Mark,

> -----Original Message-----
> From: Mark Rutland <mark.rutland@arm.com>
> Sent: Friday, December 17, 2021 5:31 PM
> To: Jianyong Wu <Jianyong.Wu@arm.com>
> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; will@kernel.org; Anshuman
> Khandual <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> david@redhat.com; quic_qiancai@quicinc.com; ardb@kernel.org; linux-
> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> gshan@redhat.com; Justin He <Justin.He@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Thu, Dec 16, 2021 at 04:28:12PM +0800, Jianyong Wu wrote:
> > The 'fixmap' is a global resource and is used recursively by create
> > pud mapping(), leading to a potential race condition in the presence
> > of a concurrent call to alloc_init_pud():
> >
> > kernel_init thread                          virtio-mem workqueue thread
> > ==================                          ===========================
> >
> >   alloc_init_pud(...)                       alloc_init_pud(...)
> >   pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
> >   READ_ONCE(*pudp)
> >   pud_clear_fixmap(...)
> >                                             READ_ONCE(*pudp) // CRASH!
> >
> > As kernel may sleep during creating pud mapping, introduce a mutex
> > lock to serialise use of the fixmap entries by alloc_init_pud().
> >
> > Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
> 
> Since there were deadlock issues with the last version, it would be very nice
> if we could check this with at least:
> 
> * CONFIG_DEBUG_ATOMIC_SLEEP
> * CONFIG_PROVE_LOCKING
> 
> ... so that we can be reasonably certain that we're not introducing some
> livelock/deadlock scenario.
> 

I enable these 2 configs and test for the current patch. No warning related with this change found. 

> Are you able to reproduce the problem for testing, or was this found by
> inspection? Do you have any instructions for reproducing the problem? e.g.
> can this easily be tested with QEMU?
> 

I test it using Cloud Hypervisor not QEMU. I find the bug when I tested the incoming feature of virtio-mem using Cloud Hypervisor.
I think we can reproduce this bug using QEMU, but as there is no virtio-mem support for the current QEMU, we can only test the ACPI-based memory hotplug. However, I think it's not easy to do and I have not tried that.

In my test: firstly, start a VM and hotplug a large size of memory using virtio-mem and reboot or kexec a new kernel. When the kernel booting, memory hotplugged by virtio-mem will be added within kernel_init. As both of kernel init and memory add thread will update page table, "alloc_pud_init" may be executed concurrently. 

I think it's not easy to reproduce the bug using ACPI based memory hotplug, as we must hotplug memory at the same time of kernel_init to crash with it. 

> If you're able to reproduce the issue, it would be nice to have an example
> backtrace of when this goes wrong.
> 
Yes, this bug occurs when kernel init, the function execute flow is:
-------------------------
kernel_init
  kernel_init_freeable
    ...
      do_initcall
        ...
          module_init [A]

  ...
  mark_readonly
    mark_rodata_ro [B]
-------------------------
[A] can contains memory hotplug init therefore both [A] and [B] can
update page table at the same time and may lead to crash.

Thanks
Jianyong Wu

> Thanks,
> Mark.
> 
> > ---
> >
> > Change log:
> >
> > from v2 to v3:
> >      change spin lock to mutex lock as kernel may sleep when create
> > pud map.
> >
> >  arch/arm64/mm/mmu.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index
> > acfae9b41cc8..e680a6a8ca40 100644
> > --- a/arch/arm64/mm/mmu.c
> > +++ b/arch/arm64/mm/mmu.c
> > @@ -63,6 +63,7 @@ static pmd_t bm_pmd[PTRS_PER_PMD]
> __page_aligned_bss
> > __maybe_unused;  static pud_t bm_pud[PTRS_PER_PUD]
> __page_aligned_bss
> > __maybe_unused;
> >
> >  static DEFINE_SPINLOCK(swapper_pgdir_lock);
> > +static DEFINE_MUTEX(fixmap_lock);
> >
> >  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)  { @@ -329,6 +330,11 @@
> > static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long
> end,
> >  	}
> >  	BUG_ON(p4d_bad(p4d));
> >
> > +	/*
> > +	 * We only have one fixmap entry per page-table level, so take
> > +	 * the fixmap lock until we're done.
> > +	 */
> > +	mutex_lock(&fixmap_lock);
> >  	pudp = pud_set_fixmap_offset(p4dp, addr);
> >  	do {
> >  		pud_t old_pud = READ_ONCE(*pudp);
> > @@ -359,6 +365,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned
> long addr, unsigned long end,
> >  	} while (pudp++, addr = next, addr != end);
> >
> >  	pud_clear_fixmap();
> > +	mutex_unlock(&fixmap_lock);
> >  }
> >
> >  static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> > --
> > 2.17.1
> >
Catalin Marinas Jan. 5, 2022, 6:03 p.m. UTC | #4
On Thu, Dec 16, 2021 at 04:28:12PM +0800, Jianyong Wu wrote:
> The 'fixmap' is a global resource and is used recursively by
> create pud mapping(), leading to a potential race condition in the
> presence of a concurrent call to alloc_init_pud():
> 
> kernel_init thread                          virtio-mem workqueue thread
> ==================                          ===========================
> 
>   alloc_init_pud(...)                       alloc_init_pud(...)
>   pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
>   READ_ONCE(*pudp)
>   pud_clear_fixmap(...)
>                                             READ_ONCE(*pudp) // CRASH!
> 
> As kernel may sleep during creating pud mapping, introduce a mutex lock to
> serialise use of the fixmap entries by alloc_init_pud().
> 
> Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>

I tried to queue this patch but with certain configurations it doesn't
boot under Qemu. Starting from defconfig, update .config with (I had
this in one of my build scripts):

$ ./scripts/config \
		-e DEBUG_KERNEL \
		-e DEBUG_PAGEALLOC \
		-e DEBUG_PAGEALLOC_ENABLE_DEFAULT \
		-e DEBUG_WX \
		-e DEBUG_SET_MODULE_RONX \
		-e DEBUG_ALIGN_RODATA \
		-e ARM64_PTDUMP_DEBUGFS \
		-e DEBUG_OBJECTS \
		-e DEBUG_OBJECTS_FREE \
		-e DEBUG_OBJECTS_TIMERS \
		-e DEBUG_KOBJECT_RELEASE \
		-e DEBUG_LOCKING_API_SELFTESTS \
		-e DEBUG_PREEMPT \
		-e DEBUG_TIMEKEEPING \
		-e DEBUG_VM \
		-e DEBUG_VM_VMACACHE \
		-e DEBUG_VM_RB \
		-e DEBUG_VM_PGFLAGS \
		-e DEBUG_VIRTUAL \
		-e DEBUG_LIST \
		-e DEBUG_PI_LIST \
		-e DEBUG_SG \
		-e PROVE_LOCKING \
		-e DEBUG_RT_MUTEXES \
		-e DEBUG_ATOMIC_SLEEP \
		-e ATOMIC64_SELFTEST

It stop after exiting the EFI boot services. I did not have time to
debug.
Jianyong Wu Jan. 6, 2022, 10:13 a.m. UTC | #5
Hi Catalin,

I test this patch in your way using both EDK2 V2.6 and EDK2 v2.7. it's peculiar that this issue shows up on v2.6 but not on v2.7.
For now, I only find that if "CONFIG_DEBUG_LOCK_ALLOC" is enabled, the kernel boot will hang. However, I can't debug it by printk as this issue happens before pl11 is ready.
I will go on debugging, but very appreciated if someone can give some hints on it.

Thanks
Jianyong

> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: Thursday, January 6, 2022 2:04 AM
> To: Jianyong Wu <Jianyong.Wu@arm.com>
> Cc: will@kernel.org; Anshuman Khandual <Anshuman.Khandual@arm.com>;
> akpm@linux-foundation.org; david@redhat.com; quic_qiancai@quicinc.com;
> ardb@kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; Justin He
> <Justin.He@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Thu, Dec 16, 2021 at 04:28:12PM +0800, Jianyong Wu wrote:
> > The 'fixmap' is a global resource and is used recursively by create
> > pud mapping(), leading to a potential race condition in the presence
> > of a concurrent call to alloc_init_pud():
> >
> > kernel_init thread                          virtio-mem workqueue thread
> > ==================                          ===========================
> >
> >   alloc_init_pud(...)                       alloc_init_pud(...)
> >   pudp = pud_set_fixmap_offset(...)         pudp = pud_set_fixmap_offset(...)
> >   READ_ONCE(*pudp)
> >   pud_clear_fixmap(...)
> >                                             READ_ONCE(*pudp) // CRASH!
> >
> > As kernel may sleep during creating pud mapping, introduce a mutex
> > lock to serialise use of the fixmap entries by alloc_init_pud().
> >
> > Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
> 
> I tried to queue this patch but with certain configurations it doesn't boot
> under Qemu. Starting from defconfig, update .config with (I had this in one
> of my build scripts):
> 
> $ ./scripts/config \
> 		-e DEBUG_KERNEL \
> 		-e DEBUG_PAGEALLOC \
> 		-e DEBUG_PAGEALLOC_ENABLE_DEFAULT \
> 		-e DEBUG_WX \
> 		-e DEBUG_SET_MODULE_RONX \
> 		-e DEBUG_ALIGN_RODATA \
> 		-e ARM64_PTDUMP_DEBUGFS \
> 		-e DEBUG_OBJECTS \
> 		-e DEBUG_OBJECTS_FREE \
> 		-e DEBUG_OBJECTS_TIMERS \
> 		-e DEBUG_KOBJECT_RELEASE \
> 		-e DEBUG_LOCKING_API_SELFTESTS \
> 		-e DEBUG_PREEMPT \
> 		-e DEBUG_TIMEKEEPING \
> 		-e DEBUG_VM \
> 		-e DEBUG_VM_VMACACHE \
> 		-e DEBUG_VM_RB \
> 		-e DEBUG_VM_PGFLAGS \
> 		-e DEBUG_VIRTUAL \
> 		-e DEBUG_LIST \
> 		-e DEBUG_PI_LIST \
> 		-e DEBUG_SG \
> 		-e PROVE_LOCKING \
> 		-e DEBUG_RT_MUTEXES \
> 		-e DEBUG_ATOMIC_SLEEP \
> 		-e ATOMIC64_SELFTEST
> 
> It stop after exiting the EFI boot services. I did not have time to debug.
> 
> --
> Catalin
Catalin Marinas Jan. 6, 2022, 3:56 p.m. UTC | #6
On Thu, Jan 06, 2022 at 10:13:06AM +0000, Jianyong Wu wrote:
> I test this patch in your way using both EDK2 V2.6 and EDK2 v2.7. it's
> peculiar that this issue shows up on v2.6 but not on v2.7.
> For now, I only find that if "CONFIG_DEBUG_LOCK_ALLOC" is enabled, the
> kernel boot will hang. However, I can't debug it by printk as this
> issue happens before pl11 is ready.

I tried earlycon but that doesn't help either.

> I will go on debugging, but very appreciated if someone can give some
> hints on it.

FWIW, passing "nokaslr" on the kernel command line makes it boot (and
this makes debugging harder). That's as far as I've gone.
Jianyong Wu Jan. 7, 2022, 9:10 a.m. UTC | #7
Hi Catalin,

I roughly find the root cause.
 alloc_init_pud will be called at the very beginning of kernel boot in create_mapping_noalloc where no memory allocator is initialized. But lockdep check may need allocate memory. So, kernel take exception when acquire lock.(I have not found the exact code that cause this issue) that's say we may not be able to use a lock so early.

I come up with 2 methods to address it. 
1) skip dead lock check at the very beginning of kernel boot in lockdep code.
2) provided 2 two versions of __create_pgd_mapping, one with lock in it and the other without. There may be no possible of race for memory mapping at the very beginning time of kernel boot, thus we can use the no lock version of __create_pgd_mapping safely.
In my test, this issue is gone if there is no lock held in create_mapping_noalloc. I think create_mapping_noalloc is called early enough to avoid the race conditions of memory mapping, however, I have not proved it.

For now, I prefer 2).
The rough change for method 2:
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8..3d3c910f446b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -63,6 +63,7 @@ static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;

 static DEFINE_SPINLOCK(swapper_pgdir_lock);
+static DEFINE_MUTEX(fixmap_lock);

 void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -381,6 +382,41 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
        addr = virt & PAGE_MASK;
        end = PAGE_ALIGN(virt + size);

+       do {
+               next = pgd_addr_end(addr, end);
+               /*
+                * fixmap is used inside of alloc_init_pud, but we only have
+                * one fixmap entry per page-table level, so take the fixmap
+                * lock until we're done.
+                */
+               mutex_lock(&fixmap_lock);
+               alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
+                              flags);
+               mutex_unlock(&fixmap_lock);
+               phys += next - addr;
+       } while (pgdp++, addr = next, addr != end);
+}
+
+static void __create_pgd_mapping_nolock(pgd_t *pgdir, phys_addr_t phys,
+                                unsigned long virt, phys_addr_t size,
+                                pgprot_t prot,
+                                phys_addr_t (*pgtable_alloc)(int),
+                                int flags)
+{
+       unsigned long addr, end, next;
+       pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
+
+       /*
+        * If the virtual and physical address don't have the same offset
+        * within a page, we cannot map the region as the caller expects.
+        */
+       if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
+               return;
+
+       phys &= PAGE_MASK;
+       addr = virt & PAGE_MASK;
+       end = PAGE_ALIGN(virt + size);
+
        do {
                next = pgd_addr_end(addr, end);
                alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
@@ -432,7 +468,10 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                        &phys, virt);
                return;
        }
-       __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
+       /*
+        * We have no need to hold a lock at this very beginning.
+        */
+       __create_pgd_mapping_nolock(init_mm.pgd, phys, virt, size, prot, NULL,
                             NO_CONT_MAPPINGS);
 }

WDYT?

Thanks
Jianyong

> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: Thursday, January 6, 2022 11:57 PM
> To: Jianyong Wu <Jianyong.Wu@arm.com>
> Cc: will@kernel.org; Anshuman Khandual <Anshuman.Khandual@arm.com>;
> akpm@linux-foundation.org; david@redhat.com; quic_qiancai@quicinc.com;
> ardb@kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; Justin He
> <Justin.He@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Thu, Jan 06, 2022 at 10:13:06AM +0000, Jianyong Wu wrote:
> > I test this patch in your way using both EDK2 V2.6 and EDK2 v2.7. it's
> > peculiar that this issue shows up on v2.6 but not on v2.7.
> > For now, I only find that if "CONFIG_DEBUG_LOCK_ALLOC" is enabled, the
> > kernel boot will hang. However, I can't debug it by printk as this
> > issue happens before pl11 is ready.
> 
> I tried earlycon but that doesn't help either.
> 
> > I will go on debugging, but very appreciated if someone can give some
> > hints on it.
> 
> FWIW, passing "nokaslr" on the kernel command line makes it boot (and this
> makes debugging harder). That's as far as I've gone.
> 
> --
> Catalin
Catalin Marinas Jan. 7, 2022, 10:42 a.m. UTC | #8
On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> Hi Catalin,
> 
> I roughly find the root cause.
>  alloc_init_pud will be called at the very beginning of kernel boot in create_mapping_noalloc where no memory allocator is initialized. But lockdep check may need allocate memory. So, kernel take exception when acquire lock.(I have not found the exact code that cause this issue) that's say we may not be able to use a lock so early.
> 
> I come up with 2 methods to address it. 
> 1) skip dead lock check at the very beginning of kernel boot in lockdep code.
> 2) provided 2 two versions of __create_pgd_mapping, one with lock in
> it and the other without. There may be no possible of race for memory
> mapping at the very beginning time of kernel boot, thus we can use the
> no lock version of __create_pgd_mapping safely.
> In my test, this issue is gone if there is no lock held in
> create_mapping_noalloc. I think create_mapping_noalloc is called early
> enough to avoid the race conditions of memory mapping, however, I have
> not proved it.

I think method 2 would work better but rather than implementing new
nolock functions I'd add a NO_LOCK flag and check it in
alloc_init_pud() before mutex_lock/unlock. Also add a comment when
passing the NO_LOCK flag on why it's needed and why there wouldn't be
any races at that stage (early boot etc.)

Thanks.
Catalin Marinas Jan. 7, 2022, 10:53 a.m. UTC | #9
On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> I roughly find the root cause.
>  alloc_init_pud will be called at the very beginning of kernel boot in
>  create_mapping_noalloc where no memory allocator is initialized. But
>  lockdep check may need allocate memory. So, kernel take exception
>  when acquire lock.(I have not found the exact code that cause this
>  issue) that's say we may not be able to use a lock so early.

I couldn't find an slab or page allocation place either. It would be
nice to get to the root cause rather than just avoiding the mutex on the
early boot path.
Jia He Jan. 26, 2022, 4:20 a.m. UTC | #10
Hi Catalin

> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: Friday, January 7, 2022 6:43 PM
> To: Jianyong Wu <Jianyong.Wu@arm.com>
> Cc: will@kernel.org; Anshuman Khandual <Anshuman.Khandual@arm.com>;
> akpm@linux-foundation.org; david@redhat.com; quic_qiancai@quicinc.com;
> ardb@kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; Justin He
> <Justin.He@arm.com>; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> > Hi Catalin,
> >
> > I roughly find the root cause.
> >  alloc_init_pud will be called at the very beginning of kernel boot in
> create_mapping_noalloc where no memory allocator is initialized. But
> lockdep check may need allocate memory. So, kernel take exception when
> acquire lock.(I have not found the exact code that cause this issue)
> that's say we may not be able to use a lock so early.
> >
> > I come up with 2 methods to address it.
> > 1) skip dead lock check at the very beginning of kernel boot in lockdep
> code.
> > 2) provided 2 two versions of __create_pgd_mapping, one with lock in
> > it and the other without. There may be no possible of race for memory
> > mapping at the very beginning time of kernel boot, thus we can use the
> > no lock version of __create_pgd_mapping safely.
> > In my test, this issue is gone if there is no lock held in
> > create_mapping_noalloc. I think create_mapping_noalloc is called early
> > enough to avoid the race conditions of memory mapping, however, I have
> > not proved it.
> 
> I think method 2 would work better but rather than implementing new
> nolock functions I'd add a NO_LOCK flag and check it in
> alloc_init_pud() before mutex_lock/unlock. Also add a comment when
> passing the NO_LOCK flag on why it's needed and why there wouldn't be
> any races at that stage (early boot etc.)
> 
The problematic code path is:
__primary_switched
	early_fdt_map->fixmap_remap_fdt
		create_mapping_noalloc->alloc_init_pud
			mutex_lock (with Jianyong's patch)

The problem seems to be that we will clear BSS segment twice if kaslr
is enabled. Hence, some of the static variables in lockdep init process were
messed up. That is to said, with kaslr enabled we might initialize lockdep
twice if we add mutex_lock/unlock in alloc_init_pud().

In other ways, if we invoke mutex_lock/unlock in such a early booting stage.
It might be unsafe because lockdep inserts lock_acquire/release as the complex
hooks.

In summary, would it better if Jianyong splits these early boot and late boot
case? e.g. introduce a nolock version for create_mapping_noalloc().

What do you think of it?

--
Cheers,
Justin (Jia He)
Ard Biesheuvel Jan. 26, 2022, 8:36 a.m. UTC | #11
On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
>
> Hi Catalin
>
> > -----Original Message-----
> > From: Catalin Marinas <catalin.marinas@arm.com>
> > Sent: Friday, January 7, 2022 6:43 PM
> > To: Jianyong Wu <Jianyong.Wu@arm.com>
> > Cc: will@kernel.org; Anshuman Khandual <Anshuman.Khandual@arm.com>;
> > akpm@linux-foundation.org; david@redhat.com; quic_qiancai@quicinc.com;
> > ardb@kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> > kernel@lists.infradead.org; gshan@redhat.com; Justin He
> > <Justin.He@arm.com>; nd <nd@arm.com>
> > Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> > pud mapping
> >
> > On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> > > Hi Catalin,
> > >
> > > I roughly find the root cause.
> > >  alloc_init_pud will be called at the very beginning of kernel boot in
> > create_mapping_noalloc where no memory allocator is initialized. But
> > lockdep check may need allocate memory. So, kernel take exception when
> > acquire lock.(I have not found the exact code that cause this issue)
> > that's say we may not be able to use a lock so early.
> > >
> > > I come up with 2 methods to address it.
> > > 1) skip dead lock check at the very beginning of kernel boot in lockdep
> > code.
> > > 2) provided 2 two versions of __create_pgd_mapping, one with lock in
> > > it and the other without. There may be no possible of race for memory
> > > mapping at the very beginning time of kernel boot, thus we can use the
> > > no lock version of __create_pgd_mapping safely.
> > > In my test, this issue is gone if there is no lock held in
> > > create_mapping_noalloc. I think create_mapping_noalloc is called early
> > > enough to avoid the race conditions of memory mapping, however, I have
> > > not proved it.
> >
> > I think method 2 would work better but rather than implementing new
> > nolock functions I'd add a NO_LOCK flag and check it in
> > alloc_init_pud() before mutex_lock/unlock. Also add a comment when
> > passing the NO_LOCK flag on why it's needed and why there wouldn't be
> > any races at that stage (early boot etc.)
> >
> The problematic code path is:
> __primary_switched
>         early_fdt_map->fixmap_remap_fdt
>                 create_mapping_noalloc->alloc_init_pud
>                         mutex_lock (with Jianyong's patch)
>
> The problem seems to be that we will clear BSS segment twice if kaslr
> is enabled. Hence, some of the static variables in lockdep init process were
> messed up. That is to said, with kaslr enabled we might initialize lockdep
> twice if we add mutex_lock/unlock in alloc_init_pud().
>

Thanks for tracking that down.

Note that clearing the BSS twice is not the root problem here. The
root problem is that we set global state while the kernel runs at the
default link time address, and then refer to it again after the entire
kernel has been shifted in the kernel VA space. Such global state
could consist of mutable pointers to statically allocated data (which
would be reset to their default values after the relocation code runs
again), or global pointer variables in BSS. In either case, relying on
such a global variable after the second relocation performed by KASLR
would be risky, and so we should avoid manipulating global state at
all if it might involve pointer to statically allocated data
structures.

> In other ways, if we invoke mutex_lock/unlock in such a early booting stage.
> It might be unsafe because lockdep inserts lock_acquire/release as the complex
> hooks.
>
> In summary, would it better if Jianyong splits these early boot and late boot
> case? e.g. introduce a nolock version for create_mapping_noalloc().
>
> What do you think of it?
>

The pre-KASLR case definitely doesn't need a lock. But given that
create_mapping_noalloc() is only used to map the FDT, which happens
very early one way or the other, wouldn't it be better to move the
lock/unlock into other callers of __create_pgd_mapping()? (and make
sure no other users of the fixmap slots exist)
Jianyong Wu Jan. 26, 2022, 10:09 a.m. UTC | #12
Hi Ard,

> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Wednesday, January 26, 2022 4:37 PM
> To: Justin He <Justin.He@arm.com>
> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> david@redhat.com; quic_qiancai@quicinc.com; linux-
> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> gshan@redhat.com; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
> >
> > Hi Catalin
> >
> > > -----Original Message-----
> > > From: Catalin Marinas <catalin.marinas@arm.com>
> > > Sent: Friday, January 7, 2022 6:43 PM
> > > To: Jianyong Wu <Jianyong.Wu@arm.com>
> > > Cc: will@kernel.org; Anshuman Khandual
> <Anshuman.Khandual@arm.com>;
> > > akpm@linux-foundation.org; david@redhat.com;
> > > quic_qiancai@quicinc.com; ardb@kernel.org;
> > > linux-kernel@vger.kernel.org; linux-arm- kernel@lists.infradead.org;
> > > gshan@redhat.com; Justin He <Justin.He@arm.com>; nd <nd@arm.com>
> > > Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
> > > create pud mapping
> > >
> > > On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> > > > Hi Catalin,
> > > >
> > > > I roughly find the root cause.
> > > >  alloc_init_pud will be called at the very beginning of kernel
> > > > boot in
> > > create_mapping_noalloc where no memory allocator is initialized. But
> > > lockdep check may need allocate memory. So, kernel take exception
> > > when acquire lock.(I have not found the exact code that cause this
> > > issue) that's say we may not be able to use a lock so early.
> > > >
> > > > I come up with 2 methods to address it.
> > > > 1) skip dead lock check at the very beginning of kernel boot in
> > > > lockdep
> > > code.
> > > > 2) provided 2 two versions of __create_pgd_mapping, one with lock
> > > > in it and the other without. There may be no possible of race for
> > > > memory mapping at the very beginning time of kernel boot, thus we
> > > > can use the no lock version of __create_pgd_mapping safely.
> > > > In my test, this issue is gone if there is no lock held in
> > > > create_mapping_noalloc. I think create_mapping_noalloc is called
> > > > early enough to avoid the race conditions of memory mapping,
> > > > however, I have not proved it.
> > >
> > > I think method 2 would work better but rather than implementing new
> > > nolock functions I'd add a NO_LOCK flag and check it in
> > > alloc_init_pud() before mutex_lock/unlock. Also add a comment when
> > > passing the NO_LOCK flag on why it's needed and why there wouldn't
> > > be any races at that stage (early boot etc.)
> > >
> > The problematic code path is:
> > __primary_switched
> >         early_fdt_map->fixmap_remap_fdt
> >                 create_mapping_noalloc->alloc_init_pud
> >                         mutex_lock (with Jianyong's patch)
> >
> > The problem seems to be that we will clear BSS segment twice if kaslr
> > is enabled. Hence, some of the static variables in lockdep init
> > process were messed up. That is to said, with kaslr enabled we might
> > initialize lockdep twice if we add mutex_lock/unlock in alloc_init_pud().
> >
> 
> Thanks for tracking that down.
> 
> Note that clearing the BSS twice is not the root problem here. The root
> problem is that we set global state while the kernel runs at the default link
> time address, and then refer to it again after the entire kernel has been
> shifted in the kernel VA space. Such global state could consist of mutable
> pointers to statically allocated data (which would be reset to their default
> values after the relocation code runs again), or global pointer variables in BSS.
> In either case, relying on such a global variable after the second relocation
> performed by KASLR would be risky, and so we should avoid manipulating
> global state at all if it might involve pointer to statically allocated data
> structures.
> 
> > In other ways, if we invoke mutex_lock/unlock in such a early booting stage.
> > It might be unsafe because lockdep inserts lock_acquire/release as the
> > complex hooks.
> >
> > In summary, would it better if Jianyong splits these early boot and
> > late boot case? e.g. introduce a nolock version for
> create_mapping_noalloc().
> >
> > What do you think of it?
> >
> 
> The pre-KASLR case definitely doesn't need a lock. But given that
> create_mapping_noalloc() is only used to map the FDT, which happens very
> early one way or the other, wouldn't it be better to move the lock/unlock
> into other callers of __create_pgd_mapping()? (and make sure no other
> users of the fixmap slots exist)

There are server callers of __create_pgd_mapping. I think some of them need no fixmap lock as they are called so early. I figure out all of them here:
create_mapping_noalloc:   no lock
create_pgd_mapping:   no lock
__map_memblock:    no lock
map_kernel_segment:  no lock
map_entry_trampoline: no lock
update_mapping_prot:    need lock
arch_add_memory:  need lock

WDYT?

Thanks
Jianyong
Ard Biesheuvel Jan. 26, 2022, 10:12 a.m. UTC | #13
On Wed, 26 Jan 2022 at 11:09, Jianyong Wu <Jianyong.Wu@arm.com> wrote:
>
> Hi Ard,
>
> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Wednesday, January 26, 2022 4:37 PM
> > To: Justin He <Justin.He@arm.com>
> > Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
> > <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
> > <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> > david@redhat.com; quic_qiancai@quicinc.com; linux-
> > kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> > gshan@redhat.com; nd <nd@arm.com>
> > Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> > pud mapping
> >
> > On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
> > >
> > > Hi Catalin
> > >
> > > > -----Original Message-----
> > > > From: Catalin Marinas <catalin.marinas@arm.com>
> > > > Sent: Friday, January 7, 2022 6:43 PM
> > > > To: Jianyong Wu <Jianyong.Wu@arm.com>
> > > > Cc: will@kernel.org; Anshuman Khandual
> > <Anshuman.Khandual@arm.com>;
> > > > akpm@linux-foundation.org; david@redhat.com;
> > > > quic_qiancai@quicinc.com; ardb@kernel.org;
> > > > linux-kernel@vger.kernel.org; linux-arm- kernel@lists.infradead.org;
> > > > gshan@redhat.com; Justin He <Justin.He@arm.com>; nd <nd@arm.com>
> > > > Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
> > > > create pud mapping
> > > >
> > > > On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> > > > > Hi Catalin,
> > > > >
> > > > > I roughly find the root cause.
> > > > >  alloc_init_pud will be called at the very beginning of kernel
> > > > > boot in
> > > > create_mapping_noalloc where no memory allocator is initialized. But
> > > > lockdep check may need allocate memory. So, kernel take exception
> > > > when acquire lock.(I have not found the exact code that cause this
> > > > issue) that's say we may not be able to use a lock so early.
> > > > >
> > > > > I come up with 2 methods to address it.
> > > > > 1) skip dead lock check at the very beginning of kernel boot in
> > > > > lockdep
> > > > code.
> > > > > 2) provided 2 two versions of __create_pgd_mapping, one with lock
> > > > > in it and the other without. There may be no possible of race for
> > > > > memory mapping at the very beginning time of kernel boot, thus we
> > > > > can use the no lock version of __create_pgd_mapping safely.
> > > > > In my test, this issue is gone if there is no lock held in
> > > > > create_mapping_noalloc. I think create_mapping_noalloc is called
> > > > > early enough to avoid the race conditions of memory mapping,
> > > > > however, I have not proved it.
> > > >
> > > > I think method 2 would work better but rather than implementing new
> > > > nolock functions I'd add a NO_LOCK flag and check it in
> > > > alloc_init_pud() before mutex_lock/unlock. Also add a comment when
> > > > passing the NO_LOCK flag on why it's needed and why there wouldn't
> > > > be any races at that stage (early boot etc.)
> > > >
> > > The problematic code path is:
> > > __primary_switched
> > >         early_fdt_map->fixmap_remap_fdt
> > >                 create_mapping_noalloc->alloc_init_pud
> > >                         mutex_lock (with Jianyong's patch)
> > >
> > > The problem seems to be that we will clear BSS segment twice if kaslr
> > > is enabled. Hence, some of the static variables in lockdep init
> > > process were messed up. That is to said, with kaslr enabled we might
> > > initialize lockdep twice if we add mutex_lock/unlock in alloc_init_pud().
> > >
> >
> > Thanks for tracking that down.
> >
> > Note that clearing the BSS twice is not the root problem here. The root
> > problem is that we set global state while the kernel runs at the default link
> > time address, and then refer to it again after the entire kernel has been
> > shifted in the kernel VA space. Such global state could consist of mutable
> > pointers to statically allocated data (which would be reset to their default
> > values after the relocation code runs again), or global pointer variables in BSS.
> > In either case, relying on such a global variable after the second relocation
> > performed by KASLR would be risky, and so we should avoid manipulating
> > global state at all if it might involve pointer to statically allocated data
> > structures.
> >
> > > In other ways, if we invoke mutex_lock/unlock in such a early booting stage.
> > > It might be unsafe because lockdep inserts lock_acquire/release as the
> > > complex hooks.
> > >
> > > In summary, would it better if Jianyong splits these early boot and
> > > late boot case? e.g. introduce a nolock version for
> > create_mapping_noalloc().
> > >
> > > What do you think of it?
> > >
> >
> > The pre-KASLR case definitely doesn't need a lock. But given that
> > create_mapping_noalloc() is only used to map the FDT, which happens very
> > early one way or the other, wouldn't it be better to move the lock/unlock
> > into other callers of __create_pgd_mapping()? (and make sure no other
> > users of the fixmap slots exist)
>
> There are server callers of __create_pgd_mapping. I think some of them need no fixmap lock as they are called so early. I figure out all of them here:
> create_mapping_noalloc:   no lock
> create_pgd_mapping:   no lock
> __map_memblock:    no lock
> map_kernel_segment:  no lock
> map_entry_trampoline: no lock
> update_mapping_prot:    need lock
> arch_add_memory:  need lock
>
> WDYT?
>

That seems reasonable, but it needs to be documented clearly in the code.
David Hildenbrand Jan. 26, 2022, 10:17 a.m. UTC | #14
On 26.01.22 11:12, Ard Biesheuvel wrote:
> On Wed, 26 Jan 2022 at 11:09, Jianyong Wu <Jianyong.Wu@arm.com> wrote:
>>
>> Hi Ard,
>>
>>> -----Original Message-----
>>> From: Ard Biesheuvel <ardb@kernel.org>
>>> Sent: Wednesday, January 26, 2022 4:37 PM
>>> To: Justin He <Justin.He@arm.com>
>>> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
>>> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
>>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
>>> david@redhat.com; quic_qiancai@quicinc.com; linux-
>>> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
>>> gshan@redhat.com; nd <nd@arm.com>
>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
>>> pud mapping
>>>
>>> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
>>>>
>>>> Hi Catalin
>>>>
>>>>> -----Original Message-----
>>>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>>> Sent: Friday, January 7, 2022 6:43 PM
>>>>> To: Jianyong Wu <Jianyong.Wu@arm.com>
>>>>> Cc: will@kernel.org; Anshuman Khandual
>>> <Anshuman.Khandual@arm.com>;
>>>>> akpm@linux-foundation.org; david@redhat.com;
>>>>> quic_qiancai@quicinc.com; ardb@kernel.org;
>>>>> linux-kernel@vger.kernel.org; linux-arm- kernel@lists.infradead.org;
>>>>> gshan@redhat.com; Justin He <Justin.He@arm.com>; nd <nd@arm.com>
>>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
>>>>> create pud mapping
>>>>>
>>>>> On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
>>>>>> Hi Catalin,
>>>>>>
>>>>>> I roughly find the root cause.
>>>>>>  alloc_init_pud will be called at the very beginning of kernel
>>>>>> boot in
>>>>> create_mapping_noalloc where no memory allocator is initialized. But
>>>>> lockdep check may need allocate memory. So, kernel take exception
>>>>> when acquire lock.(I have not found the exact code that cause this
>>>>> issue) that's say we may not be able to use a lock so early.
>>>>>>
>>>>>> I come up with 2 methods to address it.
>>>>>> 1) skip dead lock check at the very beginning of kernel boot in
>>>>>> lockdep
>>>>> code.
>>>>>> 2) provided 2 two versions of __create_pgd_mapping, one with lock
>>>>>> in it and the other without. There may be no possible of race for
>>>>>> memory mapping at the very beginning time of kernel boot, thus we
>>>>>> can use the no lock version of __create_pgd_mapping safely.
>>>>>> In my test, this issue is gone if there is no lock held in
>>>>>> create_mapping_noalloc. I think create_mapping_noalloc is called
>>>>>> early enough to avoid the race conditions of memory mapping,
>>>>>> however, I have not proved it.
>>>>>
>>>>> I think method 2 would work better but rather than implementing new
>>>>> nolock functions I'd add a NO_LOCK flag and check it in
>>>>> alloc_init_pud() before mutex_lock/unlock. Also add a comment when
>>>>> passing the NO_LOCK flag on why it's needed and why there wouldn't
>>>>> be any races at that stage (early boot etc.)
>>>>>
>>>> The problematic code path is:
>>>> __primary_switched
>>>>         early_fdt_map->fixmap_remap_fdt
>>>>                 create_mapping_noalloc->alloc_init_pud
>>>>                         mutex_lock (with Jianyong's patch)
>>>>
>>>> The problem seems to be that we will clear BSS segment twice if kaslr
>>>> is enabled. Hence, some of the static variables in lockdep init
>>>> process were messed up. That is to said, with kaslr enabled we might
>>>> initialize lockdep twice if we add mutex_lock/unlock in alloc_init_pud().
>>>>
>>>
>>> Thanks for tracking that down.
>>>
>>> Note that clearing the BSS twice is not the root problem here. The root
>>> problem is that we set global state while the kernel runs at the default link
>>> time address, and then refer to it again after the entire kernel has been
>>> shifted in the kernel VA space. Such global state could consist of mutable
>>> pointers to statically allocated data (which would be reset to their default
>>> values after the relocation code runs again), or global pointer variables in BSS.
>>> In either case, relying on such a global variable after the second relocation
>>> performed by KASLR would be risky, and so we should avoid manipulating
>>> global state at all if it might involve pointer to statically allocated data
>>> structures.
>>>
>>>> In other ways, if we invoke mutex_lock/unlock in such a early booting stage.
>>>> It might be unsafe because lockdep inserts lock_acquire/release as the
>>>> complex hooks.
>>>>
>>>> In summary, would it better if Jianyong splits these early boot and
>>>> late boot case? e.g. introduce a nolock version for
>>> create_mapping_noalloc().
>>>>
>>>> What do you think of it?
>>>>
>>>
>>> The pre-KASLR case definitely doesn't need a lock. But given that
>>> create_mapping_noalloc() is only used to map the FDT, which happens very
>>> early one way or the other, wouldn't it be better to move the lock/unlock
>>> into other callers of __create_pgd_mapping()? (and make sure no other
>>> users of the fixmap slots exist)
>>
>> There are server callers of __create_pgd_mapping. I think some of them need no fixmap lock as they are called so early. I figure out all of them here:
>> create_mapping_noalloc:   no lock
>> create_pgd_mapping:   no lock
>> __map_memblock:    no lock
>> map_kernel_segment:  no lock
>> map_entry_trampoline: no lock
>> update_mapping_prot:    need lock
>> arch_add_memory:  need lock
>>
>> WDYT?
>>
> 
> That seems reasonable, but it needs to be documented clearly in the code.
> 

Just a random thought, could we rely on system_state to do the locking
conditionally?
Jianyong Wu Jan. 26, 2022, 10:28 a.m. UTC | #15
Hi David,

> -----Original Message-----
> From: David Hildenbrand <david@redhat.com>
> Sent: Wednesday, January 26, 2022 6:18 PM
> To: Ard Biesheuvel <ardb@kernel.org>; Jianyong Wu
> <Jianyong.Wu@arm.com>
> Cc: Justin He <Justin.He@arm.com>; Catalin Marinas
> <Catalin.Marinas@arm.com>; will@kernel.org; Anshuman Khandual
> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On 26.01.22 11:12, Ard Biesheuvel wrote:
> > On Wed, 26 Jan 2022 at 11:09, Jianyong Wu <Jianyong.Wu@arm.com>
> wrote:
> >>
> >> Hi Ard,
> >>
> >>> -----Original Message-----
> >>> From: Ard Biesheuvel <ardb@kernel.org>
> >>> Sent: Wednesday, January 26, 2022 4:37 PM
> >>> To: Justin He <Justin.He@arm.com>
> >>> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
> >>> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
> >>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> >>> david@redhat.com; quic_qiancai@quicinc.com; linux-
> >>> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
> >>> gshan@redhat.com; nd <nd@arm.com>
> >>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
> >>> create pud mapping
> >>>
> >>> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
> >>>>
> >>>> Hi Catalin
> >>>>
> >>>>> -----Original Message-----
> >>>>> From: Catalin Marinas <catalin.marinas@arm.com>
> >>>>> Sent: Friday, January 7, 2022 6:43 PM
> >>>>> To: Jianyong Wu <Jianyong.Wu@arm.com>
> >>>>> Cc: will@kernel.org; Anshuman Khandual
> >>> <Anshuman.Khandual@arm.com>;
> >>>>> akpm@linux-foundation.org; david@redhat.com;
> >>>>> quic_qiancai@quicinc.com; ardb@kernel.org;
> >>>>> linux-kernel@vger.kernel.org; linux-arm-
> >>>>> kernel@lists.infradead.org; gshan@redhat.com; Justin He
> >>>>> <Justin.He@arm.com>; nd <nd@arm.com>
> >>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
> >>>>> create pud mapping
> >>>>>
> >>>>> On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> >>>>>> Hi Catalin,
> >>>>>>
> >>>>>> I roughly find the root cause.
> >>>>>>  alloc_init_pud will be called at the very beginning of kernel
> >>>>>> boot in
> >>>>> create_mapping_noalloc where no memory allocator is initialized.
> >>>>> But lockdep check may need allocate memory. So, kernel take
> >>>>> exception when acquire lock.(I have not found the exact code that
> >>>>> cause this
> >>>>> issue) that's say we may not be able to use a lock so early.
> >>>>>>
> >>>>>> I come up with 2 methods to address it.
> >>>>>> 1) skip dead lock check at the very beginning of kernel boot in
> >>>>>> lockdep
> >>>>> code.
> >>>>>> 2) provided 2 two versions of __create_pgd_mapping, one with lock
> >>>>>> in it and the other without. There may be no possible of race for
> >>>>>> memory mapping at the very beginning time of kernel boot, thus we
> >>>>>> can use the no lock version of __create_pgd_mapping safely.
> >>>>>> In my test, this issue is gone if there is no lock held in
> >>>>>> create_mapping_noalloc. I think create_mapping_noalloc is called
> >>>>>> early enough to avoid the race conditions of memory mapping,
> >>>>>> however, I have not proved it.
> >>>>>
> >>>>> I think method 2 would work better but rather than implementing
> >>>>> new nolock functions I'd add a NO_LOCK flag and check it in
> >>>>> alloc_init_pud() before mutex_lock/unlock. Also add a comment
> when
> >>>>> passing the NO_LOCK flag on why it's needed and why there wouldn't
> >>>>> be any races at that stage (early boot etc.)
> >>>>>
> >>>> The problematic code path is:
> >>>> __primary_switched
> >>>>         early_fdt_map->fixmap_remap_fdt
> >>>>                 create_mapping_noalloc->alloc_init_pud
> >>>>                         mutex_lock (with Jianyong's patch)
> >>>>
> >>>> The problem seems to be that we will clear BSS segment twice if
> >>>> kaslr is enabled. Hence, some of the static variables in lockdep
> >>>> init process were messed up. That is to said, with kaslr enabled we
> >>>> might initialize lockdep twice if we add mutex_lock/unlock in
> alloc_init_pud().
> >>>>
> >>>
> >>> Thanks for tracking that down.
> >>>
> >>> Note that clearing the BSS twice is not the root problem here. The
> >>> root problem is that we set global state while the kernel runs at
> >>> the default link time address, and then refer to it again after the
> >>> entire kernel has been shifted in the kernel VA space. Such global
> >>> state could consist of mutable pointers to statically allocated data
> >>> (which would be reset to their default values after the relocation code
> runs again), or global pointer variables in BSS.
> >>> In either case, relying on such a global variable after the second
> >>> relocation performed by KASLR would be risky, and so we should avoid
> >>> manipulating global state at all if it might involve pointer to
> >>> statically allocated data structures.
> >>>
> >>>> In other ways, if we invoke mutex_lock/unlock in such a early booting
> stage.
> >>>> It might be unsafe because lockdep inserts lock_acquire/release as
> >>>> the complex hooks.
> >>>>
> >>>> In summary, would it better if Jianyong splits these early boot and
> >>>> late boot case? e.g. introduce a nolock version for
> >>> create_mapping_noalloc().
> >>>>
> >>>> What do you think of it?
> >>>>
> >>>
> >>> The pre-KASLR case definitely doesn't need a lock. But given that
> >>> create_mapping_noalloc() is only used to map the FDT, which happens
> >>> very early one way or the other, wouldn't it be better to move the
> >>> lock/unlock into other callers of __create_pgd_mapping()? (and make
> >>> sure no other users of the fixmap slots exist)
> >>
> >> There are server callers of __create_pgd_mapping. I think some of them
> need no fixmap lock as they are called so early. I figure out all of them here:
> >> create_mapping_noalloc:   no lock
> >> create_pgd_mapping:   no lock
> >> __map_memblock:    no lock
> >> map_kernel_segment:  no lock
> >> map_entry_trampoline: no lock
> >> update_mapping_prot:    need lock
> >> arch_add_memory:  need lock
> >>
> >> WDYT?
> >>
> >
> > That seems reasonable, but it needs to be documented clearly in the code.
> >
> 
> Just a random thought, could we rely on system_state to do the locking
> conditionally?

I can't see the point. At the early stages of kernel boot, we definitely need no lock. Also, I think we should keep it simple.

Thanks
Jianyong

> 
> --
> Thanks,
> 
> David / dhildenb
David Hildenbrand Jan. 26, 2022, 10:30 a.m. UTC | #16
On 26.01.22 11:28, Jianyong Wu wrote:
> Hi David,
> 
>> -----Original Message-----
>> From: David Hildenbrand <david@redhat.com>
>> Sent: Wednesday, January 26, 2022 6:18 PM
>> To: Ard Biesheuvel <ardb@kernel.org>; Jianyong Wu
>> <Jianyong.Wu@arm.com>
>> Cc: Justin He <Justin.He@arm.com>; Catalin Marinas
>> <Catalin.Marinas@arm.com>; will@kernel.org; Anshuman Khandual
>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
>> quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org; linux-arm-
>> kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
>> pud mapping
>>
>> On 26.01.22 11:12, Ard Biesheuvel wrote:
>>> On Wed, 26 Jan 2022 at 11:09, Jianyong Wu <Jianyong.Wu@arm.com>
>> wrote:
>>>>
>>>> Hi Ard,
>>>>
>>>>> -----Original Message-----
>>>>> From: Ard Biesheuvel <ardb@kernel.org>
>>>>> Sent: Wednesday, January 26, 2022 4:37 PM
>>>>> To: Justin He <Justin.He@arm.com>
>>>>> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
>>>>> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
>>>>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
>>>>> david@redhat.com; quic_qiancai@quicinc.com; linux-
>>>>> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
>>>>> gshan@redhat.com; nd <nd@arm.com>
>>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
>>>>> create pud mapping
>>>>>
>>>>> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
>>>>>>
>>>>>> Hi Catalin
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>>>>> Sent: Friday, January 7, 2022 6:43 PM
>>>>>>> To: Jianyong Wu <Jianyong.Wu@arm.com>
>>>>>>> Cc: will@kernel.org; Anshuman Khandual
>>>>> <Anshuman.Khandual@arm.com>;
>>>>>>> akpm@linux-foundation.org; david@redhat.com;
>>>>>>> quic_qiancai@quicinc.com; ardb@kernel.org;
>>>>>>> linux-kernel@vger.kernel.org; linux-arm-
>>>>>>> kernel@lists.infradead.org; gshan@redhat.com; Justin He
>>>>>>> <Justin.He@arm.com>; nd <nd@arm.com>
>>>>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
>>>>>>> create pud mapping
>>>>>>>
>>>>>>> On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
>>>>>>>> Hi Catalin,
>>>>>>>>
>>>>>>>> I roughly find the root cause.
>>>>>>>>  alloc_init_pud will be called at the very beginning of kernel
>>>>>>>> boot in
>>>>>>> create_mapping_noalloc where no memory allocator is initialized.
>>>>>>> But lockdep check may need allocate memory. So, kernel take
>>>>>>> exception when acquire lock.(I have not found the exact code that
>>>>>>> cause this
>>>>>>> issue) that's say we may not be able to use a lock so early.
>>>>>>>>
>>>>>>>> I come up with 2 methods to address it.
>>>>>>>> 1) skip dead lock check at the very beginning of kernel boot in
>>>>>>>> lockdep
>>>>>>> code.
>>>>>>>> 2) provided 2 two versions of __create_pgd_mapping, one with lock
>>>>>>>> in it and the other without. There may be no possible of race for
>>>>>>>> memory mapping at the very beginning time of kernel boot, thus we
>>>>>>>> can use the no lock version of __create_pgd_mapping safely.
>>>>>>>> In my test, this issue is gone if there is no lock held in
>>>>>>>> create_mapping_noalloc. I think create_mapping_noalloc is called
>>>>>>>> early enough to avoid the race conditions of memory mapping,
>>>>>>>> however, I have not proved it.
>>>>>>>
>>>>>>> I think method 2 would work better but rather than implementing
>>>>>>> new nolock functions I'd add a NO_LOCK flag and check it in
>>>>>>> alloc_init_pud() before mutex_lock/unlock. Also add a comment
>> when
>>>>>>> passing the NO_LOCK flag on why it's needed and why there wouldn't
>>>>>>> be any races at that stage (early boot etc.)
>>>>>>>
>>>>>> The problematic code path is:
>>>>>> __primary_switched
>>>>>>         early_fdt_map->fixmap_remap_fdt
>>>>>>                 create_mapping_noalloc->alloc_init_pud
>>>>>>                         mutex_lock (with Jianyong's patch)
>>>>>>
>>>>>> The problem seems to be that we will clear BSS segment twice if
>>>>>> kaslr is enabled. Hence, some of the static variables in lockdep
>>>>>> init process were messed up. That is to said, with kaslr enabled we
>>>>>> might initialize lockdep twice if we add mutex_lock/unlock in
>> alloc_init_pud().
>>>>>>
>>>>>
>>>>> Thanks for tracking that down.
>>>>>
>>>>> Note that clearing the BSS twice is not the root problem here. The
>>>>> root problem is that we set global state while the kernel runs at
>>>>> the default link time address, and then refer to it again after the
>>>>> entire kernel has been shifted in the kernel VA space. Such global
>>>>> state could consist of mutable pointers to statically allocated data
>>>>> (which would be reset to their default values after the relocation code
>> runs again), or global pointer variables in BSS.
>>>>> In either case, relying on such a global variable after the second
>>>>> relocation performed by KASLR would be risky, and so we should avoid
>>>>> manipulating global state at all if it might involve pointer to
>>>>> statically allocated data structures.
>>>>>
>>>>>> In other ways, if we invoke mutex_lock/unlock in such a early booting
>> stage.
>>>>>> It might be unsafe because lockdep inserts lock_acquire/release as
>>>>>> the complex hooks.
>>>>>>
>>>>>> In summary, would it better if Jianyong splits these early boot and
>>>>>> late boot case? e.g. introduce a nolock version for
>>>>> create_mapping_noalloc().
>>>>>>
>>>>>> What do you think of it?
>>>>>>
>>>>>
>>>>> The pre-KASLR case definitely doesn't need a lock. But given that
>>>>> create_mapping_noalloc() is only used to map the FDT, which happens
>>>>> very early one way or the other, wouldn't it be better to move the
>>>>> lock/unlock into other callers of __create_pgd_mapping()? (and make
>>>>> sure no other users of the fixmap slots exist)
>>>>
>>>> There are server callers of __create_pgd_mapping. I think some of them
>> need no fixmap lock as they are called so early. I figure out all of them here:
>>>> create_mapping_noalloc:   no lock
>>>> create_pgd_mapping:   no lock
>>>> __map_memblock:    no lock
>>>> map_kernel_segment:  no lock
>>>> map_entry_trampoline: no lock
>>>> update_mapping_prot:    need lock
>>>> arch_add_memory:  need lock
>>>>
>>>> WDYT?
>>>>
>>>
>>> That seems reasonable, but it needs to be documented clearly in the code.
>>>
>>
>> Just a random thought, could we rely on system_state to do the locking
>> conditionally?
> 
> I can't see the point. At the early stages of kernel boot, we definitely need no lock. Also, I think we should keep it simple.
> 

Is e.g.,

if (system_state < SYSTEM_RUNNING)
	/* lock */

if (system_state < SYSTEM_RUNNING)
	/* unlock */

more complicated than checking individual users and eventually getting
it wrong?

> Thanks
> Jianyong
David Hildenbrand Jan. 26, 2022, 10:31 a.m. UTC | #17
On 26.01.22 11:30, David Hildenbrand wrote:
> On 26.01.22 11:28, Jianyong Wu wrote:
>> Hi David,
>>
>>> -----Original Message-----
>>> From: David Hildenbrand <david@redhat.com>
>>> Sent: Wednesday, January 26, 2022 6:18 PM
>>> To: Ard Biesheuvel <ardb@kernel.org>; Jianyong Wu
>>> <Jianyong.Wu@arm.com>
>>> Cc: Justin He <Justin.He@arm.com>; Catalin Marinas
>>> <Catalin.Marinas@arm.com>; will@kernel.org; Anshuman Khandual
>>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
>>> quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org; linux-arm-
>>> kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
>>> pud mapping
>>>
>>> On 26.01.22 11:12, Ard Biesheuvel wrote:
>>>> On Wed, 26 Jan 2022 at 11:09, Jianyong Wu <Jianyong.Wu@arm.com>
>>> wrote:
>>>>>
>>>>> Hi Ard,
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Ard Biesheuvel <ardb@kernel.org>
>>>>>> Sent: Wednesday, January 26, 2022 4:37 PM
>>>>>> To: Justin He <Justin.He@arm.com>
>>>>>> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
>>>>>> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
>>>>>> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
>>>>>> david@redhat.com; quic_qiancai@quicinc.com; linux-
>>>>>> kernel@vger.kernel.org; linux-arm-kernel@lists.infradead.org;
>>>>>> gshan@redhat.com; nd <nd@arm.com>
>>>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
>>>>>> create pud mapping
>>>>>>
>>>>>> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
>>>>>>>
>>>>>>> Hi Catalin
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Catalin Marinas <catalin.marinas@arm.com>
>>>>>>>> Sent: Friday, January 7, 2022 6:43 PM
>>>>>>>> To: Jianyong Wu <Jianyong.Wu@arm.com>
>>>>>>>> Cc: will@kernel.org; Anshuman Khandual
>>>>>> <Anshuman.Khandual@arm.com>;
>>>>>>>> akpm@linux-foundation.org; david@redhat.com;
>>>>>>>> quic_qiancai@quicinc.com; ardb@kernel.org;
>>>>>>>> linux-kernel@vger.kernel.org; linux-arm-
>>>>>>>> kernel@lists.infradead.org; gshan@redhat.com; Justin He
>>>>>>>> <Justin.He@arm.com>; nd <nd@arm.com>
>>>>>>>> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
>>>>>>>> create pud mapping
>>>>>>>>
>>>>>>>> On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
>>>>>>>>> Hi Catalin,
>>>>>>>>>
>>>>>>>>> I roughly find the root cause.
>>>>>>>>>  alloc_init_pud will be called at the very beginning of kernel
>>>>>>>>> boot in
>>>>>>>> create_mapping_noalloc where no memory allocator is initialized.
>>>>>>>> But lockdep check may need allocate memory. So, kernel take
>>>>>>>> exception when acquire lock.(I have not found the exact code that
>>>>>>>> cause this
>>>>>>>> issue) that's say we may not be able to use a lock so early.
>>>>>>>>>
>>>>>>>>> I come up with 2 methods to address it.
>>>>>>>>> 1) skip dead lock check at the very beginning of kernel boot in
>>>>>>>>> lockdep
>>>>>>>> code.
>>>>>>>>> 2) provided 2 two versions of __create_pgd_mapping, one with lock
>>>>>>>>> in it and the other without. There may be no possible of race for
>>>>>>>>> memory mapping at the very beginning time of kernel boot, thus we
>>>>>>>>> can use the no lock version of __create_pgd_mapping safely.
>>>>>>>>> In my test, this issue is gone if there is no lock held in
>>>>>>>>> create_mapping_noalloc. I think create_mapping_noalloc is called
>>>>>>>>> early enough to avoid the race conditions of memory mapping,
>>>>>>>>> however, I have not proved it.
>>>>>>>>
>>>>>>>> I think method 2 would work better but rather than implementing
>>>>>>>> new nolock functions I'd add a NO_LOCK flag and check it in
>>>>>>>> alloc_init_pud() before mutex_lock/unlock. Also add a comment
>>> when
>>>>>>>> passing the NO_LOCK flag on why it's needed and why there wouldn't
>>>>>>>> be any races at that stage (early boot etc.)
>>>>>>>>
>>>>>>> The problematic code path is:
>>>>>>> __primary_switched
>>>>>>>         early_fdt_map->fixmap_remap_fdt
>>>>>>>                 create_mapping_noalloc->alloc_init_pud
>>>>>>>                         mutex_lock (with Jianyong's patch)
>>>>>>>
>>>>>>> The problem seems to be that we will clear BSS segment twice if
>>>>>>> kaslr is enabled. Hence, some of the static variables in lockdep
>>>>>>> init process were messed up. That is to said, with kaslr enabled we
>>>>>>> might initialize lockdep twice if we add mutex_lock/unlock in
>>> alloc_init_pud().
>>>>>>>
>>>>>>
>>>>>> Thanks for tracking that down.
>>>>>>
>>>>>> Note that clearing the BSS twice is not the root problem here. The
>>>>>> root problem is that we set global state while the kernel runs at
>>>>>> the default link time address, and then refer to it again after the
>>>>>> entire kernel has been shifted in the kernel VA space. Such global
>>>>>> state could consist of mutable pointers to statically allocated data
>>>>>> (which would be reset to their default values after the relocation code
>>> runs again), or global pointer variables in BSS.
>>>>>> In either case, relying on such a global variable after the second
>>>>>> relocation performed by KASLR would be risky, and so we should avoid
>>>>>> manipulating global state at all if it might involve pointer to
>>>>>> statically allocated data structures.
>>>>>>
>>>>>>> In other ways, if we invoke mutex_lock/unlock in such a early booting
>>> stage.
>>>>>>> It might be unsafe because lockdep inserts lock_acquire/release as
>>>>>>> the complex hooks.
>>>>>>>
>>>>>>> In summary, would it better if Jianyong splits these early boot and
>>>>>>> late boot case? e.g. introduce a nolock version for
>>>>>> create_mapping_noalloc().
>>>>>>>
>>>>>>> What do you think of it?
>>>>>>>
>>>>>>
>>>>>> The pre-KASLR case definitely doesn't need a lock. But given that
>>>>>> create_mapping_noalloc() is only used to map the FDT, which happens
>>>>>> very early one way or the other, wouldn't it be better to move the
>>>>>> lock/unlock into other callers of __create_pgd_mapping()? (and make
>>>>>> sure no other users of the fixmap slots exist)
>>>>>
>>>>> There are server callers of __create_pgd_mapping. I think some of them
>>> need no fixmap lock as they are called so early. I figure out all of them here:
>>>>> create_mapping_noalloc:   no lock
>>>>> create_pgd_mapping:   no lock
>>>>> __map_memblock:    no lock
>>>>> map_kernel_segment:  no lock
>>>>> map_entry_trampoline: no lock
>>>>> update_mapping_prot:    need lock
>>>>> arch_add_memory:  need lock
>>>>>
>>>>> WDYT?
>>>>>
>>>>
>>>> That seems reasonable, but it needs to be documented clearly in the code.
>>>>
>>>
>>> Just a random thought, could we rely on system_state to do the locking
>>> conditionally?
>>
>> I can't see the point. At the early stages of kernel boot, we definitely need no lock. Also, I think we should keep it simple.
>>
> 
> Is e.g.,
> 
> if (system_state < SYSTEM_RUNNING)
> 	/* lock */
> 
> if (system_state < SYSTEM_RUNNING)
> 	/* unlock */

of course, inverting the conditions ;)
Jia He Jan. 27, 2022, 1:31 a.m. UTC | #18
Hi Ard

> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Wednesday, January 26, 2022 4:37 PM
> To: Justin He <Justin.He@arm.com>
> Cc: Catalin Marinas <Catalin.Marinas@arm.com>; Jianyong Wu
> <Jianyong.Wu@arm.com>; will@kernel.org; Anshuman Khandual
> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org; david@redhat.com;
> quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Wed, 26 Jan 2022 at 05:21, Justin He <Justin.He@arm.com> wrote:
> >
> > Hi Catalin
> >
> > > -----Original Message-----
> > > From: Catalin Marinas <catalin.marinas@arm.com>
> > > Sent: Friday, January 7, 2022 6:43 PM
> > > To: Jianyong Wu <Jianyong.Wu@arm.com>
> > > Cc: will@kernel.org; Anshuman Khandual <Anshuman.Khandual@arm.com>;
> > > akpm@linux-foundation.org; david@redhat.com; quic_qiancai@quicinc.com;
> > > ardb@kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> > > kernel@lists.infradead.org; gshan@redhat.com; Justin He
> > > <Justin.He@arm.com>; nd <nd@arm.com>
> > > Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when
> create
> > > pud mapping
> > >
> > > On Fri, Jan 07, 2022 at 09:10:57AM +0000, Jianyong Wu wrote:
> > > > Hi Catalin,
> > > >
> > > > I roughly find the root cause.
> > > >  alloc_init_pud will be called at the very beginning of kernel boot
> in
> > > create_mapping_noalloc where no memory allocator is initialized. But
> > > lockdep check may need allocate memory. So, kernel take exception when
> > > acquire lock.(I have not found the exact code that cause this issue)
> > > that's say we may not be able to use a lock so early.
> > > >
> > > > I come up with 2 methods to address it.
> > > > 1) skip dead lock check at the very beginning of kernel boot in
> lockdep
> > > code.
> > > > 2) provided 2 two versions of __create_pgd_mapping, one with lock in
> > > > it and the other without. There may be no possible of race for
> memory
> > > > mapping at the very beginning time of kernel boot, thus we can use
> the
> > > > no lock version of __create_pgd_mapping safely.
> > > > In my test, this issue is gone if there is no lock held in
> > > > create_mapping_noalloc. I think create_mapping_noalloc is called
> early
> > > > enough to avoid the race conditions of memory mapping, however, I
> have
> > > > not proved it.
> > >
> > > I think method 2 would work better but rather than implementing new
> > > nolock functions I'd add a NO_LOCK flag and check it in
> > > alloc_init_pud() before mutex_lock/unlock. Also add a comment when
> > > passing the NO_LOCK flag on why it's needed and why there wouldn't be
> > > any races at that stage (early boot etc.)
> > >
> > The problematic code path is:
> > __primary_switched
> >         early_fdt_map->fixmap_remap_fdt
> >                 create_mapping_noalloc->alloc_init_pud
> >                         mutex_lock (with Jianyong's patch)
> >
> > The problem seems to be that we will clear BSS segment twice if kaslr
> > is enabled. Hence, some of the static variables in lockdep init process
> were
> > messed up. That is to said, with kaslr enabled we might initialize
> lockdep
> > twice if we add mutex_lock/unlock in alloc_init_pud().
> >
> 
> Thanks for tracking that down.
> 
> Note that clearing the BSS twice is not the root problem here. The
> root problem is that we set global state while the kernel runs at the
> default link time address, and then refer to it again after the entire
> kernel has been shifted in the kernel VA space. Such global state
> could consist of mutable pointers to statically allocated data (which
> would be reset to their default values after the relocation code runs
> again), or global pointer variables in BSS. In either case, relying on
> such a global variable after the second relocation performed by KASLR
> would be risky, and so we should avoid manipulating global state at
> all if it might involve pointer to statically allocated data
> structures.
> 
Thanks for the explanation, which makes root cause clearer.
I have a question off this thread:
Should we avoid to invoke early_fdt_map and init_feature_override twice
with kaslr enabled?

In Commit f6f0c4362f07 ("arm64: Extract early FDT mapping from kaslr
early_init() "), it implicitly invokes early_fdt_map first time before
kaslr is enabled and 2nd time after it.

What to you think of below changes (tested in both guest and host boot):

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 6a98f1a38c29..3758ac057a6a 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -450,12 +450,12 @@ SYM_FUNC_START_LOCAL(__primary_switched)
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
        bl      kasan_early_init
 #endif
-       mov     x0, x21                         // pass FDT address in x0
-       bl      early_fdt_map                   // Try mapping the FDT early
-       bl      init_feature_override           // Parse cpu feature overrides
 #ifdef CONFIG_RANDOMIZE_BASE
        tst     x23, ~(MIN_KIMG_ALIGN - 1)      // already running randomized?
        b.ne    0f
+       mov     x0, x21                         // pass FDT address in x0
+       bl      early_fdt_map                   // Try mapping the FDT early
+       bl      init_feature_override           // Parse cpu feature overrides
        bl      kaslr_early_init                // parse FDT for KASLR options
        cbz     x0, 0f                          // KASLR disabled? just proceed
        orr     x23, x23, x0                    // record KASLR offset


--
Cheers,
Justin (Jia He)
Jianyong Wu Jan. 27, 2022, 6:24 a.m. UTC | #19
Hi David,

> >>>>>> Thanks for tracking that down.
> >>>>>>
> >>>>>> Note that clearing the BSS twice is not the root problem here.
> >>>>>> The root problem is that we set global state while the kernel
> >>>>>> runs at the default link time address, and then refer to it again
> >>>>>> after the entire kernel has been shifted in the kernel VA space.
> >>>>>> Such global state could consist of mutable pointers to statically
> >>>>>> allocated data (which would be reset to their default values
> >>>>>> after the relocation code
> >>> runs again), or global pointer variables in BSS.
> >>>>>> In either case, relying on such a global variable after the
> >>>>>> second relocation performed by KASLR would be risky, and so we
> >>>>>> should avoid manipulating global state at all if it might involve
> >>>>>> pointer to statically allocated data structures.
> >>>>>>
> >>>>>>> In other ways, if we invoke mutex_lock/unlock in such a early
> >>>>>>> booting
> >>> stage.
> >>>>>>> It might be unsafe because lockdep inserts lock_acquire/release
> >>>>>>> as the complex hooks.
> >>>>>>>
> >>>>>>> In summary, would it better if Jianyong splits these early boot
> >>>>>>> and late boot case? e.g. introduce a nolock version for
> >>>>>> create_mapping_noalloc().
> >>>>>>>
> >>>>>>> What do you think of it?
> >>>>>>>
> >>>>>>
> >>>>>> The pre-KASLR case definitely doesn't need a lock. But given that
> >>>>>> create_mapping_noalloc() is only used to map the FDT, which
> >>>>>> happens very early one way or the other, wouldn't it be better to
> >>>>>> move the lock/unlock into other callers of
> >>>>>> __create_pgd_mapping()? (and make sure no other users of the
> >>>>>> fixmap slots exist)
> >>>>>
> >>>>> There are server callers of __create_pgd_mapping. I think some of
> >>>>> them
> >>> need no fixmap lock as they are called so early. I figure out all of them
> here:
> >>>>> create_mapping_noalloc:   no lock
> >>>>> create_pgd_mapping:   no lock
> >>>>> __map_memblock:    no lock
> >>>>> map_kernel_segment:  no lock
> >>>>> map_entry_trampoline: no lock
> >>>>> update_mapping_prot:    need lock
> >>>>> arch_add_memory:  need lock
> >>>>>
> >>>>> WDYT?
> >>>>>
> >>>>
> >>>> That seems reasonable, but it needs to be documented clearly in the
> code.
> >>>>
> >>>
> >>> Just a random thought, could we rely on system_state to do the
> >>> locking conditionally?
> >>
> >> I can't see the point. At the early stages of kernel boot, we definitely
> need no lock. Also, I think we should keep it simple.
> >>
> >
> > Is e.g.,
> >
> > if (system_state < SYSTEM_RUNNING)
> > 	/* lock */
> >
> > if (system_state < SYSTEM_RUNNING)
> > 	/* unlock */
> 
> of course, inverting the conditions ;)

Yes, system_state can roughly separate these callers of __create_pgd_mapping. When system_state > SYSTEM_BOOTING we can add the lock.
Thus, I have the following change:

static DEFINE_SPINLOCK(swapper_pgdir_lock);
+static DEFINE_MUTEX(fixmap_lock);

 void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -329,6 +330,8 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
        }
        BUG_ON(p4d_bad(p4d));

+       if (system_state > SYSTEM_BOOTING)
+               mutex_lock(&fixmap_lock);
        pudp = pud_set_fixmap_offset(p4dp, addr);
        do {
                pud_t old_pud = READ_ONCE(*pudp);
@@ -359,6 +362,8 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
        } while (pudp++, addr = next, addr != end);

        pud_clear_fixmap();
+       if (system_state > SYSTEM_BOOTING)
+               mutex_unlock(&fixmap_lock);
 }

It seems work and somehow simper. But I don't know if it is reasonable to do this. So, any idea? @Ard Biesheuvel  @Catalin Marinas 

Thanks
Jianyong

> 
> 
> --
> Thanks,
> 
> David / dhildenb
David Hildenbrand Jan. 27, 2022, 12:22 p.m. UTC | #20
> 
> Yes, system_state can roughly separate these callers of __create_pgd_mapping. When system_state > SYSTEM_BOOTING we can add the lock.
> Thus, I have the following change:
> 
> static DEFINE_SPINLOCK(swapper_pgdir_lock);
> +static DEFINE_MUTEX(fixmap_lock);
> 
>  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
>  {
> @@ -329,6 +330,8 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>         }
>         BUG_ON(p4d_bad(p4d));
> 
> +       if (system_state > SYSTEM_BOOTING)

As there is nothing smaller than SYSTEM_BOOTING, you can use
	if (system_state != SYSTEM_BOOTING)

...

> 
> It seems work and somehow simper. But I don't know if it is reasonable to do this. So, any idea? @Ard Biesheuvel  @Catalin Marinas 

It's worth looking at kernel/notifier.c, e.g.,
blocking_notifier_chain_register()

if (unlikely(system_state == SYSTEM_BOOTING))
	return notifier_chain_register(&nh->head, n);

down_write(&nh->rwsem);
ret = notifier_chain_register(&nh->head, n);
up_write(&nh->rwsem);



If we decide to go down that path, we should make sure to add a comment like

/*
 * No need for locking during early boot. And it doesn't work as
 * expected with KASLR enabled where we might clear BSS twice.
 */
Catalin Marinas Jan. 27, 2022, 12:34 p.m. UTC | #21
On Thu, Jan 27, 2022 at 01:22:47PM +0100, David Hildenbrand wrote:
> > Yes, system_state can roughly separate these callers of __create_pgd_mapping. When system_state > SYSTEM_BOOTING we can add the lock.
> > Thus, I have the following change:
> > 
> > static DEFINE_SPINLOCK(swapper_pgdir_lock);
> > +static DEFINE_MUTEX(fixmap_lock);
> > 
> >  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
> >  {
> > @@ -329,6 +330,8 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> >         }
> >         BUG_ON(p4d_bad(p4d));
> > 
> > +       if (system_state > SYSTEM_BOOTING)
> 
> As there is nothing smaller than SYSTEM_BOOTING, you can use
> 	if (system_state != SYSTEM_BOOTING)
> 
> ...
> 
> > 
> > It seems work and somehow simper. But I don't know if it is reasonable to do this. So, any idea? @Ard Biesheuvel  @Catalin Marinas 
> 
> It's worth looking at kernel/notifier.c, e.g.,
> blocking_notifier_chain_register()
> 
> if (unlikely(system_state == SYSTEM_BOOTING))
> 	return notifier_chain_register(&nh->head, n);
> 
> down_write(&nh->rwsem);
> ret = notifier_chain_register(&nh->head, n);
> up_write(&nh->rwsem);
> 
> If we decide to go down that path, we should make sure to add a comment like
> 
> /*
>  * No need for locking during early boot. And it doesn't work as
>  * expected with KASLR enabled where we might clear BSS twice.
>  */

A similar approach sounds fine to me.
Jianyong Wu Jan. 31, 2022, 8:10 a.m. UTC | #22
Hi David,

> -----Original Message-----
> From: David Hildenbrand <david@redhat.com>
> Sent: Thursday, January 27, 2022 8:23 PM
> To: Jianyong Wu <Jianyong.Wu@arm.com>; Ard Biesheuvel
> <ardb@kernel.org>; Catalin Marinas <Catalin.Marinas@arm.com>
> Cc: Justin He <Justin.He@arm.com>; will@kernel.org; Anshuman Khandual
> <Anshuman.Khandual@arm.com>; akpm@linux-foundation.org;
> quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org; linux-arm-
> kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> >
> > Yes, system_state can roughly separate these callers of
> __create_pgd_mapping. When system_state > SYSTEM_BOOTING we can
> add the lock.
> > Thus, I have the following change:
> >
> > static DEFINE_SPINLOCK(swapper_pgdir_lock);
> > +static DEFINE_MUTEX(fixmap_lock);
> >
> >  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)  { @@ -329,6 +330,8 @@
> > static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long
> end,
> >         }
> >         BUG_ON(p4d_bad(p4d));
> >
> > +       if (system_state > SYSTEM_BOOTING)
> 
> As there is nothing smaller than SYSTEM_BOOTING, you can use
> 	if (system_state != SYSTEM_BOOTING)
> 
> ...
OK,

> 
> >
> > It seems work and somehow simper. But I don't know if it is reasonable
> > to do this. So, any idea? @Ard Biesheuvel  @Catalin Marinas
> 
> It's worth looking at kernel/notifier.c, e.g.,
> blocking_notifier_chain_register()
> 
> if (unlikely(system_state == SYSTEM_BOOTING))
> 	return notifier_chain_register(&nh->head, n);
> 
> down_write(&nh->rwsem);
> ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem);
> 
> 
> 
> If we decide to go down that path, we should make sure to add a comment
> like
> 
> /*
>  * No need for locking during early boot. And it doesn't work as
>  * expected with KASLR enabled where we might clear BSS twice.
>  */
> 

Thanks David, it's very helpful.

Thanks
Jianyong

> --
> Thanks,
> 
> David / dhildenb
Jianyong Wu Jan. 31, 2022, 8:13 a.m. UTC | #23
Hi Catalin,

> -----Original Message-----
> From: Catalin Marinas <catalin.marinas@arm.com>
> Sent: Thursday, January 27, 2022 8:34 PM
> To: David Hildenbrand <david@redhat.com>
> Cc: Jianyong Wu <Jianyong.Wu@arm.com>; Ard Biesheuvel
> <ardb@kernel.org>; Justin He <Justin.He@arm.com>; will@kernel.org;
> Anshuman Khandual <Anshuman.Khandual@arm.com>; akpm@linux-
> foundation.org; quic_qiancai@quicinc.com; linux-kernel@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org; gshan@redhat.com; nd <nd@arm.com>
> Subject: Re: [PATCH v3] arm64/mm: avoid fixmap race condition when create
> pud mapping
> 
> On Thu, Jan 27, 2022 at 01:22:47PM +0100, David Hildenbrand wrote:
> > > Yes, system_state can roughly separate these callers of
> __create_pgd_mapping. When system_state > SYSTEM_BOOTING we can
> add the lock.
> > > Thus, I have the following change:
> > >
> > > static DEFINE_SPINLOCK(swapper_pgdir_lock);
> > > +static DEFINE_MUTEX(fixmap_lock);
> > >
> > >  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)  { @@ -329,6 +330,8
> @@
> > > static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long
> end,
> > >         }
> > >         BUG_ON(p4d_bad(p4d));
> > >
> > > +       if (system_state > SYSTEM_BOOTING)
> >
> > As there is nothing smaller than SYSTEM_BOOTING, you can use
> > 	if (system_state != SYSTEM_BOOTING)
> >
> > ...
> >
> > >
> > > It seems work and somehow simper. But I don't know if it is
> > > reasonable to do this. So, any idea? @Ard Biesheuvel  @Catalin
> > > Marinas
> >
> > It's worth looking at kernel/notifier.c, e.g.,
> > blocking_notifier_chain_register()
> >
> > if (unlikely(system_state == SYSTEM_BOOTING))
> > 	return notifier_chain_register(&nh->head, n);
> >
> > down_write(&nh->rwsem);
> > ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem);
> >
> > If we decide to go down that path, we should make sure to add a
> > comment like
> >
> > /*
> >  * No need for locking during early boot. And it doesn't work as
> >  * expected with KASLR enabled where we might clear BSS twice.
> >  */
> 
> A similar approach sounds fine to me.
> 

OK, I'll send the next version based on David's comments.

Thanks
Jianyong

> --
> Catalin
diff mbox series

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8..e680a6a8ca40 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -63,6 +63,7 @@  static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 
 static DEFINE_SPINLOCK(swapper_pgdir_lock);
+static DEFINE_MUTEX(fixmap_lock);
 
 void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -329,6 +330,11 @@  static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 	}
 	BUG_ON(p4d_bad(p4d));
 
+	/*
+	 * We only have one fixmap entry per page-table level, so take
+	 * the fixmap lock until we're done.
+	 */
+	mutex_lock(&fixmap_lock);
 	pudp = pud_set_fixmap_offset(p4dp, addr);
 	do {
 		pud_t old_pud = READ_ONCE(*pudp);
@@ -359,6 +365,7 @@  static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 	} while (pudp++, addr = next, addr != end);
 
 	pud_clear_fixmap();
+	mutex_unlock(&fixmap_lock);
 }
 
 static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,