Message ID | 20210127211851.GA32689@ls3530.fritz.box (mailing list archive)
State      | Superseded
Series     | parisc: Optimize per-pagetable spinlocks (v11)
On Wednesday, 27 January 2021 at 22:18:51 CET, Helge Deller wrote:
> On parisc a spinlock is stored in the next page behind the pgd which
> protects against parallel accesses to the pgd. That's why one additional
> page (PGD_ALLOC_ORDER) is allocated for the pgd.
>
> Matthew Wilcox suggested that we instead should use a pointer in the
> struct page table for this spinlock and noted that the comments for the
> PGD_ORDER and PMD_ORDER defines were wrong.
>
> Both suggestions are addressed in this patch. The pgd spinlock
> (parisc_pgd_lock) is stored in the struct page table. In
> switch_mm_irqs_off() the physical address of this lock is loaded into
> cr28 (tr4) and the pgd into cr25, so that the fault handlers can
> directly access the lock.
>
> The currently implemented Hybrid L2/L3 page table scheme (where the pmd
> is adjacent to the pgd) is dropped now too.
>
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Fixes: b37d1c1898b2 ("parisc: Use per-pagetable spinlock")
> Signed-off-by: Helge Deller <deller@gmx.de>
> Signed-off-by: John David Anglin <dave.anglin@bell.net>
>
> diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h
> index 46f8c22c5977..3e02b1f75e54 100644
> --- a/arch/parisc/include/asm/mmu_context.h
> +++ b/arch/parisc/include/asm/mmu_context.h
> @@ -5,6 +5,7 @@
>  #include <linux/mm.h>
>  #include <linux/sched.h>
>  #include <linux/atomic.h>
> +#include <linux/spinlock.h>
>  #include <asm-generic/mm_hooks.h>
>
>  /* on PA-RISC, we actually have enough contexts to justify an allocator
> @@ -50,6 +51,14 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev,
> 		struct mm_struct *next, struct task_struct *tsk)
>  {
>  	if (prev != next) {
> +#ifdef CONFIG_SMP
> +		/* phys address of tlb lock in cr28 (tr4) for TLB faults */
> +		struct page *page;
> +
> +		page = virt_to_page((unsigned long) next->pgd);

This is one of the few cases where you have a space after the cast. Another
one is in pgd_alloc():

> +	page = virt_to_page((unsigned long) pgd);

> diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
> index a6482b2ce0ea..9c1a54543c87 100644
> --- a/arch/parisc/include/asm/pgalloc.h
> +++ b/arch/parisc/include/asm/pgalloc.h
> @@ -68,43 +66,27 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
>  		(__u32)(__pa((unsigned long)pmd) >> PxD_VALUE_SHIFT)));
>  }
>
> -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
> +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)

Does that change add benefit?

>  {
> -	return (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_ORDER);
> +	pmd_t *pmd;
> +
> +	pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_ORDER);
> +	if (pmd)

Maybe annotate that as likely(), as it was in pgd_alloc() before?

> +		memset ((void *)pmd, 0, PAGE_SIZE << PMD_ORDER);
> +	return pmd;
>  }

> diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
> index 75cf84070fc9..c08c7b37e5f4 100644
> --- a/arch/parisc/include/asm/pgtable.h
> +++ b/arch/parisc/include/asm/pgtable.h
> @@ -94,10 +96,12 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
>  #define set_pte_at(mm, addr, ptep, pteval)	\
>  	do {					\
>  		unsigned long flags;		\
> -		spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\
> +		spinlock_t *pgd_lock;		\
> +		pgd_lock = pgd_spinlock((mm)->pgd); \

These should just fit into a single line.

> +		spin_lock_irqsave(pgd_lock, flags); \
>  		set_pte(ptep, pteval);		\
>  		purge_tlb_entries(mm, addr);	\
> -		spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\
> +		spin_unlock_irqrestore(pgd_lock, flags); \
>  	} while (0)
>
>  #endif /* !__ASSEMBLY__ */

> diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
> index 3ec633b11b54..4f3f180b0b20 100644
> --- a/arch/parisc/mm/init.c
> +++ b/arch/parisc/mm/init.c
> @@ -681,6 +681,24 @@ static void __init parisc_bootmem_free(void)
>  	free_area_init(max_zone_pfn);
>  }
>
> +static void __init parisc_init_pgd_lock(void)
> +{
> +	struct page *page;
> +
> +	page = virt_to_page((unsigned long) &swapper_pg_dir);

Another space.

> +	page->parisc_pgd_lock = &pa_swapper_pg_lock;
> +}
> +
> +#ifdef CONFIG_SMP
> +spinlock_t *pgd_spinlock(pgd_t *pgd)
> +{
> +	struct page *page;
> +
> +	page = virt_to_page((unsigned long) pgd);
> +	return page->parisc_pgd_lock;
> +}
> +#endif

This is very simple, and I suspect it is called rather often. Wouldn't it
make sense to make it inline?

Eike
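A minimal sketch of what Eike suggests here: the allocation-success test in
pmd_alloc_one() annotated with likely(), mirroring the likely(pgd != NULL)
check that the old pgd_alloc() had. This only illustrates the review
comment; it is not the code that was eventually merged:

	static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		pmd_t *pmd;

		pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_ORDER);
		if (likely(pmd))	/* allocation almost always succeeds */
			memset(pmd, 0, PAGE_SIZE << PMD_ORDER);
		return pmd;
	}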
On 1/28/21 9:36 AM, Rolf Eike Beer wrote:
> On Wednesday, 27 January 2021 at 22:18:51 CET, Helge Deller wrote:
>> [...patch description quoted above...]
>> [...lots of suggestions by Rolf...]

Rolf, thanks a lot for your review.
I've addressed most of your suggestions and published a v12 patch here:
https://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git/log/?h=pgtable_spinlock-v12

>> diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
>> index 3ec633b11b54..4f3f180b0b20 100644
>> --- a/arch/parisc/mm/init.c
>> +++ b/arch/parisc/mm/init.c
>> +#ifdef CONFIG_SMP
>> +spinlock_t *pgd_spinlock(pgd_t *pgd)
>> +{
>> +	struct page *page;
>> +
>> +	page = virt_to_page((unsigned long) pgd);
>> +	return page->parisc_pgd_lock;
>> +}
>> +#endif
>
> This is very simple, and I suspect it is called rather often. Wouldn't it
> make sense to make it inline?

No, it's not simple, that's why I haven't inlined it.
The virt_to_page() expands to many asm instructions based on the
selected memory model.

Thanks!
Helge
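For readers wondering why virt_to_page() is not a simple pointer cast: a
rough sketch of what the generic memory-model headers expand it to
(simplified from include/asm-generic/memory_model.h; the exact expansion
depends on the configured memory model, and parisc with its discontiguous
memory ranges can be heavier still, so treat this as an approximation):

	/* virt_to_page() is built on pfn_to_page(): */
	#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)

	/* FLATMEM: a single array lookup, cheap enough to inline */
	#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

	/* SPARSEMEM (without vmemmap): a section lookup plus arithmetic */
	#define __pfn_to_page(pfn)					\
	({	unsigned long __pfn = (pfn);				\
		struct mem_section *__sec = __pfn_to_section(__pfn);	\
		__section_mem_map_addr(__sec) + __pfn;			\
	})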
On Wed, Jan 27, 2021 at 10:18:51PM +0100, Helge Deller wrote:
> [...patch description quoted above...]

This patch results in:

BUG: spinlock recursion on CPU#0, swapper/0/1
 lock: 0x12226d14, .magic: dead4ead, .owner: swapper/0/1, .owner_cpu: 0
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.11.0-rc7-next-20210209-32bit #1
Hardware name: 9000/778/B160L
Backtrace:
 [<1019f9bc>] show_stack+0x34/0x48
 [<10a65278>] dump_stack+0x94/0x114
 [<10219f4c>] spin_dump+0x8c/0xb8
 [<1021a0b4>] do_raw_spin_lock+0xdc/0x108
 [<10a7367c>] _raw_spin_lock_irqsave+0x30/0x48
 [<102bf41c>] handle_mm_fault+0x5e8/0xdb0
 [<102b813c>] __get_user_pages.part.0+0x1b0/0x3d4
 [<102b8900>] __get_user_pages_remote+0x134/0x34c
 [<102b8b80>] get_user_pages_remote+0x68/0x90
 [<102fccb0>] get_arg_page+0x94/0xd8
 [<102fdd84>] copy_string_kernel+0xc4/0x234
 [<102fe70c>] kernel_execve+0xcc/0x1a4
 [<10a58d94>] run_init_process+0xbc/0xe0
 [<10a70d50>] kernel_init+0x98/0x13c
 [<1019a01c>] ret_from_kernel_thread+0x1c/0x24

when trying to boot parisc/hppa images in qemu. Reverting this patch fixes
the problem. Bisect log attached.
Guenter

---
# bad: [a4bfd8d46ac357c12529e4eebb6c89502b03ecc9] Add linux-next specific files for 20210209
# good: [92bf22614b21a2706f4993b278017e437f7785b3] Linux 5.11-rc7
git bisect start 'HEAD' 'v5.11-rc7'
# bad: [a8eb921ba7e8e77d994a1c6c69c8ef08456ecf53] Merge remote-tracking branch 'crypto/master'
git bisect bad a8eb921ba7e8e77d994a1c6c69c8ef08456ecf53
# bad: [b68df186dae8ae890df08059bb068b78252b053a] Merge remote-tracking branch 'hid/for-next'
git bisect bad b68df186dae8ae890df08059bb068b78252b053a
# good: [323c9f6fb99b033883b404ecbc811e7b283a60b3] Merge remote-tracking branch 'sunxi/sunxi/for-next'
git bisect good 323c9f6fb99b033883b404ecbc811e7b283a60b3
# bad: [9d40a7a579a5c51fad0d734a4ed39e39b858fca2] Merge remote-tracking branch 'btrfs/for-next'
git bisect bad 9d40a7a579a5c51fad0d734a4ed39e39b858fca2
# bad: [afe0c3efe88f6c295542fd336d5f604115e9184f] Merge remote-tracking branch 'powerpc/next'
git bisect bad afe0c3efe88f6c295542fd336d5f604115e9184f
# good: [c276186556ed2ce6d30da69ce275234a7df85b09] Merge remote-tracking branch 'mips/mips-next'
git bisect good c276186556ed2ce6d30da69ce275234a7df85b09
# good: [755d664174463791489dddf34c33308b61de68c3] powerpc: DebugException remove args
git bisect good 755d664174463791489dddf34c33308b61de68c3
# good: [26418b36a11f2eaf2556aa8cefe86132907e311f] powerpc/64s/radix: refactor TLB flush type selection
git bisect good 26418b36a11f2eaf2556aa8cefe86132907e311f
# bad: [d7bbb31642d2bd4aa5aad3595061a5e96c32d91d] Merge remote-tracking branch 'parisc-hd/for-next'
git bisect bad d7bbb31642d2bd4aa5aad3595061a5e96c32d91d
# good: [2261352157a932717ec08b9dd18d1bfbb7c37c52] Merge remote-tracking branch 'openrisc/or1k-5.11-fixes' into or1k-5.12-updates
git bisect good 2261352157a932717ec08b9dd18d1bfbb7c37c52
# good: [3c92b9eed3ae088ade3688fb356a90926c1c8ee4] Merge remote-tracking branch 'openrisc/for-next'
git bisect good 3c92b9eed3ae088ade3688fb356a90926c1c8ee4
# good: [accb4993d2ee6b644a3d01851cf2fb2c1e0813a6] parisc: Fix IVT checksum calculation wrt HPMC
git bisect good accb4993d2ee6b644a3d01851cf2fb2c1e0813a6
# bad: [4add5f175b1e4e71c06493f9a2c52490d2ea4365] parisc: Optimize per-pagetable spinlocks
git bisect bad 4add5f175b1e4e71c06493f9a2c52490d2ea4365
# good: [0d2d3836dd0a597e514e6231fbf2ae3944f5d38c] parisc: Bump 64-bit IRQ stack size to 64 KB
git bisect good 0d2d3836dd0a597e514e6231fbf2ae3944f5d38c
# first bad commit: [4add5f175b1e4e71c06493f9a2c52490d2ea4365] parisc: Optimize per-pagetable spinlocks
On 2/10/21 3:52 PM, Guenter Roeck wrote:
> On Wed, Jan 27, 2021 at 10:18:51PM +0100, Helge Deller wrote:
>> [...patch description quoted above...]
>
> This patch results in:
>
> BUG: spinlock recursion on CPU#0, swapper/0/1
> [...backtrace quoted above...]
>
> when trying to boot parisc/hppa images in qemu. Reverting this patch fixes
> the problem.

True, I can reproduce the problem.
With CONFIG_DEBUG_SPINLOCK=y you get the backtrace above.
With CONFIG_DEBUG_SPINLOCK=n it just hangs.
Happens with a 32-bit SMP kernel, even if only one virtual CPU is started.

By the way, the latest version of this patch is in my for-next tree, here:
https://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux.git/commit/?h=for-next&id=4add5f175b1e4e71c06493f9a2c52490d2ea4365
Problem happens with it too.

Helge
On 2021-02-10 12:23 p.m., Helge Deller wrote:
> On 2/10/21 3:52 PM, Guenter Roeck wrote:
>> [...bug report quoted above...]
>
> True, I can reproduce the problem.
> With CONFIG_DEBUG_SPINLOCK=y you get the backtrace above.
> With CONFIG_DEBUG_SPINLOCK=n it just hangs.
> Happens with a 32-bit SMP kernel, even if only one virtual CPU is started.

Which is quite puzzling, since most spin locks are optimized in this case.
Strikes me that we have a lock that's not initialized.

It's not obvious how this relates to the patch. get_arg_page() calls
get_user_pages_remote() with locked NULL:

	/*
	 * We are doing an exec().  'current' is the process
	 * doing the exec and bprm->mm is the new process's mm.
	 */
	ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
			&page, NULL, NULL);

Dave
On Wed, Feb 10, 2021 at 01:57:42PM -0500, John David Anglin wrote:
> On 2021-02-10 12:23 p.m., Helge Deller wrote:
>> [...quoted above...]
>
> Which is quite puzzling, since most spin locks are optimized in this case.
> Strikes me that we have a lock that's not initialized.
>
> It's not obvious how this relates to the patch. get_arg_page() calls
> get_user_pages_remote() with locked NULL.

The fact that reverting it fixes the problem is a good indication
that the problem does relate to this patch.

As for how - no idea. That is not my area of expertise.

Thanks,
Guenter
On 2021-02-10 8:20 p.m., Guenter Roeck wrote:
> On Wed, Feb 10, 2021 at 01:57:42PM -0500, John David Anglin wrote:
>> [...quoted above...]
>
> The fact that reverting it fixes the problem is a good indication
> that the problem does relate to this patch.
>
> As for how - no idea. That is not my area of expertise.

I built Helge's for-next tree both with CONFIG_DEBUG_SPINLOCK=y and
CONFIG_DEBUG_SPINLOCK=n. Both builds work fine on c8000.

The only thing that might have changed in the patch is the alignment of
the lock used for page table updates. Qemu only supports PA 1.x
instructions. The ldcw instruction needs 16-byte alignment on real
hardware, and there is code to dynamically align lock pointers to 16-byte
alignment. The c8000 supports PA 2.0 instructions, and the ldcw,co
instruction only needs 4-byte alignment. Perhaps there is an issue with
the dynamic alignment of the lock pointer or the lock initialization in
the PA 1.x build for qemu.
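For context, the dynamic alignment Dave refers to looks roughly like this
(a simplified sketch following arch/parisc/include/asm/ldcw.h and
spinlock_types.h; the details are an approximation rather than a verbatim
copy of the kernel of that era):

	typedef struct {
	#ifdef CONFIG_PA20
		volatile unsigned int slock;
	#else
		/* ldcw on PA 1.x traps unless its operand is 16-byte aligned,
		 * so reserve four words; an aligned one always exists inside. */
		volatile unsigned int lock[4] __attribute__((aligned(16)));
	#endif
	} arch_spinlock_t;

	#ifndef CONFIG_PA20
	#define __PA_LDCW_ALIGNMENT	16
	#define __ldcw_align(a) ({					\
		unsigned long __ret = (unsigned long) &(a)->lock[0];	\
		__ret = (__ret + __PA_LDCW_ALIGNMENT - 1)		\
			& ~(__PA_LDCW_ALIGNMENT - 1);			\
		(volatile unsigned int *) __ret;			\
	})
	#else
	/* PA 2.0: ldcw,co only needs natural 4-byte alignment. */
	#define __PA_LDCW_ALIGNMENT	4
	#define __ldcw_align(a)		(&(a)->slock)
	#endif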
On Thu, Feb 11, 2021 at 09:38:25AM -0500, John David Anglin wrote:
> On 2021-02-10 8:20 p.m., Guenter Roeck wrote:
>> [...quoted above...]
>
> I built Helge's for-next tree both with CONFIG_DEBUG_SPINLOCK=y and
> CONFIG_DEBUG_SPINLOCK=n. Both builds work fine on c8000.
>
> The only thing that might have changed in the patch is the alignment of
> the lock used for page table updates. Qemu only supports PA 1.x
> instructions. The ldcw instruction needs 16-byte alignment on real
> hardware, and there is code to dynamically align lock pointers to 16-byte
> alignment. The c8000 supports PA 2.0 instructions, and the ldcw,co
> instruction only needs 4-byte alignment. Perhaps there is an issue with
> the dynamic alignment of the lock pointer or the lock initialization in
> the PA 1.x build for qemu.

The first lock is acquired in mm/memory.c from line 3565:

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);

The second (recursive) lock is acquired from line 3587 in the same
function:

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

Source code lines are from next-20210211. I confirmed with debug code
that the lock address passed to do_raw_spin_lock() is the same in both
calls.

Guenter
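Putting Guenter's two call sites together, the recursion looks roughly like
this (a simplified sketch assuming kernel-internal headers, not the literal
mm/memory.c code; the key assumption, confirmed by Guenter's debug output,
is that vmf->ptl and the lock taken inside set_pte_at() resolve to the same
spinlock):

	static vm_fault_t sketch_fault_path(struct vm_fault *vmf, pte_t entry)
	{
		struct vm_area_struct *vma = vmf->vma;

		/* lock #1: the generic code takes the page-table lock */
		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					       vmf->address, &vmf->ptl);

		/* lock #2: with the patch applied, set_pte_at() internally does
		 * spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags); when that
		 * is the same lock as vmf->ptl, this self-deadlocks (reported
		 * as "spinlock recursion" with CONFIG_DEBUG_SPINLOCK=y). */
		set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	}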
On 2021-02-11 4:51 p.m., Guenter Roeck wrote:
> On Thu, Feb 11, 2021 at 09:38:25AM -0500, John David Anglin wrote:
>> [...quoted above...]
>
> The first lock is acquired in mm/memory.c from line 3565:
>
>	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
>				       &vmf->ptl);
>
> The second (recursive) lock is acquired from line 3587 in the same
> function:
>
>	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
>
> Source code lines are from next-20210211. I confirmed with debug code
> that the lock address passed to do_raw_spin_lock() is the same in both
> calls.

Thanks Guenter. I assume this is with v15 of the patch?

It looks as if we might not need the ptl lock in set_pte_at() and probably
elsewhere.

But I'm still puzzled as to why this doesn't happen when different locks
are used, as in your report with the earlier patch.
On 2/11/21 2:16 PM, John David Anglin wrote:
> On 2021-02-11 4:51 p.m., Guenter Roeck wrote:
>> [...quoted above...]
>
> Thanks Guenter. I assume this is with v15 of the patch?

I have no idea what version it is, sorry. It is with the version that is
in next-20210211. The problem was first seen with next-20210201.

> It looks as if we might not need the ptl lock in set_pte_at() and probably
> elsewhere.
>
> But I'm still puzzled as to why this doesn't happen when different locks
> are used, as in your report with the earlier patch.

Maybe I reported it against the wrong version? If so, sorry, my bad.

Guenter
* John David Anglin <dave.anglin@bell.net>:
> On 2021-02-11 4:51 p.m., Guenter Roeck wrote:
>> [...quoted above...]
>>
>> Source code lines are from next-20210211. I confirmed with debug code
>> that the lock address passed to do_raw_spin_lock() is the same in both
>> calls.
>
> Thanks Guenter. I assume this is with v15 of the patch?

Yes, happens with the latest version too.

> It looks as if we might not need the ptl lock in set_pte_at() and probably
> elsewhere.

Just removing the locks from set_pte_at() didn't solve it.
After removing the others too, it works.
Patch is attached below.
I added a memory barrier to set_pte() too.

> But I'm still puzzled as to why this doesn't happen when different locks
> are used, as in your report with the earlier patch.

I think it happened earlier too. Would need to test though.

Helge

diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 2e1873cd4877..2c68010d896a 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -82,17 +82,14 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
  */
 #define set_pte(pteptr, pteval)			\
 	do{					\
-		*(pteptr) = (pteval);		\
+		*(pteptr) = (pteval);		\
+		mb();				\
 	} while(0)

 #define set_pte_at(mm, addr, ptep, pteval)	\
 	do {					\
-		unsigned long flags;		\
-		spinlock_t *pgd_lock = &(mm)->page_table_lock; \
-		spin_lock_irqsave(pgd_lock, flags); \
 		set_pte(ptep, pteval);		\
 		purge_tlb_entries(mm, addr);	\
-		spin_unlock_irqrestore(pgd_lock, flags); \
 	} while (0)

 #endif /* !__ASSEMBLY__ */
@@ -431,22 +428,15 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte;
-	unsigned long flags;
-	spinlock_t *pgd_lock;

 	if (!pte_young(*ptep))
 		return 0;

-	pgd_lock = &vma->vm_mm->page_table_lock;
-	spin_lock_irqsave(pgd_lock, flags);
 	pte = *ptep;
 	if (!pte_young(pte)) {
-		spin_unlock_irqrestore(pgd_lock, flags);
 		return 0;
 	}
-	set_pte(ptep, pte_mkold(pte));
-	purge_tlb_entries(vma->vm_mm, addr);
-	spin_unlock_irqrestore(pgd_lock, flags);
+	set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte));
 	return 1;
 }

@@ -454,29 +444,16 @@ struct mm_struct;
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t old_pte;
-	unsigned long flags;
-	spinlock_t *pgd_lock;

-	pgd_lock = &mm->page_table_lock;
-	spin_lock_irqsave(pgd_lock, flags);
 	old_pte = *ptep;
-	set_pte(ptep, __pte(0));
-	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(pgd_lock, flags);
+	set_pte_at(mm, addr, ptep, __pte(0));

 	return old_pte;
 }

 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	unsigned long flags;
-	spinlock_t *pgd_lock;
-
-	pgd_lock = &mm->page_table_lock;
-	spin_lock_irqsave(pgd_lock, flags);
-	set_pte(ptep, pte_wrprotect(*ptep));
-	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(pgd_lock, flags);
+	set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep));
 }

 #define pte_same(A,B)	(pte_val(A) == pte_val(B))
On 2/12/21 12:14 AM, Helge Deller wrote:
> * John David Anglin <dave.anglin@bell.net>:
>> [...quoted above...]
>
> Just removing the locks from set_pte_at() didn't solve it.
> After removing the others too, it works.
> Patch is attached below.
> I added a memory barrier to set_pte() too.
>
> [...]
>
> #define set_pte(pteptr, pteval)			\
> 	do{					\
> -		*(pteptr) = (pteval);		\
> +		*(pteptr) = (pteval);		\
> +		mb();				\

Actually, this should be barrier() instead of mb().
mb() is overkill on SMP.

Helge

> 	} while(0)
>
> [...rest of the patch quoted above...]
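For reference, the difference between the two barriers (barrier() is the
generic compiler barrier; the mb() shown is what parisc defined around that
time in arch/parisc/include/asm/barrier.h, stated here as an assumption):

	/* barrier(): compiler-only; prevents the compiler from reordering or
	 * caching memory accesses across it, emits no instruction. */
	#define barrier()	__asm__ __volatile__("" : : : "memory")

	/* parisc mb() on SMP: a full hardware barrier, i.e. a "sync"
	 * instruction, which is comparatively expensive. */
	#define synchronize_caches()	__asm__ __volatile__("sync" : : : "memory")
	#define mb()			do { synchronize_caches(); } while (0)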
On 2021-02-11 6:14 p.m., Helge Deller wrote:
> Yes, happens with the latest version too.
>
>> It looks as if we might not need the ptl lock in set_pte_at() and
>> probably elsewhere.
>
> Just removing the locks from set_pte_at() didn't solve it.
> After removing the others too, it works.
> Patch is attached below.
> I added a memory barrier to set_pte() too.

You are ahead of me. Should the memory barrier be a smp_mb()?

Dave
On 2021-02-11 6:14 p.m., Helge Deller wrote:
> Patch is attached below.
> I added a memory barrier to set_pte() too.

I think the huge page support needs looking at.
diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 46f8c22c5977..3e02b1f75e54 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -5,6 +5,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/atomic.h> +#include <linux/spinlock.h> #include <asm-generic/mm_hooks.h> /* on PA-RISC, we actually have enough contexts to justify an allocator @@ -50,6 +51,14 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { if (prev != next) { +#ifdef CONFIG_SMP + /* phys address of tlb lock in cr28 (tr4) for TLB faults */ + struct page *page; + + page = virt_to_page((unsigned long) next->pgd); + /* BUG_ON(!page->parisc_pgd_lock); */ + mtctl(__pa(__ldcw_align(&page->parisc_pgd_lock->rlock.raw_lock)), 28); +#endif mtctl(__pa(next->pgd), 25); load_context(next->context); } diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 6b3f6740a6a6..d00313d1274e 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -112,7 +112,7 @@ extern int npmem_ranges; #else #define BITS_PER_PTE_ENTRY 2 #define BITS_PER_PMD_ENTRY 2 -#define BITS_PER_PGD_ENTRY BITS_PER_PMD_ENTRY +#define BITS_PER_PGD_ENTRY 2 #endif #define PGD_ENTRY_SIZE (1UL << BITS_PER_PGD_ENTRY) #define PMD_ENTRY_SIZE (1UL << BITS_PER_PMD_ENTRY) diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index a6482b2ce0ea..9c1a54543c87 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -15,47 +15,45 @@ #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> -/* Allocate the top level pgd (page directory) - * - * Here (for 64 bit kernels) we implement a Hybrid L2/L3 scheme: we - * allocate the first pmd adjacent to the pgd. This means that we can - * subtract a constant offset to get to it. The pmd and pgd sizes are - * arranged so that a single pmd covers 4GB (giving a full 64-bit - * process access to 8TB) so our lookups are effectively L2 for the - * first 4GB of the kernel (i.e. for all ILP32 processes and all the - * kernel for machines with under 4GB of memory) */ +/* Allocate the top level pgd (page directory) */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, - PGD_ALLOC_ORDER); - pgd_t *actual_pgd = pgd; - - if (likely(pgd != NULL)) { - memset(pgd, 0, PAGE_SIZE<<PGD_ALLOC_ORDER); -#if CONFIG_PGTABLE_LEVELS == 3 - actual_pgd += PTRS_PER_PGD; - /* Populate first pmd with allocated memory. We mark it - * with PxD_FLAG_ATTACHED as a signal to the system that this - * pmd entry may not be cleared. 
*/ - set_pgd(actual_pgd, __pgd((PxD_FLAG_PRESENT | - PxD_FLAG_VALID | - PxD_FLAG_ATTACHED) - + (__u32)(__pa((unsigned long)pgd) >> PxD_VALUE_SHIFT))); - /* The first pmd entry also is marked with PxD_FLAG_ATTACHED as - * a signal that this pmd may not be freed */ - set_pgd(pgd, __pgd(PxD_FLAG_ATTACHED)); + pgd_t *pgd; +#ifdef CONFIG_SMP + spinlock_t *pgd_lock; + struct page *page; #endif + + pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + if (unlikely(pgd == NULL)) + return NULL; + + memset(pgd, 0, PAGE_SIZE << PGD_ORDER); + +#ifdef CONFIG_SMP + /* allocate & initialize pgd_spinlock() for this PGD */ + pgd_lock = kmalloc(sizeof(*pgd_lock), GFP_KERNEL); + if (unlikely(pgd_lock == NULL)) { + free_pages((unsigned long)pgd, PGD_ORDER); + return NULL; } - spin_lock_init(pgd_spinlock(actual_pgd)); - return actual_pgd; + + page = virt_to_page((unsigned long) pgd); + spin_lock_init(pgd_lock); + page->parisc_pgd_lock = pgd_lock; +#endif + + return pgd; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { -#if CONFIG_PGTABLE_LEVELS == 3 - pgd -= PTRS_PER_PGD; +#ifdef CONFIG_SMP + spinlock_t *lock = pgd_spinlock(pgd); + kfree(lock); #endif - free_pages((unsigned long)pgd, PGD_ALLOC_ORDER); + + free_pages((unsigned long)pgd, PGD_ORDER); } #if CONFIG_PGTABLE_LEVELS == 3 @@ -68,43 +66,27 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) (__u32)(__pa((unsigned long)pmd) >> PxD_VALUE_SHIFT))); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_ORDER); + pmd_t *pmd; + + pmd = (pmd_t *)__get_free_pages(GFP_PGTABLE_KERNEL, PMD_ORDER); + if (pmd) + memset ((void *)pmd, 0, PAGE_SIZE << PMD_ORDER); + return pmd; } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED) { - /* - * This is the permanent pmd attached to the pgd; - * cannot free it. - * Increment the counter to compensate for the decrement - * done by generic mm code. - */ - mm_inc_nr_pmds(mm); - return; - } free_pages((unsigned long)pmd, PMD_ORDER); } - #endif static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { -#if CONFIG_PGTABLE_LEVELS == 3 - /* preserve the gateway marker if this is the beginning of - * the permanent pmd */ - if(pmd_flag(*pmd) & PxD_FLAG_ATTACHED) - set_pmd(pmd, __pmd((PxD_FLAG_PRESENT | - PxD_FLAG_VALID | - PxD_FLAG_ATTACHED) - + (__u32)(__pa((unsigned long)pte) >> PxD_VALUE_SHIFT))); - else -#endif - set_pmd(pmd, __pmd((PxD_FLAG_PRESENT | PxD_FLAG_VALID) - + (__u32)(__pa((unsigned long)pte) >> PxD_VALUE_SHIFT))); + set_pmd(pmd, __pmd((PxD_FLAG_PRESENT | PxD_FLAG_VALID) + + (__u32)(__pa((unsigned long)pte) >> PxD_VALUE_SHIFT))); } #define pmd_populate(mm, pmd, pte_page) \ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 75cf84070fc9..c08c7b37e5f4 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -23,8 +23,6 @@ #include <asm/processor.h> #include <asm/cache.h> -static inline spinlock_t *pgd_spinlock(pgd_t *); - /* * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel * memory. For the return value to be meaningful, ADDR must be >= @@ -40,14 +38,18 @@ static inline spinlock_t *pgd_spinlock(pgd_t *); */ #define kern_addr_valid(addr) (1) +/* PTE updates are protected by a lock per PGD in the page table struct. 
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 75cf84070fc9..c08c7b37e5f4 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -23,8 +23,6 @@
 #include <asm/processor.h>
 #include <asm/cache.h>

-static inline spinlock_t *pgd_spinlock(pgd_t *);
-
 /*
  * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
  * memory.  For the return value to be meaningful, ADDR must be >=
@@ -40,14 +38,18 @@ static inline spinlock_t *pgd_spinlock(pgd_t *);
  */
 #define kern_addr_valid(addr)	(1)

+/* PTE updates are protected by a lock per PGD in the page table struct. */
+extern spinlock_t pa_swapper_pg_lock;
+#ifdef CONFIG_SMP
+extern spinlock_t *pgd_spinlock(pgd_t *pgd);
+#else
+#define pgd_spinlock(x)	&pa_swapper_pg_lock
+#endif
+
 /* This is for the serialization of PxTLB broadcasts.  At least on the N class
  * systems, only one PxTLB inter processor broadcast can be active at any one
- * time on the Merced bus.
-
- * PTE updates are protected by locks in the PMD.
- */
+ * time on the Merced bus. */
 extern spinlock_t pa_tlb_flush_lock;
-extern spinlock_t pa_swapper_pg_lock;
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 extern int pa_serialize_tlb_flushes;
 #else
@@ -94,10 +96,12 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 #define set_pte_at(mm, addr, ptep, pteval)			\
 	do {							\
 		unsigned long flags;				\
-		spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\
+		spinlock_t *pgd_lock;				\
+		pgd_lock = pgd_spinlock((mm)->pgd);		\
+		spin_lock_irqsave(pgd_lock, flags);		\
 		set_pte(ptep, pteval);				\
 		purge_tlb_entries(mm, addr);			\
-		spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\
+		spin_unlock_irqrestore(pgd_lock, flags);	\
 	} while (0)

 #endif /* !__ASSEMBLY__ */
@@ -120,12 +124,10 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 #define KERNEL_INITIAL_SIZE	(1 << KERNEL_INITIAL_ORDER)

 #if CONFIG_PGTABLE_LEVELS == 3
-#define PGD_ORDER	1 /* Number of pages per pgd */
-#define PMD_ORDER	1 /* Number of pages per pmd */
-#define PGD_ALLOC_ORDER	(2 + 1) /* first pgd contains pmd */
+#define PMD_ORDER	1
+#define PGD_ORDER	0
 #else
-#define PGD_ORDER	1 /* Number of pages per pgd */
-#define PGD_ALLOC_ORDER	(PGD_ORDER + 1)
+#define PGD_ORDER	1
 #endif

 /* Definitions for 3rd level (we use PLD here for Page Lower directory
@@ -240,11 +242,9 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
  * able to effectively address 40/42/44-bits of physical address space
  * depending on 4k/16k/64k PAGE_SIZE */
 #define _PxD_PRESENT_BIT   31
-#define _PxD_ATTACHED_BIT  30
-#define _PxD_VALID_BIT     29
+#define _PxD_VALID_BIT     30

 #define PxD_FLAG_PRESENT  (1 << xlate_pabit(_PxD_PRESENT_BIT))
-#define PxD_FLAG_ATTACHED (1 << xlate_pabit(_PxD_ATTACHED_BIT))
 #define PxD_FLAG_VALID    (1 << xlate_pabit(_PxD_VALID_BIT))
 #define PxD_FLAG_MASK     (0xf)
 #define PxD_FLAG_SHIFT    (4)
@@ -326,23 +326,10 @@ extern unsigned long *empty_zero_page;
 #define pgd_flag(x)	(pgd_val(x) & PxD_FLAG_MASK)
 #define pgd_address(x)	((unsigned long)(pgd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT)

-#if CONFIG_PGTABLE_LEVELS == 3
-/* The first entry of the permanent pmd is not there if it contains
- * the gateway marker */
-#define pmd_none(x)	(!pmd_val(x) || pmd_flag(x) == PxD_FLAG_ATTACHED)
-#else
 #define pmd_none(x)	(!pmd_val(x))
-#endif
 #define pmd_bad(x)	(!(pmd_flag(x) & PxD_FLAG_VALID))
 #define pmd_present(x)	(pmd_flag(x) & PxD_FLAG_PRESENT)
 static inline void pmd_clear(pmd_t *pmd) {
-#if CONFIG_PGTABLE_LEVELS == 3
-	if (pmd_flag(*pmd) & PxD_FLAG_ATTACHED)
-		/* This is the entry pointing to the permanent pmd
-		 * attached to the pgd; cannot clear it */
-		set_pmd(pmd, __pmd(PxD_FLAG_ATTACHED));
-	else
-#endif
 		set_pmd(pmd,  __pmd(0));
 }

@@ -358,12 +345,6 @@ static inline void pmd_clear(pmd_t *pmd) {
 #define pud_bad(x)      (!(pud_flag(x) & PxD_FLAG_VALID))
 #define pud_present(x)  (pud_flag(x) & PxD_FLAG_PRESENT)
 static inline void pud_clear(pud_t *pud) {
-#if CONFIG_PGTABLE_LEVELS == 3
-	if(pud_flag(*pud) & PxD_FLAG_ATTACHED)
-		/* This is the permanent pmd attached to the pud; cannot
-		 * free it */
-		return;
-#endif
	set_pud(pud, __pud(0));
 }
 #endif
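
If I read the new orders right, this is also a nice memory win per process:
with 4k pages the 3-level top-level allocation shrinks from 2^3 = 8 pages
(32 kB for the pgd, the adjacent pmd and the lock page) to a single 4 kB page
plus a small kmalloc'ed spinlock, and the 2-level case goes from 4 pages
down to 2.
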
@@ -456,32 +437,25 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })

-
-static inline spinlock_t *pgd_spinlock(pgd_t *pgd)
-{
-	if (unlikely(pgd == swapper_pg_dir))
-		return &pa_swapper_pg_lock;
-	return (spinlock_t *)((char *)pgd + (PAGE_SIZE << (PGD_ALLOC_ORDER - 1)));
-}
-
-
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	pte_t pte;
 	unsigned long flags;
+	spinlock_t *pgd_lock;

 	if (!pte_young(*ptep))
 		return 0;

-	spin_lock_irqsave(pgd_spinlock(vma->vm_mm->pgd), flags);
+	pgd_lock = pgd_spinlock(vma->vm_mm->pgd);
+	spin_lock_irqsave(pgd_lock, flags);
 	pte = *ptep;
 	if (!pte_young(pte)) {
-		spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags);
+		spin_unlock_irqrestore(pgd_lock, flags);
 		return 0;
 	}
 	set_pte(ptep, pte_mkold(pte));
 	purge_tlb_entries(vma->vm_mm, addr);
-	spin_unlock_irqrestore(pgd_spinlock(vma->vm_mm->pgd), flags);
+	spin_unlock_irqrestore(pgd_lock, flags);
 	return 1;
 }

@@ -490,12 +464,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 {
 	pte_t old_pte;
 	unsigned long flags;
+	spinlock_t *pgd_lock;

-	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
+	pgd_lock = pgd_spinlock(mm->pgd);
+	spin_lock_irqsave(pgd_lock, flags);
 	old_pte = *ptep;
 	set_pte(ptep, __pte(0));
 	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
+	spin_unlock_irqrestore(pgd_lock, flags);

 	return old_pte;
 }
@@ -503,10 +479,13 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	unsigned long flags;
-	spin_lock_irqsave(pgd_spinlock(mm->pgd), flags);
+	spinlock_t *pgd_lock;
+
+	pgd_lock = pgd_spinlock(mm->pgd);
+	spin_lock_irqsave(pgd_lock, flags);
 	set_pte(ptep, pte_wrprotect(*ptep));
 	purge_tlb_entries(mm, addr);
-	spin_unlock_irqrestore(pgd_spinlock(mm->pgd), flags);
+	spin_unlock_irqrestore(pgd_lock, flags);
 }

 #define pte_same(A,B)	(pte_val(A) == pte_val(B))
diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c
index 305768a40773..cd2cc1b1648c 100644
--- a/arch/parisc/kernel/asm-offsets.c
+++ b/arch/parisc/kernel/asm-offsets.c
@@ -268,7 +268,6 @@ int main(void)
 	DEFINE(ASM_BITS_PER_PGD, BITS_PER_PGD);
 	DEFINE(ASM_BITS_PER_PMD, BITS_PER_PMD);
 	DEFINE(ASM_BITS_PER_PTE, BITS_PER_PTE);
-	DEFINE(ASM_PGD_PMD_OFFSET, -(PAGE_SIZE << PGD_ORDER));
 	DEFINE(ASM_PMD_ENTRY, ((PAGE_OFFSET & PMD_MASK) >> PMD_SHIFT));
 	DEFINE(ASM_PGD_ENTRY, PAGE_OFFSET >> PGDIR_SHIFT);
 	DEFINE(ASM_PGD_ENTRY_SIZE, PGD_ENTRY_SIZE);
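
The lock-then-purge sequence now repeats in set_pte_at(),
ptep_test_and_clear_young(), ptep_get_and_clear() and ptep_set_wrprotect().
Caching the pgd_spinlock() result once per function is clearly better than
evaluating it twice as before. If you ever respin, it might be worth a tiny
helper to cut the boilerplate, something like (untested sketch, name made up):

	static inline spinlock_t *pgd_lock_irqsave(struct mm_struct *mm,
						   unsigned long *flags)
	{
		spinlock_t *pgd_lock = pgd_spinlock(mm->pgd);

		spin_lock_irqsave(pgd_lock, *flags);
		return pgd_lock;
	}

Purely cosmetic though, no objection to the patch as-is.
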
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index beba9816cc6c..538114d81760 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -35,10 +35,9 @@
 	.level 2.0
 #endif

-	.import		pa_tlb_lock,data
-	.macro  load_pa_tlb_lock reg
-	mfctl		%cr25,\reg
-	addil		L%(PAGE_SIZE << (PGD_ALLOC_ORDER - 1)),\reg
+	/* Load parisc_pgd_lock address from cr28/tr4 for this pgd (cr25) */
+	.macro  load_pgd_spinlock reg
+	mfctl		%cr28,\reg
 	.endm

 	/* space_to_prot macro creates a prot id from a space id */
@@ -407,7 +406,9 @@
 # endif
 #endif
 	dep             %r0,31,PAGE_SHIFT,\pmd  /* clear offset */
+#if CONFIG_PGTABLE_LEVELS < 3
 	copy		%r0,\pte
+#endif
 	ldw,s		\index(\pmd),\pmd
 	bb,>=,n		\pmd,_PxD_PRESENT_BIT,\fault
 	dep		%r0,31,PxD_FLAG_SHIFT,\pmd	/* clear flags */
@@ -417,29 +418,14 @@
 	shladd		\index,BITS_PER_PTE_ENTRY,\pmd,\pmd /* pmd is now pte */
 	.endm

-	/* Look up PTE in a 3-Level scheme.
-	 *
-	 * Here we implement a Hybrid L2/L3 scheme: we allocate the
-	 * first pmd adjacent to the pgd.  This means that we can
-	 * subtract a constant offset to get to it.  The pmd and pgd
-	 * sizes are arranged so that a single pmd covers 4GB (giving
-	 * a full LP64 process access to 8TB) so our lookups are
-	 * effectively L2 for the first 4GB of the kernel (i.e. for
-	 * all ILP32 processes and all the kernel for machines with
-	 * under 4GB of memory) */
+	/* Look up PTE in a 3-Level scheme. */
 	.macro		L3_ptep pgd,pte,index,va,fault
-#if CONFIG_PGTABLE_LEVELS == 3 /* we might have a 2-Level scheme, e.g. with 16kb page size */
+#if CONFIG_PGTABLE_LEVELS == 3
+	copy		%r0,\pte
 	extrd,u		\va,63-ASM_PGDIR_SHIFT,ASM_BITS_PER_PGD,\index
-	extrd,u,*=	\va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
 	ldw,s		\index(\pgd),\pgd
-	extrd,u,*=	\va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
 	bb,>=,n		\pgd,_PxD_PRESENT_BIT,\fault
-	extrd,u,*=	\va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
-	shld		\pgd,PxD_VALUE_SHIFT,\index
-	extrd,u,*=	\va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
-	copy		\index,\pgd
-	extrd,u,*<>	\va,63-ASM_PGDIR_SHIFT,64-ASM_PGDIR_SHIFT,%r0
-	ldo		ASM_PGD_PMD_OFFSET(\pgd),\pgd
+	shld		\pgd,PxD_VALUE_SHIFT,\pgd
 #endif
 	L2_ptep		\pgd,\pte,\index,\va,\fault
 	.endm

@@ -448,7 +434,7 @@
 	.macro		tlb_lock	spc,ptp,pte,tmp,tmp1,fault
 #ifdef CONFIG_SMP
98:	cmpib,COND(=),n	0,\spc,2f
-	load_pa_tlb_lock \tmp
+	load_pgd_spinlock \tmp
1:	LDCW		0(\tmp),\tmp1
 	cmpib,COND(=)	0,\tmp1,1b
 	nop
@@ -480,9 +466,9 @@
 	/* Release pa_tlb_lock lock. */
 	.macro		tlb_unlock1	spc,tmp
 #ifdef CONFIG_SMP
-98:	load_pa_tlb_lock \tmp
-99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
+98:	load_pgd_spinlock \tmp
 	tlb_unlock0	\spc,\tmp
+99:	ALTERNATIVE(98b, 99b, ALT_COND_NO_SMP, INSN_NOP)
 #endif
 	.endm
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 3ec633b11b54..4f3f180b0b20 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -681,6 +681,24 @@ static void __init parisc_bootmem_free(void)
 	free_area_init(max_zone_pfn);
 }

+static void __init parisc_init_pgd_lock(void)
+{
+	struct page *page;
+
+	page = virt_to_page((unsigned long) &swapper_pg_dir);
+	page->parisc_pgd_lock = &pa_swapper_pg_lock;
+}
+
+#ifdef CONFIG_SMP
+spinlock_t *pgd_spinlock(pgd_t *pgd)
+{
+	struct page *page;
+
+	page = virt_to_page((unsigned long) pgd);
+	return page->parisc_pgd_lock;
+}
+#endif
+
 void __init paging_init(void)
 {
 	setup_bootmem();
@@ -691,6 +709,7 @@
 	sparse_init();
 	parisc_bootmem_free();
+	parisc_init_pgd_lock();
 }

 #ifdef CONFIG_PA20
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 07d9acb5b19c..d8520af04b7d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -150,6 +150,7 @@ struct page {
 			union {
 				struct mm_struct *pt_mm; /* x86 pgds only */
 				atomic_t pt_frag_refcount; /* powerpc */
+				spinlock_t *parisc_pgd_lock; /* parisc pgds only */
 			};
 #if ALLOC_SPLIT_PTLOCKS
 			spinlock_t *ptl;
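
A small nit at the end: parisc_init_pgd_lock() stores &pa_swapper_pg_lock
unconditionally, but on !SMP builds pgd_spinlock() is a macro that never reads
page->parisc_pgd_lock, so the store is dead code there. Harmless, though an
#ifdef CONFIG_SMP (or a comment) would make the intent clearer.

For my own understanding, the lock's lifetime in the SMP case now is:

	/*
	 * pgd_alloc():           kmalloc the lock, spin_lock_init(),
	 *                        virt_to_page(pgd)->parisc_pgd_lock = lock
	 * switch_mm_irqs_off():  mtctl(__pa(lock word), %cr28)
	 * TLB fault (entry.S):   mfctl %cr28, then ldcw to take the lock
	 * set_pte_at()/ptep_*(): pgd_spinlock(mm->pgd) via the struct page
	 * pgd_free():            kfree(pgd_spinlock(pgd)), free_pages(pgd)
	 */

That looks consistent to me.
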