diff mbox series

[v2,1/3] xen/riscv: introduce setup_initial_pages

Message ID 85a21ada5a0fc44bb9db1dcc1f6cf191a6e66bfb.1678984041.git.oleksii.kurochko@gmail.com (mailing list archive)
State Superseded
Headers show
Series enable MMU for RISC-V | expand

Commit Message

Oleksii Kurochko March 16, 2023, 4:43 p.m. UTC
Mostly the code for setup_initial_pages was taken from Bobby's
repo except for the following changes:
* Use only a minimal part of the code enough to enable MMU
* rename {_}setup_initial_pagetables functions
* add an argument for setup_initial_mapping to have
  an opportunity to set PTE flags.
* update setup_initial_pagetables function to map sections
  with correct PTE flags.
* introduce a separate enable_mmu() to be able to properly
  handle the case when the load start address isn't equal to
  the linker start address.
* map linker addresses range to load addresses range without
  1:1 mapping.
* add safety checks such as:
  * Xen size is less than page size
  * linker addresses range doesn't overlap load addresses
    range
* Rework macros {THIRD,SECOND,FIRST,ZEROETH}_{SHIFT,MASK}
* change PTE_LEAF_DEFAULT to RX instead of RWX.
* Remove phys_offset as it isn't used now.
* Remove alignment of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0); in
  setup_initial_mapping() as they should be already aligned.
* Remove clear_pagetables() as initial pagetables will be
  zeroed during bss initialization
* Remove __attribute__((section(".entry")) for setup_initial_pagetables()
  as there is no such section in xen.lds.S
* Update the argument of pte_is_valid() to "const pte_t *p"

Origin: https://gitlab.com/xen-on-risc-v/xen/-/tree/riscv-rebase 4af165b468af
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V2:
 * update the commit message:
 * Remove {ZEROETH,FIRST,...}_{SHIFT,MASK, SIZE,...} and
   introduce instead of them XEN_PT_LEVEL_*() and LEVEL_*
 * Rework pt_linear_offset() and pt_index based on  XEN_PT_LEVEL_*()
 * Remove clear_pagetables() functions as pagetables were zeroed during
   .bss initialization
 * Rename _setup_initial_pagetables() to setup_initial_mapping()
 * Make PTE_DEFAULT equal to RX.
 * Update prototype of setup_initial_mapping(..., bool writable) -> 
   setup_initial_mapping(..., UL flags)  
 * Update calls of setup_initial_mapping according to new prototype
 * Remove unnecessary call of:
   _setup_initial_pagetables(..., load_addr_start, load_addr_end, load_addr_start, ...)
 * Define index* in the loop of setup_initial_mapping
 * Remove attribute "__attribute__((section(".entry")))" for setup_initial_pagetables()
   as we don't have such section
 * make arguments of paddr_to_pte() and pte_is_valid() as const.
 * make xen_second_pagetable static.
 * use <xen/kernel.h> instead of declaring extern unsigned long _stext, _etext, _srodata, _erodata
 * update  'extern unsigned long __init_begin' to 'extern unsigned long __init_begin[]'
 * use aligned() instead of "__attribute__((__aligned__(PAGE_SIZE)))"
 * set __section(".bss.page_aligned") for page tables arrays
 * fix indentations
 * Change '__attribute__((section(".entry")))' to '__init'
 * Remove phys_offset as it isn't used now.
 * Remove alignment of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0); in
   setup_initial_mapping() as they should be already aligned.
 * Remove clear_pagetables() as initial pagetables will be
   zeroed during bss initialization
 * Remove __attribute__((section(".entry")) for setup_initial_pagetables()
   as there is no such section in xen.lds.S
 * Update the argument of pte_is_valid() to "const pte_t *p"
---
 xen/arch/riscv/Makefile           |   1 +
 xen/arch/riscv/include/asm/mm.h   |   8 ++
 xen/arch/riscv/include/asm/page.h |  67 +++++++++++++++++
 xen/arch/riscv/mm.c               | 121 ++++++++++++++++++++++++++++++
 xen/arch/riscv/riscv64/head.S     |  65 ++++++++++++++++
 xen/arch/riscv/xen.lds.S          |   2 +
 6 files changed, 264 insertions(+)
 create mode 100644 xen/arch/riscv/include/asm/mm.h
 create mode 100644 xen/arch/riscv/include/asm/page.h
 create mode 100644 xen/arch/riscv/mm.c

Comments

Jan Beulich March 20, 2023, 4:41 p.m. UTC | #1
On 16.03.2023 17:43, Oleksii Kurochko wrote:
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/mm.h
> @@ -0,0 +1,8 @@
> +#ifndef _ASM_RISCV_MM_H
> +#define _ASM_RISCV_MM_H
> +
> +void setup_initial_pagetables(void);
> +
> +extern void enable_mmu(void);

Nit: At least within a single header you probably want to be consistent
about the use of "extern". Personally I think it would better be omitted
from function declarations.

> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -0,0 +1,67 @@
> +#ifndef _ASM_RISCV_PAGE_H
> +#define _ASM_RISCV_PAGE_H
> +
> +#include <xen/const.h>
> +#include <xen/types.h>
> +
> +#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
> +#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES - 1))
> +
> +#define PAGE_ORDER                  (12)

DYM PAGE_SHIFT here, as used elsewhere in Xen?

Also are you aware of page-bits.h, where I think some of these constants
should go?

> +#ifdef CONFIG_RISCV_64
> +#define PAGETABLE_ORDER             (9)
> +#else /* CONFIG_RISCV_32 */
> +#define PAGETABLE_ORDER             (10)
> +#endif
> +
> +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
> +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) + PAGE_ORDER)
> +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) << LEVEL_SHIFT(lvl))
> +
> +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
> +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
> +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)

Mind me asking what these are good for? Doesn't one set of macros
suffice?

> +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) - 1))
> +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK << XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define PTE_SHIFT                   10

What does this describe? According to its single use here it may
simply require a better name.

> +#define PTE_VALID                   BIT(0, UL)
> +#define PTE_READABLE                BIT(1, UL)
> +#define PTE_WRITABLE                BIT(2, UL)
> +#define PTE_EXECUTABLE              BIT(3, UL)
> +#define PTE_USER                    BIT(4, UL)
> +#define PTE_GLOBAL                  BIT(5, UL)
> +#define PTE_ACCESSED                BIT(6, UL)
> +#define PTE_DIRTY                   BIT(7, UL)
> +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
> +
> +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE | PTE_EXECUTABLE)
> +#define PTE_TABLE                   (PTE_VALID)
> +
> +/* Calculate the offsets into the pagetables for a given VA */
> +#define pt_linear_offset(lvl, va)   ((va) >> XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) & XEN_PT_LEVEL_MASK(lvl))
> +
> +/* Page Table entry */
> +typedef struct {
> +    uint64_t pte;
> +} pte_t;

Not having read the respective spec (yet) I'm wondering if this really
is this way also for RV32 (despite the different PAGETABLE_ORDER).

> --- /dev/null
> +++ b/xen/arch/riscv/mm.c
> @@ -0,0 +1,121 @@
> +#include <xen/compiler.h>
> +#include <xen/init.h>
> +#include <xen/kernel.h>
> +#include <xen/lib.h>
> +#include <xen/page-size.h>
> +
> +#include <asm/boot-info.h>
> +#include <asm/config.h>
> +#include <asm/csr.h>
> +#include <asm/mm.h>
> +#include <asm/page.h>
> +#include <asm/traps.h>
> +
> +/*
> + * xen_second_pagetable is indexed with the VPN[2] page table entry field
> + * xen_first_pagetable is accessed from the VPN[1] page table entry field
> + * xen_zeroeth_pagetable is accessed from the VPN[0] page table entry field
> + */
> +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_second_pagetable[PAGE_ENTRIES];
> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_first_pagetable[PAGE_ENTRIES];
> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_zeroeth_pagetable[PAGE_ENTRIES];

I would assume Andrew's comment on the earlier version also extended to
the names used here (and then also to various local variables or function
parameters further down).

> +extern unsigned long __init_begin[];
> +extern unsigned long __init_end[];
> +extern unsigned char cpu0_boot_stack[STACK_SIZE];
> +
> +static void __init
> +setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
> +                      unsigned long map_start,
> +                      unsigned long map_end,
> +                      unsigned long pa_start,
> +                      unsigned long flags)
> +{
> +    unsigned long page_addr;
> +
> +    // /* align start addresses */
> +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);

It's not clear what this is about, but in any event the comment is malformed.

> +    page_addr = map_start;
> +    while ( page_addr < map_end )
> +    {
> +        unsigned long index2 = pt_index(2, page_addr);
> +        unsigned long index1 = pt_index(1, page_addr);
> +        unsigned long index0 = pt_index(0, page_addr);
> +
> +        /* Setup level2 table */
> +        second[index2] = paddr_to_pte((unsigned long)first);
> +        second[index2].pte |= PTE_TABLE;
> +
> +        /* Setup level1 table */
> +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
> +        first[index1].pte |= PTE_TABLE;

Whould it make sense to have paddr_to_pte() take attributes right
away as 2nd argument?

> +
> +

Nit: No double blank lines please.

> +        /* Setup level0 table */
> +        if ( !pte_is_valid(&zeroeth[index0]) )
> +        {
> +            /* Update level0 table */
> +            zeroeth[index0] = paddr_to_pte((page_addr - map_start) + pa_start);
> +            zeroeth[index0].pte |= flags;
> +        }
> +
> +        /* Point to next page */
> +        page_addr += XEN_PT_LEVEL_SIZE(0);
> +    }
> +}
> +
> +/*
> + * setup_initial_pagetables:
> + *
> + * Build the page tables for Xen that map the following:
> + *   load addresses to linker addresses
> + */
> +void __init setup_initial_pagetables(void)
> +{
> +    pte_t *second;
> +    pte_t *first;
> +    pte_t *zeroeth;
> +
> +    unsigned long load_addr_start   = boot_info.load_start;
> +    unsigned long load_addr_end     = boot_info.load_end;
> +    unsigned long linker_addr_start = boot_info.linker_start;
> +    unsigned long linker_addr_end   = boot_info.linker_end;
> +
> +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> +    if (load_addr_start != linker_addr_start)

Nit: Style (missing blanks).

> +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end > linker_addr_start));
> +
> +    /* Get the addresses where the page tables were loaded */
> +    second  = (pte_t *)(&xen_second_pagetable);
> +    first   = (pte_t *)(&xen_first_pagetable);
> +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);

Please avoid casts whenever possible.

> @@ -32,3 +33,67 @@ ENTRY(start)
>          add     sp, sp, t0
>  
>          tail    start_xen
> +
> +ENTRY(enable_mmu)
> +        /* Calculate physical offset between linker and load addresses */
> +        la      t0, boot_info
> +        REG_L   t1, BI_LINKER_START(t0)
> +        REG_L   t2, BI_LOAD_START(t0)
> +        sub     t1, t1, t2
> +
> +        /*
> +         * Calculate and update a linker time address of the .L_mmu_is_enabled
> +         * label and update CSR_STVEC with it.
> +         * MMU is configured in a way where linker addresses are mapped
> +         * on load addresses so case when linker addresses are not equal to
> +         * load addresses, and thereby, after MMU is enabled, it will cause
> +         * an exception and jump to linker time addresses
> +         */
> +        la      t3, .L_mmu_is_enabled
> +        add     t3, t3, t1
> +        csrw    CSR_STVEC, t3
> +
> +        /* Calculate a value for SATP register */
> +        li      t5, SATP_MODE_SV39
> +        li      t6, SATP_MODE_SHIFT
> +        sll     t5, t5, t6
> +
> +        la      t4, xen_second_pagetable
> +        srl     t4, t4, PAGE_SHIFT
> +        or      t4, t4, t5
> +        sfence.vma
> +        csrw    CSR_SATP, t4
> +
> +        .align 2
> +.L_mmu_is_enabled:
> +        /*
> +         * Stack should be re-inited as:
> +         * 1. Right now an address of the stack is relative to load time
> +         *    addresses what will cause an issue in case of load start address
> +         *    isn't equal to linker start address.
> +         * 2. Addresses in stack are all load time relative which can be an
> +         *    issue in case when load start address isn't equal to linker
> +         *    start address.
> +         */
> +        la      sp, cpu0_boot_stack
> +        li      t0, STACK_SIZE
> +        add     sp, sp, t0
> +
> +        /*
> +         * Re-init an address of exception handler as it was overwritten  with
> +         * the address of the .L_mmu_is_enabled label.
> +         * Before jump to trap_init save return address of enable_mmu() to
> +         * know where we should back after enable_mmu() will be finished.
> +         */
> +        mv      s0, ra

Don't you need to preserve s0 for your caller?

> +        lla     t0, trap_init

Any reason for lla here when elsewhere above you use la? And aren't ...

> +        jalr    ra, t0

... these two together "call" anyway?

> +        /*
> +         * Re-calculate the return address of enable_mmu() function for case
> +         * when linker start address isn't equal to load start address
> +         */
> +        add     s0, s0, t1
> +        mv      ra, s0

"add ra, s0, t1"?

But then - can't t1 be clobbered by trap_init()?

> --- a/xen/arch/riscv/xen.lds.S
> +++ b/xen/arch/riscv/xen.lds.S
> @@ -179,3 +179,5 @@ SECTIONS
>  
>  ASSERT(!SIZEOF(.got),      ".got non-empty")
>  ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
> +
> +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot assumptions")

Again the question whether this is also applicable to RV32.

Jan
Oleksii Kurochko March 21, 2023, 5:08 p.m. UTC | #2
On Mon, 2023-03-20 at 17:41 +0100, Jan Beulich wrote:
> On 16.03.2023 17:43, Oleksii Kurochko wrote:
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/mm.h
> > @@ -0,0 +1,8 @@
> > +#ifndef _ASM_RISCV_MM_H
> > +#define _ASM_RISCV_MM_H
> > +
> > +void setup_initial_pagetables(void);
> > +
> > +extern void enable_mmu(void);
> 
> Nit: At least within a single header you probably want to be
> consistent
> about the use of "extern". Personally I think it would better be
> omitted
> from function declarations.
Agree. It would be better to remove extern.

> 
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/page.h
> > @@ -0,0 +1,67 @@
> > +#ifndef _ASM_RISCV_PAGE_H
> > +#define _ASM_RISCV_PAGE_H
> > +
> > +#include <xen/const.h>
> > +#include <xen/types.h>
> > +
> > +#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
> > +#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES
> > - 1))
> > +
> > +#define PAGE_ORDER                  (12)
> 
> DYM PAGE_SHIFT here, as used elsewhere in Xen?
yes, PAGE_SHIFT is meant here.

> 
> Also are you aware of page-bits.h, where I think some of these
> constants
> should go?
I'll move some constants to page-bits.h

> 
> > +#ifdef CONFIG_RISCV_64
> > +#define PAGETABLE_ORDER             (9)
> > +#else /* CONFIG_RISCV_32 */
> > +#define PAGETABLE_ORDER             (10)
> > +#endif
> > +
> > +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
> > +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) +
> > PAGE_ORDER)
> > +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) <<
> > LEVEL_SHIFT(lvl))
> > +
> > +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
> > +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
> > +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
> 
> Mind me asking what these are good for? Doesn't one set of macros
> suffice?
Do you mean XEN_PT_LEVEL_{SHIFT...}? Only one set of macros can be
used. I'll check how they are used in ARM (I copied those macros from
there).
> 
> > +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) -
> > 1))
> > +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK <<
> > XEN_PT_LEVEL_SHIFT(lvl))
> > +
> > +#define PTE_SHIFT                   10
> 
> What does this describe? According to its single use here it may
> simply require a better name.
If you look at the Sv39 page table entry in riscv-privileged.pdf, it has
the following layout:
  63 62-61  60-54    53-28  27-19  18-10  9-8 7 6 5 4 3 2 1 0
  N  PBMT   Reserved PPN[2] PPN[1] PPN[0] RSW D A G U X W R V
So 10 means bits 0-9, i.e. the flag bits below PPN[0].

> 
> > +#define PTE_VALID                   BIT(0, UL)
> > +#define PTE_READABLE                BIT(1, UL)
> > +#define PTE_WRITABLE                BIT(2, UL)
> > +#define PTE_EXECUTABLE              BIT(3, UL)
> > +#define PTE_USER                    BIT(4, UL)
> > +#define PTE_GLOBAL                  BIT(5, UL)
> > +#define PTE_ACCESSED                BIT(6, UL)
> > +#define PTE_DIRTY                   BIT(7, UL)
> > +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
> > +
> > +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE |
> > PTE_EXECUTABLE)
> > +#define PTE_TABLE                   (PTE_VALID)
> > +
> > +/* Calculate the offsets into the pagetables for a given VA */
> > +#define pt_linear_offset(lvl, va)   ((va) >>
> > XEN_PT_LEVEL_SHIFT(lvl))
> > +
> > +#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) &
> > XEN_PT_LEVEL_MASK(lvl))
> > +
> > +/* Page Table entry */
> > +typedef struct {
> > +    uint64_t pte;
> > +} pte_t;
> 
> Not having read the respective spec (yet) I'm wondering if this
> really
> is this way also for RV32 (despite the different PAGETABLE_ORDER).
The RISC-V architecture supports several MMU modes to translate VA to PA.
The possible modes are: Sv32, Sv39, Sv48 and Sv57, and the PTE size is
8 bytes in all cases except Sv32 (where it is 4 bytes). But I looked at
several big projects that have RISC-V support and none of them supports
Sv32.

So it will be correct for RV32 as it supports the same Sv32 and Sv39
modes too.
> 
> > --- /dev/null
> > +++ b/xen/arch/riscv/mm.c
> > @@ -0,0 +1,121 @@
> > +#include <xen/compiler.h>
> > +#include <xen/init.h>
> > +#include <xen/kernel.h>
> > +#include <xen/lib.h>
> > +#include <xen/page-size.h>
> > +
> > +#include <asm/boot-info.h>
> > +#include <asm/config.h>
> > +#include <asm/csr.h>
> > +#include <asm/mm.h>
> > +#include <asm/page.h>
> > +#include <asm/traps.h>
> > +
> > +/*
> > + * xen_second_pagetable is indexed with the VPN[2] page table
> > entry field
> > + * xen_first_pagetable is accessed from the VPN[1] page table
> > entry field
> > + * xen_zeroeth_pagetable is accessed from the VPN[0] page table
> > entry field
> > + */
> > +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_second_pagetable[PAGE_ENTRIES];
> > +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_first_pagetable[PAGE_ENTRIES];
> > +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_zeroeth_pagetable[PAGE_ENTRIES];
> 
> I would assume Andrew's comment on the earlier version also extended
> to
> the names used here (and then also to various local variables or
> function
> parameters further down).
Thanks for clarification. It looks like I misunderstood him.

> 
> > +extern unsigned long __init_begin[];
> > +extern unsigned long __init_end[];
> > +extern unsigned char cpu0_boot_stack[STACK_SIZE];
> > +
> > +static void __init
> > +setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
> > +                      unsigned long map_start,
> > +                      unsigned long map_end,
> > +                      unsigned long pa_start,
> > +                      unsigned long flags)
> > +{
> > +    unsigned long page_addr;
> > +
> > +    // /* align start addresses */
> > +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
> 
> It's not clear what this is about, but in any event the comment is
> malformed.
I forgot to remove so it should be removed and it wasn't supposed to be
in the final code after the comment for patch v1.
> 
> > +    page_addr = map_start;
> > +    while ( page_addr < map_end )
> > +    {
> > +        unsigned long index2 = pt_index(2, page_addr);
> > +        unsigned long index1 = pt_index(1, page_addr);
> > +        unsigned long index0 = pt_index(0, page_addr);
> > +
> > +        /* Setup level2 table */
> > +        second[index2] = paddr_to_pte((unsigned long)first);
> > +        second[index2].pte |= PTE_TABLE;
> > +
> > +        /* Setup level1 table */
> > +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
> > +        first[index1].pte |= PTE_TABLE;
> 
> Whould it make sense to have paddr_to_pte() take attributes right
> away as 2nd argument?
Yes, it makes sense. It is always .pte |= FLAG after paddr_to_pte().
I'll update the macros.
> 
> > +
> > +
> 
> Nit: No double blank lines please.
Sure. Thanks.

> 
> > +        /* Setup level0 table */
> > +        if ( !pte_is_valid(&zeroeth[index0]) )
> > +        {
> > +            /* Update level0 table */
> > +            zeroeth[index0] = paddr_to_pte((page_addr - map_start)
> > + pa_start);
> > +            zeroeth[index0].pte |= flags;
> > +        }
> > +
> > +        /* Point to next page */
> > +        page_addr += XEN_PT_LEVEL_SIZE(0);
> > +    }
> > +}
> > +
> > +/*
> > + * setup_initial_pagetables:
> > + *
> > + * Build the page tables for Xen that map the following:
> > + *   load addresses to linker addresses
> > + */
> > +void __init setup_initial_pagetables(void)
> > +{
> > +    pte_t *second;
> > +    pte_t *first;
> > +    pte_t *zeroeth;
> > +
> > +    unsigned long load_addr_start   = boot_info.load_start;
> > +    unsigned long load_addr_end     = boot_info.load_end;
> > +    unsigned long linker_addr_start = boot_info.linker_start;
> > +    unsigned long linker_addr_end   = boot_info.linker_end;
> > +
> > +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> > +    if (load_addr_start != linker_addr_start)
> 
> Nit: Style (missing blanks).
Thanks for noticing.

> 
> > +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end
> > > linker_addr_start));
> > +
> > +    /* Get the addresses where the page tables were loaded */
> > +    second  = (pte_t *)(&xen_second_pagetable);
> > +    first   = (pte_t *)(&xen_first_pagetable);
> > +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
> 
> Please avoid casts whenever possible.
I'll take into account during the work on new patch version. Thanks.
> 
> > @@ -32,3 +33,67 @@ ENTRY(start)
> >          add     sp, sp, t0
> >  
> >          tail    start_xen
> > +
> > +ENTRY(enable_mmu)
> > +        /* Calculate physical offset between linker and load
> > addresses */
> > +        la      t0, boot_info
> > +        REG_L   t1, BI_LINKER_START(t0)
> > +        REG_L   t2, BI_LOAD_START(t0)
> > +        sub     t1, t1, t2
> > +
> > +        /*
> > +         * Calculate and update a linker time address of the
> > .L_mmu_is_enabled
> > +         * label and update CSR_STVEC with it.
> > +         * MMU is configured in a way where linker addresses are
> > mapped
> > +         * on load addresses so case when linker addresses are not
> > equal to
> > +         * load addresses, and thereby, after MMU is enabled, it
> > will cause
> > +         * an exception and jump to linker time addresses
> > +         */
> > +        la      t3, .L_mmu_is_enabled
> > +        add     t3, t3, t1
> > +        csrw    CSR_STVEC, t3
> > +
> > +        /* Calculate a value for SATP register */
> > +        li      t5, SATP_MODE_SV39
> > +        li      t6, SATP_MODE_SHIFT
> > +        sll     t5, t5, t6
> > +
> > +        la      t4, xen_second_pagetable
> > +        srl     t4, t4, PAGE_SHIFT
> > +        or      t4, t4, t5
> > +        sfence.vma
> > +        csrw    CSR_SATP, t4
> > +
> > +        .align 2
> > +.L_mmu_is_enabled:
> > +        /*
> > +         * Stack should be re-inited as:
> > +         * 1. Right now an address of the stack is relative to
> > load time
> > +         *    addresses what will cause an issue in case of load
> > start address
> > +         *    isn't equal to linker start address.
> > +         * 2. Addresses in stack are all load time relative which
> > can be an
> > +         *    issue in case when load start address isn't equal to
> > linker
> > +         *    start address.
> > +         */
> > +        la      sp, cpu0_boot_stack
> > +        li      t0, STACK_SIZE
> > +        add     sp, sp, t0
> > +
> > +        /*
> > +         * Re-init an address of exception handler as it was
> > overwritten  with
> > +         * the address of the .L_mmu_is_enabled label.
> > +         * Before jump to trap_init save return address of
> > enable_mmu() to
> > +         * know where we should back after enable_mmu() will be
> > finished.
> > +         */
> > +        mv      s0, ra
> 
> Don't you need to preserve s0 for your caller?
Yeah, it is needed according to RISC-V ABI spec. Thanks.
> 
> > +        lla     t0, trap_init
> 
> Any reason for lla here when elsewhere above you use la? And aren't
There is no reason for that after the patches which resolve an issue
with the la pseudo-instruction were introduced:
https://lore.kernel.org/xen-devel/cover.1678970065.git.oleksii.kurochko@gmail.com/
> ...
> 
> > +        jalr    ra, t0
> 
> ... these two together "call" anyway?
I think it can be replaced with "call".
> 
> > +        /*
> > +         * Re-calculate the return address of enable_mmu()
> > function for case
> > +         * when linker start address isn't equal to load start
> > address
> > +         */
> > +        add     s0, s0, t1
> > +        mv      ra, s0
> 
> "add ra, s0, t1"?
> 
> But then - can't t1 be clobbered by trap_init()?
Yeah, it can. I didn't think about that.
Thanks. I'll update that in new version of the patch series.
> 
> > --- a/xen/arch/riscv/xen.lds.S
> > +++ b/xen/arch/riscv/xen.lds.S
> > @@ -179,3 +179,5 @@ SECTIONS
> >  
> >  ASSERT(!SIZEOF(.got),      ".got non-empty")
> >  ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
> > +
> > +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot
> > assumptions")
> 
> Again the question whether this is also applicable to RV32.
As mentioned before, it will be applicable to RV32 only in case Sv32
isn't used as the addressing mode. Otherwise it should be 2^10 * 2^12 =
4 MB.
But as I mentioned before, I am not sure that we will support Sv32.
Probably it would be good to add #ifdef...

~ Oleksii
Julien Grall March 21, 2023, 5:58 p.m. UTC | #3
Hi,

I will try to not repeat the comment already made.

On 16/03/2023 16:43, Oleksii Kurochko wrote:
> Mostly the code for setup_initial_pages was taken from Bobby's
> repo except for the following changes:
> * Use only a minimal part of the code enough to enable MMU
> * rename {_}setup_initial_pagetables functions
> * add an argument for setup_initial_mapping to have
>    an opportunity to make set PTE flags.
> * update setup_initial_pagetables function to map sections
>    with correct PTE flags.
> * introduce separate enable_mmu() to be able for proper
>    handling the case when load start address isn't equal to
>    linker start address.
> * map linker addresses range to load addresses range without
>    1:1 mapping.
> * add safety checks such as:
>    * Xen size is less than page size
>    * linker addresses range doesn't overlap load addresses
>      range
> * Rework macros {THIRD,SECOND,FIRST,ZEROETH}_{SHIFT,MASK}
> * change PTE_LEAF_DEFAULT to RX instead of RWX.
> * Remove phys_offset as it isn't used now.
> * Remove alignment  of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0); in
>    setup_inital_mapping() as they should be already aligned.
> * Remove clear_pagetables() as initial pagetables will be
>    zeroed during bss initialization
> * Remove __attribute__((section(".entry")) for setup_initial_pagetables()
>    as there is no such section in xen.lds.S
> * Update the argument of pte_is_valid() to "const pte_t *p"
> 
> Origin: https://gitlab.com/xen-on-risc-v/xen/-/tree/riscv-rebase 4af165b468af
> Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> ---
> Changes in V2:
>   * update the commit message:
>   * Remove {ZEROETH,FIRST,...}_{SHIFT,MASK, SIZE,...} and
>     introduce instead of them XEN_PT_LEVEL_*() and LEVEL_*
>   * Rework pt_linear_offset() and pt_index based on  XEN_PT_LEVEL_*()
>   * Remove clear_pagetables() functions as pagetables were zeroed during
>     .bss initialization
>   * Rename _setup_initial_pagetables() to setup_initial_mapping()
>   * Make PTE_DEFAULT equal to RX.
>   * Update prototype of setup_initial_mapping(..., bool writable) ->
>     setup_initial_mapping(..., UL flags)
>   * Update calls of setup_initial_mapping according to new prototype
>   * Remove unnecessary call of:
>     _setup_initial_pagetables(..., load_addr_start, load_addr_end, load_addr_start, ...)
>   * Define index* in the loop of setup_initial_mapping
>   * Remove attribute "__attribute__((section(".entry")))" for setup_initial_pagetables()
>     as we don't have such section
>   * make arguments of paddr_to_pte() and pte_is_valid() as const.
>   * make xen_second_pagetable static.
>   * use <xen/kernel.h> instead of declaring extern unsigned long _stext, 0etext, _srodata, _erodata
>   * update  'extern unsigned long __init_begin' to 'extern unsigned long __init_begin[]'
>   * use aligned() instead of "__attribute__((__aligned__(PAGE_SIZE)))"
>   * set __section(".bss.page_aligned") for page tables arrays
>   * fix identatations
>   * Change '__attribute__((section(".entry")))' to '__init'
>   * Remove phys_offset as it isn't used now.
>   * Remove alignment  of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0); in
>     setup_inital_mapping() as they should be already aligned.
>   * Remove clear_pagetables() as initial pagetables will be
>     zeroed during bss initialization
>   * Remove __attribute__((section(".entry")) for setup_initial_pagetables()
>     as there is no such section in xen.lds.S
>   * Update the argument of pte_is_valid() to "const pte_t *p"
> ---
>   xen/arch/riscv/Makefile           |   1 +
>   xen/arch/riscv/include/asm/mm.h   |   8 ++
>   xen/arch/riscv/include/asm/page.h |  67 +++++++++++++++++
>   xen/arch/riscv/mm.c               | 121 ++++++++++++++++++++++++++++++
>   xen/arch/riscv/riscv64/head.S     |  65 ++++++++++++++++
>   xen/arch/riscv/xen.lds.S          |   2 +
>   6 files changed, 264 insertions(+)
>   create mode 100644 xen/arch/riscv/include/asm/mm.h
>   create mode 100644 xen/arch/riscv/include/asm/page.h
>   create mode 100644 xen/arch/riscv/mm.c
> 
> diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
> index 443f6bf15f..956ceb02df 100644
> --- a/xen/arch/riscv/Makefile
> +++ b/xen/arch/riscv/Makefile
> @@ -1,5 +1,6 @@
>   obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
>   obj-y += entry.o
> +obj-y += mm.o
>   obj-$(CONFIG_RISCV_64) += riscv64/
>   obj-y += sbi.o
>   obj-y += setup.o
> diff --git a/xen/arch/riscv/include/asm/mm.h b/xen/arch/riscv/include/asm/mm.h
> new file mode 100644
> index 0000000000..3cc98fe45b
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/mm.h
> @@ -0,0 +1,8 @@
> +#ifndef _ASM_RISCV_MM_H
> +#define _ASM_RISCV_MM_H
> +
> +void setup_initial_pagetables(void);
> +
> +extern void enable_mmu(void);
> +
> +#endif /* _ASM_RISCV_MM_H */
> diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
> new file mode 100644
> index 0000000000..fb8329a191
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -0,0 +1,67 @@
> +#ifndef _ASM_RISCV_PAGE_H
> +#define _ASM_RISCV_PAGE_H
> +
> +#include <xen/const.h>
> +#include <xen/types.h>
> +
> +#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
> +#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES - 1))
> +
> +#define PAGE_ORDER                  (12)
> +
> +#ifdef CONFIG_RISCV_64
> +#define PAGETABLE_ORDER             (9)
> +#else /* CONFIG_RISCV_32 */
> +#define PAGETABLE_ORDER             (10)
> +#endif
> +
> +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
> +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) + PAGE_ORDER)
> +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) << LEVEL_SHIFT(lvl))
> +
> +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
> +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
> +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
> +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) - 1))
> +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK << XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define PTE_SHIFT                   10
> +
> +#define PTE_VALID                   BIT(0, UL)
> +#define PTE_READABLE                BIT(1, UL)
> +#define PTE_WRITABLE                BIT(2, UL)
> +#define PTE_EXECUTABLE              BIT(3, UL)
> +#define PTE_USER                    BIT(4, UL)
> +#define PTE_GLOBAL                  BIT(5, UL)
> +#define PTE_ACCESSED                BIT(6, UL)
> +#define PTE_DIRTY                   BIT(7, UL)
> +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
> +
> +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE | PTE_EXECUTABLE)
> +#define PTE_TABLE                   (PTE_VALID)
> +
> +/* Calculate the offsets into the pagetables for a given VA */
> +#define pt_linear_offset(lvl, va)   ((va) >> XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) & XEN_PT_LEVEL_MASK(lvl))
> +
> +/* Page Table entry */
> +typedef struct {
> +    uint64_t pte;
> +} pte_t;
> +
> +/* Shift the VPN[x] or PPN[x] fields of a virtual or physical address
> + * to become the shifted PPN[x] fields of a page table entry */
> +#define addr_to_ppn(x) (((x) >> PAGE_SHIFT) << PTE_SHIFT)
> +
> +static inline pte_t paddr_to_pte(const unsigned long paddr)
> +{
> +    return (pte_t) { .pte = addr_to_ppn(paddr) };
> +}
> +
> +static inline bool pte_is_valid(const pte_t *p)
> +{
> +    return p->pte & PTE_VALID;
> +}
> +
> +#endif /* _ASM_RISCV_PAGE_H */
> diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
> new file mode 100644
> index 0000000000..0df6b47441
> --- /dev/null
> +++ b/xen/arch/riscv/mm.c
> @@ -0,0 +1,121 @@
> +#include <xen/compiler.h>
> +#include <xen/init.h>
> +#include <xen/kernel.h>
> +#include <xen/lib.h>
> +#include <xen/page-size.h>
> +
> +#include <asm/boot-info.h>
> +#include <asm/config.h>
> +#include <asm/csr.h>
> +#include <asm/mm.h>
> +#include <asm/page.h>
> +#include <asm/traps.h>
> +
> +/*
> + * xen_second_pagetable is indexed with the VPN[2] page table entry field
> + * xen_first_pagetable is accessed from the VPN[1] page table entry field
> + * xen_zeroeth_pagetable is accessed from the VPN[0] page table entry field
> + */
> +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_second_pagetable[PAGE_ENTRIES];
> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_first_pagetable[PAGE_ENTRIES];
> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +    xen_zeroeth_pagetable[PAGE_ENTRIES];
> +
> +extern unsigned long __init_begin[];
> +extern unsigned long __init_end[];
> +extern unsigned char cpu0_boot_stack[STACK_SIZE];
> +
> +static void __init
> +setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
> +                      unsigned long map_start,
> +                      unsigned long map_end,
> +                      unsigned long pa_start,
> +                      unsigned long flags)
> +{
> +    unsigned long page_addr;
> +
> +    // /* align start addresses */
> +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);

They should be switched to ASSERT() or BUG_ON().

> +
> +    page_addr = map_start;
> +    while ( page_addr < map_end )

This loop continues to assume that the mapping can only fit in a single 2MB 
section (or less if the start is not 2MB aligned).

I am OK if you want to assume it, but I think this should be documented 
(with words and ASSERT()/BUG_ON()) to avoid any mistake.

> +    {
> +        unsigned long index2 = pt_index(2, page_addr);
> +        unsigned long index1 = pt_index(1, page_addr);
> +        unsigned long index0 = pt_index(0, page_addr);
> +
> +        /* Setup level2 table */
> +        second[index2] = paddr_to_pte((unsigned long)first);
> +        second[index2].pte |= PTE_TABLE;
> +
> +        /* Setup level1 table */
> +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
> +        first[index1].pte |= PTE_TABLE;
> +
> +

NIT: Spurious line?

> +        /* Setup level0 table */
> +        if ( !pte_is_valid(&zeroeth[index0]) )

On the previous version, you said it should be checked for each level. 
But here you still only check for a single level.

Furthermore, I would strongly suggest to also check the valid PTE is the 
same as you intend to write to catch any override (they are a pain to 
debug).

> +        {
> +            /* Update level0 table */
> +            zeroeth[index0] = paddr_to_pte((page_addr - map_start) + pa_start);
> +            zeroeth[index0].pte |= flags;
> +        }
> +
> +        /* Point to next page */
> +        page_addr += XEN_PT_LEVEL_SIZE(0);
> +    }
> +}
> +
> +/*
> + * setup_initial_pagetables:
> + *
> + * Build the page tables for Xen that map the following:
> + *   load addresses to linker addresses
> + */
> +void __init setup_initial_pagetables(void)
> +{
> +    pte_t *second;
> +    pte_t *first;
> +    pte_t *zeroeth;
> +
> +    unsigned long load_addr_start   = boot_info.load_start;
> +    unsigned long load_addr_end     = boot_info.load_end;
> +    unsigned long linker_addr_start = boot_info.linker_start;
> +    unsigned long linker_addr_end   = boot_info.linker_end;
> +
> +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> +    if (load_addr_start != linker_addr_start)
> +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end > linker_addr_start));

I would suggest switching to a panic() with an error message, as this 
would help the user understand what is breaking.

Alternatively, you could document what this check is for.

> +
> +    /* Get the addresses where the page tables were loaded */
> +    second  = (pte_t *)(&xen_second_pagetable);
> +    first   = (pte_t *)(&xen_first_pagetable);
> +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
> +
> +    setup_initial_mapping(second, first, zeroeth,
> +                          LOAD_TO_LINK((unsigned long)&_stext),
> +                          LOAD_TO_LINK((unsigned long)&_etext),
> +                          (unsigned long)&_stext,
> +                          PTE_LEAF_DEFAULT);
> +
> +    setup_initial_mapping(second, first, zeroeth,
> +                          LOAD_TO_LINK((unsigned long)&__init_begin),
> +                          LOAD_TO_LINK((unsigned long)&__init_end),
> +                          (unsigned long)&__init_begin,
> +                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
> +
> +    setup_initial_mapping(second, first, zeroeth,
> +                          LOAD_TO_LINK((unsigned long)&_srodata),
> +                          LOAD_TO_LINK((unsigned long)&_erodata),
> +                          (unsigned long)(&_srodata),
> +                          PTE_VALID | PTE_READABLE);
> +
> +    setup_initial_mapping(second, first, zeroeth,
> +                          linker_addr_start,
> +                          linker_addr_end,
> +                          load_addr_start,
> +                          PTE_LEAF_DEFAULT | PTE_READABLE);

As I said in v1, you need to use a different set of page-table here. 
Also, where do you guarantee that Xen will be loaded at a 2MB-aligned 
address? (I know for a fact that UEFI only guarantees 4KB alignment.)

> +}
> diff --git a/xen/arch/riscv/riscv64/head.S b/xen/arch/riscv/riscv64/head.S
> index 8887f0cbd4..f4a0582727 100644
> --- a/xen/arch/riscv/riscv64/head.S
> +++ b/xen/arch/riscv/riscv64/head.S
> @@ -1,4 +1,5 @@
>   #include <asm/asm.h>
> +#include <asm/asm-offsets.h>
>   #include <asm/riscv_encoding.h>
>   
>           .section .text.header, "ax", %progbits
> @@ -32,3 +33,67 @@ ENTRY(start)
>           add     sp, sp, t0
>   
>           tail    start_xen
> +
> +ENTRY(enable_mmu)
> +        /* Calculate physical offset between linker and load addresses */
> +        la      t0, boot_info
> +        REG_L   t1, BI_LINKER_START(t0)
> +        REG_L   t2, BI_LOAD_START(t0)
> +        sub     t1, t1, t2
> +
> +        /*
> +         * Calculate and update a linker time address of the .L_mmu_is_enabled
> +         * label and update CSR_STVEC with it.
> +         * MMU is configured in a way where linker addresses are mapped
> +         * on load addresses so case when linker addresses are not equal to
> +         * load addresses, and thereby, after MMU is enabled, it will cause
> +         * an exception and jump to linker time addresses
> +         */
> +        la      t3, .L_mmu_is_enabled
> +        add     t3, t3, t1
> +        csrw    CSR_STVEC, t3
> +
> +        /* Calculate a value for SATP register */
> +        li      t5, SATP_MODE_SV39
> +        li      t6, SATP_MODE_SHIFT
> +        sll     t5, t5, t6
> +
> +        la      t4, xen_second_pagetable
> +        srl     t4, t4, PAGE_SHIFT
> +        or      t4, t4, t5
> +        sfence.vma
> +        csrw    CSR_SATP, t4
> +
> +        .align 2
> +.L_mmu_is_enabled:
> +        /*
> +         * Stack should be re-inited as:
> +         * 1. Right now an address of the stack is relative to load time
> +         *    addresses what will cause an issue in case of load start address
> +         *    isn't equal to linker start address.
> +         * 2. Addresses in stack are all load time relative which can be an
> +         *    issue in case when load start address isn't equal to linker
> +         *    start address.
> +         */

Hmmm... I am not sure you can reset the stack and then return to the 
caller, because it may have stashed some variables on the stack.

So if you want to reset, then you should jump to a brand new function.

> +        la      sp, cpu0_boot_stack
> +        li      t0, STACK_SIZE
> +        add     sp, sp, t0
> +
> +        /*
> +         * Re-init an address of exception handler as it was overwritten  with
> +         * the address of the .L_mmu_is_enabled label.
> +         * Before jump to trap_init save return address of enable_mmu() to
> +         * know where we should back after enable_mmu() will be finished.
> +         */
> +        mv      s0, ra
> +        lla     t0, trap_init
> +        jalr    ra, t0
> +
> +        /*
> +         * Re-calculate the return address of enable_mmu() function for case
> +         * when linker start address isn't equal to load start address
> +         */
> +        add     s0, s0, t1
> +        mv      ra, s0
> +
> +        ret

Missing ENDPROC?

> diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
> index eed457c492..e4ac4e84b6 100644
> --- a/xen/arch/riscv/xen.lds.S
> +++ b/xen/arch/riscv/xen.lds.S
> @@ -179,3 +179,5 @@ SECTIONS
>   
>   ASSERT(!SIZEOF(.got),      ".got non-empty")
>   ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
> +
> +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot assumptions")

Cheers,
Jan Beulich March 22, 2023, 8:12 a.m. UTC | #4
On 21.03.2023 18:08, Oleksii wrote:
> On Mon, 2023-03-20 at 17:41 +0100, Jan Beulich wrote:
>> On 16.03.2023 17:43, Oleksii Kurochko wrote:
>>> +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
>>> +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) +
>>> PAGE_ORDER)
>>> +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) <<
>>> LEVEL_SHIFT(lvl))
>>> +
>>> +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
>>> +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
>>> +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
>>
>> Mind me asking what these are good for? Doesn't one set of macros
>> suffice?
> Do you mean XEN_PT_LEVEL_{SHIFT...}? it can be used only one pair of
> macros. I'll check how they are used in ARM ( I copied that macros from
> there ).

There's no similar duplication in Arm code: They have LEVEL_..._GS(),
but that takes a second parameter.

>>> +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) -
>>> 1))
>>> +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK <<
>>> XEN_PT_LEVEL_SHIFT(lvl))
>>> +
>>> +#define PTE_SHIFT                   10
>>
>> What does this describe? According to its single use here it may
>> simply require a better name.
> If look at Sv39 page table entry in riscv-priviliged.pdf. It has the
> following description:
>   63 62 61  60    54  53  28 27  19 18  10 9 8 7 6 5 4 3 2 1 0
>   N  PBMT   Rererved  PPN[2] PPN[1] PPN[0] RSW D A G U X W R V
> So 10 it means the 0-9 bits.

Right. As said, I think the name needs improving, so it becomes clear
what it refers to. Possibly PTE_PPN_SHIFT?

Another question: Do you really mean to only support Sv39?

>>> +/* Page Table entry */
>>> +typedef struct {
>>> +    uint64_t pte;
>>> +} pte_t;
>>
>> Not having read the respective spec (yet) I'm wondering if this
>> really
>> is this way also for RV32 (despite the different PAGETABLE_ORDER).
> RISC-V architecture support several MMU mode to translate VA to PA.
> The following mode can be: Sv32, Sv39, Sv48, Sv57 and PAGESIZE is equal
> to 8 in all cases except Sv32 ( it is equal to 4 ).

I guess you mean PTESIZE.

> but I looked at
> different big projects who have RISC-V support and no one supports
> Sv32.
> 
> So it will be correct for RV32 as it supports the same Sv32 and Sv39
> modes too.

Would you mind extending the comment then to mention that there's no
intention to support Sv32, even on RV32? (Alternatively, as per a
remark you had further down, some #ifdef-ary may be needed.)

Jan
Jan Beulich March 22, 2023, 8:19 a.m. UTC | #5
On 21.03.2023 18:58, Julien Grall wrote:
> Also, where do you guarantee that Xen will be loaded at a 2MB aligned 
> address? (For a fact I know that UEFI is only guarantee 4KB alignment).

I don't think this is true. We rely on UEFI honoring larger alignment on
x86, and I believe the PE loader code is uniform across architectures.
Of course you need to specify large enough alignment in the PE header's
SectionAlignment field (right now in the Arm64 binary it's only 4k).

Jan
Oleksii Kurochko March 22, 2023, 9:22 a.m. UTC | #6
On Wed, 2023-03-22 at 09:12 +0100, Jan Beulich wrote:
> On 21.03.2023 18:08, Oleksii wrote:
> > On Mon, 2023-03-20 at 17:41 +0100, Jan Beulich wrote:
> > > On 16.03.2023 17:43, Oleksii Kurochko wrote:
> > > > +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
> > > > +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) +
> > > > PAGE_ORDER)
> > > > +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) <<
> > > > LEVEL_SHIFT(lvl))
> > > > +
> > > > +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
> > > > +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
> > > > +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
> > > 
> > > Mind me asking what these are good for? Doesn't one set of macros
> > > suffice?
> > Do you mean XEN_PT_LEVEL_{SHIFT...}? it can be used only one pair
> > of
> > macros. I'll check how they are used in ARM ( I copied that macros
> > from
> > there ).
> 
> There's no similar duplication in Arm code: They have LEVEL_..._GS(),
> but that takes a second parameter.
> 
> > > > +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl)
> > > > -
> > > > 1))
> > > > +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK <<
> > > > XEN_PT_LEVEL_SHIFT(lvl))
> > > > +
> > > > +#define PTE_SHIFT                   10
> > > 
> > > What does this describe? According to its single use here it may
> > > simply require a better name.
> > If look at Sv39 page table entry in riscv-priviliged.pdf. It has
> > the
> > following description:
> >   63 62 61  60    54  53  28 27  19 18  10 9 8 7 6 5 4 3 2 1 0
> >   N  PBMT   Rererved  PPN[2] PPN[1] PPN[0] RSW D A G U X W R V
> > So 10 it means the 0-9 bits.
> 
> Right. As said, I think the name needs improving, so it becomes clear
> what it refers to. Possibly PTE_PPN_SHIFT?
That would be better, so I'll update it in the new version of the patch series.
> 
> Another question: Do you really mean to only support Sv39?
At least for now yes, it looks like it is the most usable mode.
> 
> > > > +/* Page Table entry */
> > > > +typedef struct {
> > > > +    uint64_t pte;
> > > > +} pte_t;
> > > 
> > > Not having read the respective spec (yet) I'm wondering if this
> > > really
> > > is this way also for RV32 (despite the different
> > > PAGETABLE_ORDER).
> > RISC-V architecture support several MMU mode to translate VA to PA.
> > The following mode can be: Sv32, Sv39, Sv48, Sv57 and PAGESIZE is
> > equal
> > to 8 in all cases except Sv32 ( it is equal to 4 ).
> 
> I guess you mean PTESIZE.
Yes, I mean PTESIZE.
> 
> > but I looked at
> > different big projects who have RISC-V support and no one supports
> > Sv32.
> > 
> > So it will be correct for RV32 as it supports the same Sv32 and
> > Sv39
> > modes too.
> 
> Would you mind extending the comment then to mention that there's no
> intention to support Sv32, even on RV32? (Alternatively, as per a
> remark you had further down, some #ifdef-ary may be needed.)
I re-read the documentation — I gave you incorrect information earlier:

When SXLEN=32, the only other valid setting for MODE is Sv32, a paged
virtual-memory scheme described in Section 4.3.
When SXLEN=64, three paged virtual-memory schemes are defined: Sv39,
Sv48, and Sv57, described in Sections 4.4, 4.5, and 4.6, respectively.
One additional scheme, Sv64, will be defined in a later version of this
specification. The remaining MODE settings are reserved for future use
and may define different interpretations of the other fields in satp.

But I'll add #ifdef with the message that RV32 isn't supported now.

~ Oleksii
Oleksii Kurochko March 22, 2023, 9:55 a.m. UTC | #7
Hi Julien,

On Tue, 2023-03-21 at 17:58 +0000, Julien Grall wrote:
> Hi,
> 
> I will try to not repeat the comment already made.
> 
> On 16/03/2023 16:43, Oleksii Kurochko wrote:
> > Mostly the code for setup_initial_pages was taken from Bobby's
> > repo except for the following changes:
> > * Use only a minimal part of the code enough to enable MMU
> > * rename {_}setup_initial_pagetables functions
> > * add an argument for setup_initial_mapping to have
> >    an opportunity to make set PTE flags.
> > * update setup_initial_pagetables function to map sections
> >    with correct PTE flags.
> > * introduce separate enable_mmu() to be able for proper
> >    handling the case when load start address isn't equal to
> >    linker start address.
> > * map linker addresses range to load addresses range without
> >    1:1 mapping.
> > * add safety checks such as:
> >    * Xen size is less than page size
> >    * linker addresses range doesn't overlap load addresses
> >      range
> > * Rework macros {THIRD,SECOND,FIRST,ZEROETH}_{SHIFT,MASK}
> > * change PTE_LEAF_DEFAULT to RX instead of RWX.
> > * Remove phys_offset as it isn't used now.
> > * Remove alignment  of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > in
> >    setup_inital_mapping() as they should be already aligned.
> > * Remove clear_pagetables() as initial pagetables will be
> >    zeroed during bss initialization
> > * Remove __attribute__((section(".entry")) for
> > setup_initial_pagetables()
> >    as there is no such section in xen.lds.S
> > * Update the argument of pte_is_valid() to "const pte_t *p"
> > 
> > Origin:
> > https://gitlab.com/xen-on-risc-v/xen/-/tree/riscv-rebase 4af165b468
> > af
> > Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> > ---
> > Changes in V2:
> >   * update the commit message:
> >   * Remove {ZEROETH,FIRST,...}_{SHIFT,MASK, SIZE,...} and
> >     introduce instead of them XEN_PT_LEVEL_*() and LEVEL_*
> >   * Rework pt_linear_offset() and pt_index based on 
> > XEN_PT_LEVEL_*()
> >   * Remove clear_pagetables() functions as pagetables were zeroed
> > during
> >     .bss initialization
> >   * Rename _setup_initial_pagetables() to setup_initial_mapping()
> >   * Make PTE_DEFAULT equal to RX.
> >   * Update prototype of setup_initial_mapping(..., bool writable) -
> > >
> >     setup_initial_mapping(..., UL flags)
> >   * Update calls of setup_initial_mapping according to new
> > prototype
> >   * Remove unnecessary call of:
> >     _setup_initial_pagetables(..., load_addr_start, load_addr_end,
> > load_addr_start, ...)
> >   * Define index* in the loop of setup_initial_mapping
> >   * Remove attribute "__attribute__((section(".entry")))" for
> > setup_initial_pagetables()
> >     as we don't have such section
> >   * make arguments of paddr_to_pte() and pte_is_valid() as const.
> >   * make xen_second_pagetable static.
> >   * use <xen/kernel.h> instead of declaring extern unsigned long
> > _stext, 0etext, _srodata, _erodata
> >   * update  'extern unsigned long __init_begin' to 'extern unsigned
> > long __init_begin[]'
> >   * use aligned() instead of
> > "__attribute__((__aligned__(PAGE_SIZE)))"
> >   * set __section(".bss.page_aligned") for page tables arrays
> >   * fix identatations
> >   * Change '__attribute__((section(".entry")))' to '__init'
> >   * Remove phys_offset as it isn't used now.
> >   * Remove alignment  of {map, pa}_start &=
> > XEN_PT_LEVEL_MAP_MASK(0); in
> >     setup_inital_mapping() as they should be already aligned.
> >   * Remove clear_pagetables() as initial pagetables will be
> >     zeroed during bss initialization
> >   * Remove __attribute__((section(".entry")) for
> > setup_initial_pagetables()
> >     as there is no such section in xen.lds.S
> >   * Update the argument of pte_is_valid() to "const pte_t *p"
> > ---
> >   xen/arch/riscv/Makefile           |   1 +
> >   xen/arch/riscv/include/asm/mm.h   |   8 ++
> >   xen/arch/riscv/include/asm/page.h |  67 +++++++++++++++++
> >   xen/arch/riscv/mm.c               | 121
> > ++++++++++++++++++++++++++++++
> >   xen/arch/riscv/riscv64/head.S     |  65 ++++++++++++++++
> >   xen/arch/riscv/xen.lds.S          |   2 +
> >   6 files changed, 264 insertions(+)
> >   create mode 100644 xen/arch/riscv/include/asm/mm.h
> >   create mode 100644 xen/arch/riscv/include/asm/page.h
> >   create mode 100644 xen/arch/riscv/mm.c
> > 
> > diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
> > index 443f6bf15f..956ceb02df 100644
> > --- a/xen/arch/riscv/Makefile
> > +++ b/xen/arch/riscv/Makefile
> > @@ -1,5 +1,6 @@
> >   obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
> >   obj-y += entry.o
> > +obj-y += mm.o
> >   obj-$(CONFIG_RISCV_64) += riscv64/
> >   obj-y += sbi.o
> >   obj-y += setup.o
> > diff --git a/xen/arch/riscv/include/asm/mm.h
> > b/xen/arch/riscv/include/asm/mm.h
> > new file mode 100644
> > index 0000000000..3cc98fe45b
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/mm.h
> > @@ -0,0 +1,8 @@
> > +#ifndef _ASM_RISCV_MM_H
> > +#define _ASM_RISCV_MM_H
> > +
> > +void setup_initial_pagetables(void);
> > +
> > +extern void enable_mmu(void);
> > +
> > +#endif /* _ASM_RISCV_MM_H */
> > diff --git a/xen/arch/riscv/include/asm/page.h
> > b/xen/arch/riscv/include/asm/page.h
> > new file mode 100644
> > index 0000000000..fb8329a191
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/page.h
> > @@ -0,0 +1,67 @@
> > +#ifndef _ASM_RISCV_PAGE_H
> > +#define _ASM_RISCV_PAGE_H
> > +
> > +#include <xen/const.h>
> > +#include <xen/types.h>
> > +
> > +#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
> > +#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES
> > - 1))
> > +
> > +#define PAGE_ORDER                  (12)
> > +
> > +#ifdef CONFIG_RISCV_64
> > +#define PAGETABLE_ORDER             (9)
> > +#else /* CONFIG_RISCV_32 */
> > +#define PAGETABLE_ORDER             (10)
> > +#endif
> > +
> > +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
> > +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) +
> > PAGE_ORDER)
> > +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) <<
> > LEVEL_SHIFT(lvl))
> > +
> > +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
> > +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
> > +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
> > +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) -
> > 1))
> > +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK <<
> > XEN_PT_LEVEL_SHIFT(lvl))
> > +
> > +#define PTE_SHIFT                   10
> > +
> > +#define PTE_VALID                   BIT(0, UL)
> > +#define PTE_READABLE                BIT(1, UL)
> > +#define PTE_WRITABLE                BIT(2, UL)
> > +#define PTE_EXECUTABLE              BIT(3, UL)
> > +#define PTE_USER                    BIT(4, UL)
> > +#define PTE_GLOBAL                  BIT(5, UL)
> > +#define PTE_ACCESSED                BIT(6, UL)
> > +#define PTE_DIRTY                   BIT(7, UL)
> > +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
> > +
> > +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE |
> > PTE_EXECUTABLE)
> > +#define PTE_TABLE                   (PTE_VALID)
> > +
> > +/* Calculate the offsets into the pagetables for a given VA */
> > +#define pt_linear_offset(lvl, va)   ((va) >>
> > XEN_PT_LEVEL_SHIFT(lvl))
> > +
> > +#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) &
> > XEN_PT_LEVEL_MASK(lvl))
> > +
> > +/* Page Table entry */
> > +typedef struct {
> > +    uint64_t pte;
> > +} pte_t;
> > +
> > +/* Shift the VPN[x] or PPN[x] fields of a virtual or physical
> > address
> > + * to become the shifted PPN[x] fields of a page table entry */
> > +#define addr_to_ppn(x) (((x) >> PAGE_SHIFT) << PTE_SHIFT)
> > +
> > +static inline pte_t paddr_to_pte(const unsigned long paddr)
> > +{
> > +    return (pte_t) { .pte = addr_to_ppn(paddr) };
> > +}
> > +
> > +static inline bool pte_is_valid(const pte_t *p)
> > +{
> > +    return p->pte & PTE_VALID;
> > +}
> > +
> > +#endif /* _ASM_RISCV_PAGE_H */
> > diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
> > new file mode 100644
> > index 0000000000..0df6b47441
> > --- /dev/null
> > +++ b/xen/arch/riscv/mm.c
> > @@ -0,0 +1,121 @@
> > +#include <xen/compiler.h>
> > +#include <xen/init.h>
> > +#include <xen/kernel.h>
> > +#include <xen/lib.h>
> > +#include <xen/page-size.h>
> > +
> > +#include <asm/boot-info.h>
> > +#include <asm/config.h>
> > +#include <asm/csr.h>
> > +#include <asm/mm.h>
> > +#include <asm/page.h>
> > +#include <asm/traps.h>
> > +
> > +/*
> > + * xen_second_pagetable is indexed with the VPN[2] page table
> > entry field
> > + * xen_first_pagetable is accessed from the VPN[1] page table
> > entry field
> > + * xen_zeroeth_pagetable is accessed from the VPN[0] page table
> > entry field
> > + */
> > +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_second_pagetable[PAGE_ENTRIES];
> > +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_first_pagetable[PAGE_ENTRIES];
> > +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> > +    xen_zeroeth_pagetable[PAGE_ENTRIES];
> > +
> > +extern unsigned long __init_begin[];
> > +extern unsigned long __init_end[];
> > +extern unsigned char cpu0_boot_stack[STACK_SIZE];
> > +
> > +static void __init
> > +setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
> > +                      unsigned long map_start,
> > +                      unsigned long map_end,
> > +                      unsigned long pa_start,
> > +                      unsigned long flags)
> > +{
> > +    unsigned long page_addr;
> > +
> > +    // /* align start addresses */
> > +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
> 
> They should be switched to ASSERT() or BUG_ON().
Sure. Thanks. I'll update.
> 
> > +
> > +    page_addr = map_start;
> > +    while ( page_addr < map_end )
> 
> This loop is continue to assume that only the mapping can first in
> 2MB 
> section (or less if the start is not 2MB aligned).
> 
> I am OK if you want to assume it, but I think this should be
> documented 
> (with words and ASSERT()/BUG_ON()) to avoid any mistake.
I added a check in setup_initial_pagetables:
     BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
Probably this is not the correct place, and it should be moved to
setup_initial_mapping() instead of setup_initial_pagetables().
> 
> > +    {
> > +        unsigned long index2 = pt_index(2, page_addr);
> > +        unsigned long index1 = pt_index(1, page_addr);
> > +        unsigned long index0 = pt_index(0, page_addr);
> > +
> > +        /* Setup level2 table */
> > +        second[index2] = paddr_to_pte((unsigned long)first);
> > +        second[index2].pte |= PTE_TABLE;
> > +
> > +        /* Setup level1 table */
> > +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
> > +        first[index1].pte |= PTE_TABLE;
> > +
> > +
> 
> NIT: Spurious line?
Yeah, should be removed. Thanks.
> 
> > +        /* Setup level0 table */
> > +        if ( !pte_is_valid(&zeroeth[index0]) )
> 
> On the previous version, you said it should be checked for each
> level. 
I had a terrible internet connection, and my message wasn't sent.

I decided not to check whether l2 and l1 are used only for referring to
the next page table and not as leaf PTEs. So it is safe to overwrite them
each time (the addresses of the page tables are the same all the time), and
it will probably be better from an optimization point of view to omit the
if clauses.

And the check is needed in the case of L0, because it is possible that some
addresses were marked with specific flags (execute, read, write), so the
check avoids overwriting the flags that were set before.

> the next page table but not leaf PTE.But here you still only check
> for a single level.
> 
> Furthermore, I would strongly suggest to also check the valid PTE is
> the 
> same as you intend to write to catch any override (they are a pain to
> debug).
but if the load addresses and linker addresses don't overlap, is it possible
that a valid PTE will be overwritten?
> 
> > +        {
> > +            /* Update level0 table */
> > +            zeroeth[index0] = paddr_to_pte((page_addr - map_start)
> > + pa_start);
> > +            zeroeth[index0].pte |= flags;
> > +        }
> > +
> > +        /* Point to next page */
> > +        page_addr += XEN_PT_LEVEL_SIZE(0);
> > +    }
> > +}
> > +
> > +/*
> > + * setup_initial_pagetables:
> > + *
> > + * Build the page tables for Xen that map the following:
> > + *   load addresses to linker addresses
> > + */
> > +void __init setup_initial_pagetables(void)
> > +{
> > +    pte_t *second;
> > +    pte_t *first;
> > +    pte_t *zeroeth;
> > +
> > +    unsigned long load_addr_start   = boot_info.load_start;
> > +    unsigned long load_addr_end     = boot_info.load_end;
> > +    unsigned long linker_addr_start = boot_info.linker_start;
> > +    unsigned long linker_addr_end   = boot_info.linker_end;
> > +
> > +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> > +    if (load_addr_start != linker_addr_start)
> > +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end
> > > linker_addr_start));
> 
> I would suggest to switch to a panic() with an error message as this 
> would help the user understanding what this is breaking.
> 
> Alternatively, you could document what this check is for.
I think I will document it for now as panic() isn't ready for use now.
> 
> > +
> > +    /* Get the addresses where the page tables were loaded */
> > +    second  = (pte_t *)(&xen_second_pagetable);
> > +    first   = (pte_t *)(&xen_first_pagetable);
> > +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
> > +
> > +    setup_initial_mapping(second, first, zeroeth,
> > +                          LOAD_TO_LINK((unsigned long)&_stext),
> > +                          LOAD_TO_LINK((unsigned long)&_etext),
> > +                          (unsigned long)&_stext,
> > +                          PTE_LEAF_DEFAULT);
> > +
> > +    setup_initial_mapping(second, first, zeroeth,
> > +                          LOAD_TO_LINK((unsigned
> > long)&__init_begin),
> > +                          LOAD_TO_LINK((unsigned
> > long)&__init_end),
> > +                          (unsigned long)&__init_begin,
> > +                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
> > +
> > +    setup_initial_mapping(second, first, zeroeth,
> > +                          LOAD_TO_LINK((unsigned long)&_srodata),
> > +                          LOAD_TO_LINK((unsigned long)&_erodata),
> > +                          (unsigned long)(&_srodata),
> > +                          PTE_VALID | PTE_READABLE);
> > +
> > +    setup_initial_mapping(second, first, zeroeth,
> > +                          linker_addr_start,
> > +                          linker_addr_end,
> > +                          load_addr_start,
> > +                          PTE_LEAF_DEFAULT | PTE_READABLE);
> 
> As I said in v1, you need to use a different set of page-table here.
If I understand you correctly, I have to use a different set of page
tables in case it is possible that the size of Xen will be larger than
PAGE_SIZE. So I added a check to xen.lds.S to be sure that the size
fits into PAGE_SIZE.

> Also, where do you guarantee that Xen will be loaded at a 2MB aligned
> address? (For a fact I know that UEFI is only guarantee 4KB
> alignment).
There is a check in setup_initial_pagetables:
     BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);

> 
> > +}
> > diff --git a/xen/arch/riscv/riscv64/head.S
> > b/xen/arch/riscv/riscv64/head.S
> > index 8887f0cbd4..f4a0582727 100644
> > --- a/xen/arch/riscv/riscv64/head.S
> > +++ b/xen/arch/riscv/riscv64/head.S
> > @@ -1,4 +1,5 @@
> >   #include <asm/asm.h>
> > +#include <asm/asm-offsets.h>
> >   #include <asm/riscv_encoding.h>
> >   
> >           .section .text.header, "ax", %progbits
> > @@ -32,3 +33,67 @@ ENTRY(start)
> >           add     sp, sp, t0
> >   
> >           tail    start_xen
> > +
> > +ENTRY(enable_mmu)
> > +        /* Calculate physical offset between linker and load
> > addresses */
> > +        la      t0, boot_info
> > +        REG_L   t1, BI_LINKER_START(t0)
> > +        REG_L   t2, BI_LOAD_START(t0)
> > +        sub     t1, t1, t2
> > +
> > +        /*
> > +         * Calculate and update a linker time address of the
> > .L_mmu_is_enabled
> > +         * label and update CSR_STVEC with it.
> > +         * MMU is configured in a way where linker addresses are
> > mapped
> > +         * on load addresses so case when linker addresses are not
> > equal to
> > +         * load addresses, and thereby, after MMU is enabled, it
> > will cause
> > +         * an exception and jump to linker time addresses
> > +         */
> > +        la      t3, .L_mmu_is_enabled
> > +        add     t3, t3, t1
> > +        csrw    CSR_STVEC, t3
> > +
> > +        /* Calculate a value for SATP register */
> > +        li      t5, SATP_MODE_SV39
> > +        li      t6, SATP_MODE_SHIFT
> > +        sll     t5, t5, t6
> > +
> > +        la      t4, xen_second_pagetable
> > +        srl     t4, t4, PAGE_SHIFT
> > +        or      t4, t4, t5
> > +        sfence.vma
> > +        csrw    CSR_SATP, t4
> > +
> > +        .align 2
> > +.L_mmu_is_enabled:
> > +        /*
> > +         * Stack should be re-inited as:
> > +         * 1. Right now an address of the stack is relative to
> > load time
> > +         *    addresses what will cause an issue in case of load
> > start address
> > +         *    isn't equal to linker start address.
> > +         * 2. Addresses in stack are all load time relative which
> > can be an
> > +         *    issue in case when load start address isn't equal to
> > linker
> > +         *    start address.
> > +         */
> 
> Hmmm... I am not sure you can reset the stack and then return to the 
> caller because it may have stash some variable on the stack.
> 
> So if you want to reset, then you should jump to a brand new
> function.
Agree. it should be a new function. I'll take it into account.
> 
> > +        la      sp, cpu0_boot_stack
> > +        li      t0, STACK_SIZE
> > +        add     sp, sp, t0
> > +
> > +        /*
> > +         * Re-init an address of exception handler as it was
> > overwritten  with
> > +         * the address of the .L_mmu_is_enabled label.
> > +         * Before jump to trap_init save return address of
> > enable_mmu() to
> > +         * know where we should back after enable_mmu() will be
> > finished.
> > +         */
> > +        mv      s0, ra
> > +        lla     t0, trap_init
> > +        jalr    ra, t0
> > +
> > +        /*
> > +         * Re-calculate the return address of enable_mmu()
> > function for case
> > +         * when linker start address isn't equal to load start
> > address
> > +         */
> > +        add     s0, s0, t1
> > +        mv      ra, s0
> > +
> > +        ret
> 
> Missing ENDPROC?
I haven't seen the usage of ENDPROC for RISC-V so it looks like it is
not necessary.
> 
> > diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
> > index eed457c492..e4ac4e84b6 100644
> > --- a/xen/arch/riscv/xen.lds.S
> > +++ b/xen/arch/riscv/xen.lds.S
> > @@ -179,3 +179,5 @@ SECTIONS
> >   
> >   ASSERT(!SIZEOF(.got),      ".got non-empty")
> >   ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
> > +
> > +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot
> > assumptions")
> 
~ Oleksii
Julien Grall March 22, 2023, 1:55 p.m. UTC | #8
Hi,

On 22/03/2023 08:19, Jan Beulich wrote:
> On 21.03.2023 18:58, Julien Grall wrote:
>> Also, where do you guarantee that Xen will be loaded at a 2MB aligned
>> address? (For a fact I know that UEFI is only guarantee 4KB alignment).
> 
> I don't think this is true. We rely on UEFI honoring larger alignment on
> x86, and I believe the PE loader code is uniform across architectures.
> Of course you need to specify large enough alignment in the PE header's
> SectionAlignment field (right now in the Arm64 binary it's only 4k).

Hmmm... I have always been told this was ignored, hence why we need to 
deal with 4KB. In any case, the problem stays the same: we don't seem to 
have documented the expected alignment.

So in theory anyone can load RISC-V at any address. My suggestion would 
be to create docs/misc/riscv/booting.txt and document how one is meant 
to load Xen.

Cheers,
Julien Grall March 22, 2023, 2:21 p.m. UTC | #9
On 22/03/2023 09:55, Oleksii wrote:
> Hi Jullien,

Hi,

> On Tue, 2023-03-21 at 17:58 +0000, Julien Grall wrote:
>> Hi,
>>
>> I will try to not repeat the comment already made.
>>
>> On 16/03/2023 16:43, Oleksii Kurochko wrote:
>>> Mostly the code for setup_initial_pages was taken from Bobby's
>>> repo except for the following changes:
>>> * Use only a minimal part of the code enough to enable MMU
>>> * rename {_}setup_initial_pagetables functions
>>> * add an argument for setup_initial_mapping to have
>>>     an opportunity to make set PTE flags.
>>> * update setup_initial_pagetables function to map sections
>>>     with correct PTE flags.
>>> * introduce separate enable_mmu() to be able for proper
>>>     handling the case when load start address isn't equal to
>>>     linker start address.
>>> * map linker addresses range to load addresses range without
>>>     1:1 mapping.
>>> * add safety checks such as:
>>>     * Xen size is less than page size
>>>     * linker addresses range doesn't overlap load addresses
>>>       range
>>> * Rework macros {THIRD,SECOND,FIRST,ZEROETH}_{SHIFT,MASK}
>>> * change PTE_LEAF_DEFAULT to RX instead of RWX.
>>> * Remove phys_offset as it isn't used now.
>>> * Remove alignment  of {map, pa}_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>> in
>>>     setup_inital_mapping() as they should be already aligned.
>>> * Remove clear_pagetables() as initial pagetables will be
>>>     zeroed during bss initialization
>>> * Remove __attribute__((section(".entry")) for
>>> setup_initial_pagetables()
>>>     as there is no such section in xen.lds.S
>>> * Update the argument of pte_is_valid() to "const pte_t *p"
>>>
>>> Origin:
>>> https://gitlab.com/xen-on-risc-v/xen/-/tree/riscv-rebase 4af165b468
>>> af
>>> Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
>>> ---
>>> Changes in V2:
>>>    * update the commit message:
>>>    * Remove {ZEROETH,FIRST,...}_{SHIFT,MASK, SIZE,...} and
>>>      introduce instead of them XEN_PT_LEVEL_*() and LEVEL_*
>>>    * Rework pt_linear_offset() and pt_index based on
>>> XEN_PT_LEVEL_*()
>>>    * Remove clear_pagetables() functions as pagetables were zeroed
>>> during
>>>      .bss initialization
>>>    * Rename _setup_initial_pagetables() to setup_initial_mapping()
>>>    * Make PTE_DEFAULT equal to RX.
>>>    * Update prototype of setup_initial_mapping(..., bool writable) -
>>>>
>>>      setup_initial_mapping(..., UL flags)
>>>    * Update calls of setup_initial_mapping according to new
>>> prototype
>>>    * Remove unnecessary call of:
>>>      _setup_initial_pagetables(..., load_addr_start, load_addr_end,
>>> load_addr_start, ...)
>>>    * Define index* in the loop of setup_initial_mapping
>>>    * Remove attribute "__attribute__((section(".entry")))" for
>>> setup_initial_pagetables()
>>>      as we don't have such section
>>>    * make arguments of paddr_to_pte() and pte_is_valid() as const.
>>>    * make xen_second_pagetable static.
>>>    * use <xen/kernel.h> instead of declaring extern unsigned long
>>> _stext, 0etext, _srodata, _erodata
>>>    * update  'extern unsigned long __init_begin' to 'extern unsigned
>>> long __init_begin[]'
>>>    * use aligned() instead of
>>> "__attribute__((__aligned__(PAGE_SIZE)))"
>>>    * set __section(".bss.page_aligned") for page tables arrays
>>>    * fix identatations
>>>    * Change '__attribute__((section(".entry")))' to '__init'
>>>    * Remove phys_offset as it isn't used now.
>>>    * Remove alignment  of {map, pa}_start &=
>>> XEN_PT_LEVEL_MAP_MASK(0); in
>>>      setup_inital_mapping() as they should be already aligned.
>>>    * Remove clear_pagetables() as initial pagetables will be
>>>      zeroed during bss initialization
>>>    * Remove __attribute__((section(".entry")) for
>>> setup_initial_pagetables()
>>>      as there is no such section in xen.lds.S
>>>    * Update the argument of pte_is_valid() to "const pte_t *p"
>>> ---
>>>    xen/arch/riscv/Makefile           |   1 +
>>>    xen/arch/riscv/include/asm/mm.h   |   8 ++
>>>    xen/arch/riscv/include/asm/page.h |  67 +++++++++++++++++
>>>    xen/arch/riscv/mm.c               | 121
>>> ++++++++++++++++++++++++++++++
>>>    xen/arch/riscv/riscv64/head.S     |  65 ++++++++++++++++
>>>    xen/arch/riscv/xen.lds.S          |   2 +
>>>    6 files changed, 264 insertions(+)
>>>    create mode 100644 xen/arch/riscv/include/asm/mm.h
>>>    create mode 100644 xen/arch/riscv/include/asm/page.h
>>>    create mode 100644 xen/arch/riscv/mm.c
>>>
>>> diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
>>> index 443f6bf15f..956ceb02df 100644
>>> --- a/xen/arch/riscv/Makefile
>>> +++ b/xen/arch/riscv/Makefile
>>> @@ -1,5 +1,6 @@
>>>    obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
>>>    obj-y += entry.o
>>> +obj-y += mm.o
>>>    obj-$(CONFIG_RISCV_64) += riscv64/
>>>    obj-y += sbi.o
>>>    obj-y += setup.o
>>> diff --git a/xen/arch/riscv/include/asm/mm.h
>>> b/xen/arch/riscv/include/asm/mm.h
>>> new file mode 100644
>>> index 0000000000..3cc98fe45b
>>> --- /dev/null
>>> +++ b/xen/arch/riscv/include/asm/mm.h
>>> @@ -0,0 +1,8 @@
>>> +#ifndef _ASM_RISCV_MM_H
>>> +#define _ASM_RISCV_MM_H
>>> +
>>> +void setup_initial_pagetables(void);
>>> +
>>> +extern void enable_mmu(void);
>>> +
>>> +#endif /* _ASM_RISCV_MM_H */
>>> diff --git a/xen/arch/riscv/include/asm/page.h
>>> b/xen/arch/riscv/include/asm/page.h
>>> new file mode 100644
>>> index 0000000000..fb8329a191
>>> --- /dev/null
>>> +++ b/xen/arch/riscv/include/asm/page.h
>>> @@ -0,0 +1,67 @@
>>> +#ifndef _ASM_RISCV_PAGE_H
>>> +#define _ASM_RISCV_PAGE_H
>>> +
>>> +#include <xen/const.h>
>>> +#include <xen/types.h>
>>> +
>>> +#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
>>> +#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES
>>> - 1))
>>> +
>>> +#define PAGE_ORDER                  (12)
>>> +
>>> +#ifdef CONFIG_RISCV_64
>>> +#define PAGETABLE_ORDER             (9)
>>> +#else /* CONFIG_RISCV_32 */
>>> +#define PAGETABLE_ORDER             (10)
>>> +#endif
>>> +
>>> +#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
>>> +#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) +
>>> PAGE_ORDER)
>>> +#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) <<
>>> LEVEL_SHIFT(lvl))
>>> +
>>> +#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
>>> +#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
>>> +#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
>>> +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) -
>>> 1))
>>> +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK <<
>>> XEN_PT_LEVEL_SHIFT(lvl))
>>> +
>>> +#define PTE_SHIFT                   10
>>> +
>>> +#define PTE_VALID                   BIT(0, UL)
>>> +#define PTE_READABLE                BIT(1, UL)
>>> +#define PTE_WRITABLE                BIT(2, UL)
>>> +#define PTE_EXECUTABLE              BIT(3, UL)
>>> +#define PTE_USER                    BIT(4, UL)
>>> +#define PTE_GLOBAL                  BIT(5, UL)
>>> +#define PTE_ACCESSED                BIT(6, UL)
>>> +#define PTE_DIRTY                   BIT(7, UL)
>>> +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
>>> +
>>> +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE |
>>> PTE_EXECUTABLE)
>>> +#define PTE_TABLE                   (PTE_VALID)
>>> +
>>> +/* Calculate the offsets into the pagetables for a given VA */
>>> +#define pt_linear_offset(lvl, va)   ((va) >>
>>> XEN_PT_LEVEL_SHIFT(lvl))
>>> +
>>> +#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) &
>>> XEN_PT_LEVEL_MASK(lvl))
>>> +
>>> +/* Page Table entry */
>>> +typedef struct {
>>> +    uint64_t pte;
>>> +} pte_t;
>>> +
>>> +/* Shift the VPN[x] or PPN[x] fields of a virtual or physical
>>> address
>>> + * to become the shifted PPN[x] fields of a page table entry */
>>> +#define addr_to_ppn(x) (((x) >> PAGE_SHIFT) << PTE_SHIFT)
>>> +
>>> +static inline pte_t paddr_to_pte(const unsigned long paddr)
>>> +{
>>> +    return (pte_t) { .pte = addr_to_ppn(paddr) };
>>> +}
>>> +
>>> +static inline bool pte_is_valid(const pte_t *p)
>>> +{
>>> +    return p->pte & PTE_VALID;
>>> +}
>>> +
>>> +#endif /* _ASM_RISCV_PAGE_H */
>>> diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
>>> new file mode 100644
>>> index 0000000000..0df6b47441
>>> --- /dev/null
>>> +++ b/xen/arch/riscv/mm.c
>>> @@ -0,0 +1,121 @@
>>> +#include <xen/compiler.h>
>>> +#include <xen/init.h>
>>> +#include <xen/kernel.h>
>>> +#include <xen/lib.h>
>>> +#include <xen/page-size.h>
>>> +
>>> +#include <asm/boot-info.h>
>>> +#include <asm/config.h>
>>> +#include <asm/csr.h>
>>> +#include <asm/mm.h>
>>> +#include <asm/page.h>
>>> +#include <asm/traps.h>
>>> +
>>> +/*
>>> + * xen_second_pagetable is indexed with the VPN[2] page table
>>> entry field
>>> + * xen_first_pagetable is accessed from the VPN[1] page table
>>> entry field
>>> + * xen_zeroeth_pagetable is accessed from the VPN[0] page table
>>> entry field
>>> + */
>>> +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
>>> +    xen_second_pagetable[PAGE_ENTRIES];
>>> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
>>> +    xen_first_pagetable[PAGE_ENTRIES];
>>> +static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
>>> +    xen_zeroeth_pagetable[PAGE_ENTRIES];
>>> +
>>> +extern unsigned long __init_begin[];
>>> +extern unsigned long __init_end[];
>>> +extern unsigned char cpu0_boot_stack[STACK_SIZE];
>>> +
>>> +static void __init
>>> +setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
>>> +                      unsigned long map_start,
>>> +                      unsigned long map_end,
>>> +                      unsigned long pa_start,
>>> +                      unsigned long flags)
>>> +{
>>> +    unsigned long page_addr;
>>> +
>>> +    // /* align start addresses */
>>> +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>> +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>
>> They should be switched to ASSERT() or BUG_ON().
> Sure. Thanks. I'll update.
>>
>>> +
>>> +    page_addr = map_start;
>>> +    while ( page_addr < map_end )
>>
>> This loop is continue to assume that only the mapping can first in
>> 2MB
>> section (or less if the start is not 2MB aligned).
>>
>> I am OK if you want to assume it, but I think this should be
>> documented
>> (with words and ASSERT()/BUG_ON()) to avoid any mistake.
> I add a check in setup_initial_pagetables:
>       BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> Probably this is not a correct place and should be moved to
> setup_initial_mapping() instead of setup_initial_pagetables()

Yes it should be moved in setup_initial_mapping().

>>
>>> +    {
>>> +        unsigned long index2 = pt_index(2, page_addr);
>>> +        unsigned long index1 = pt_index(1, page_addr);
>>> +        unsigned long index0 = pt_index(0, page_addr);
>>> +
>>> +        /* Setup level2 table */
>>> +        second[index2] = paddr_to_pte((unsigned long)first);
>>> +        second[index2].pte |= PTE_TABLE;
>>> +
>>> +        /* Setup level1 table */
>>> +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
>>> +        first[index1].pte |= PTE_TABLE;
>>> +
>>> +
>>
>> NIT: Spurious line?
> Yeah, should be removed. Thanks.
>>
>>> +        /* Setup level0 table */
>>> +        if ( !pte_is_valid(&zeroeth[index0]) )
>>
>> On the previous version, you said it should be checked for each
>> level.
> I had a terrible internet connection, and my message wasn't sent.

No worries.

> 
> I decided not to check that l2 and l1 are used only for referring to
> the next page table but not leaf PTE. So it is safe to overwrite it
> each time (the addresses of page tables are the same all the time)

You are letting the caller decide which page table to use for each 
level, so you are at the mercy of the caller doing the right thing.

IMHO, this is a pretty bad idea because debugging page-table errors is 
difficult. So it is better to have safety in place. This is not worth...

  and
> probably it will be better from optimization point of view to ignore if
> clauses.

... the optimization in particular when this is at boot time.

> 
> And it is needed in case of L0 because it is possible that some
> addressed were marked with specific flag ( execution, read, write ) and
> so not to overwrite the flags set before the check is needed.
In which case you should really report an error, because the caller may 
have decided to set the execution flag and you don't honor it. So when the 
code is executed, you will receive a fault and it may be hard to find out 
what happened.

> 
>> the next page table but not leaf PTE.But here you still only check
>> for a single level.
>>
>> Furthermore, I would strongly suggest to also check the valid PTE is
>> the
>> same as you intend to write to catch any override (they are a pain to
>> debug).
> but if load addresses and linker addresses don't overlap is it possible
> situation that valid PTE will be overridden?

A bug in the code. In fact, if you add the check you would have noticed 
that your existing code is buggy (see below).

>>
>>> +        {
>>> +            /* Update level0 table */
>>> +            zeroeth[index0] = paddr_to_pte((page_addr - map_start)
>>> + pa_start);
>>> +            zeroeth[index0].pte |= flags;
>>> +        }
>>> +
>>> +        /* Point to next page */
>>> +        page_addr += XEN_PT_LEVEL_SIZE(0);
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * setup_initial_pagetables:
>>> + *
>>> + * Build the page tables for Xen that map the following:
>>> + *   load addresses to linker addresses

I would suggest expanding this, because it is not exactly what you 
are doing. In fact...

>>> + */
>>> +void __init setup_initial_pagetables(void)
>>> +{
>>> +    pte_t *second;
>>> +    pte_t *first;
>>> +    pte_t *zeroeth;
>>> +
>>> +    unsigned long load_addr_start   = boot_info.load_start;
>>> +    unsigned long load_addr_end     = boot_info.load_end;
>>> +    unsigned long linker_addr_start = boot_info.linker_start;
>>> +    unsigned long linker_addr_end   = boot_info.linker_end;
>>> +
>>> +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
>>> +    if (load_addr_start != linker_addr_start)
>>> +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end
>>>> linker_addr_start));
>>
>> I would suggest to switch to a panic() with an error message as this
>> would help the user understanding what this is breaking.
>>
>> Alternatively, you could document what this check is for.
> I think I will document it for now as panic() isn't ready for use now.
>>
>>> +
>>> +    /* Get the addresses where the page tables were loaded */
>>> +    second  = (pte_t *)(&xen_second_pagetable);
>>> +    first   = (pte_t *)(&xen_first_pagetable);
>>> +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
>>> +
>>> +    setup_initial_mapping(second, first, zeroeth,
>>> +                          LOAD_TO_LINK((unsigned long)&_stext),
>>> +                          LOAD_TO_LINK((unsigned long)&_etext),
>>> +                          (unsigned long)&_stext,
>>> +                          PTE_LEAF_DEFAULT);
>>> +
>>> +    setup_initial_mapping(second, first, zeroeth,
>>> +                          LOAD_TO_LINK((unsigned
>>> long)&__init_begin),
>>> +                          LOAD_TO_LINK((unsigned
>>> long)&__init_end),
>>> +                          (unsigned long)&__init_begin,
>>> +                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
>>> +
>>> +    setup_initial_mapping(second, first, zeroeth,
>>> +                          LOAD_TO_LINK((unsigned long)&_srodata),
>>> +                          LOAD_TO_LINK((unsigned long)&_erodata),
>>> +                          (unsigned long)(&_srodata),
>>> +                          PTE_VALID | PTE_READABLE);
>>> +
>>> +    setup_initial_mapping(second, first, zeroeth,
>>> +                          linker_addr_start,
>>> +                          linker_addr_end,
>>> +                          load_addr_start,
>>> +                          PTE_LEAF_DEFAULT | PTE_READABLE);

... this is not cover above. AFAIU, this is the one for the 1:1 mapping.

>>
>> As I said in v1, you need to use a different set of page-table here.
> If I understand you correctly I have to use a different set of page-
> table in case when it is possible that size of Xen will be larger then
> PAGE_SIZE. So I added to xen.lds.S a check to be sure that the size
> fits into PAGE_SIZE.

This is not what I was referring to. I was pointing out that second, 
first, zeroeth are exactly the same for all the callers. You want to 
introduce a second set of zeroeth table. You will want to do the same 
for first but it is not always used.

Otherwise, this is not going to work if Xen is loaded at a different 
address than the runtime.

That said, when I spoke with Andrew yesterday, he mentioned that your 
initial goal is to support the case where Xen is loaded at the runtime 
address. I understand this simplifies a lot the code and I told him I 
was OK with that. However, it would be good to document what are your 
goals in each series (this is not always clear what you skip on purpose).

> 
>> Also, where do you guarantee that Xen will be loaded at a 2MB aligned
>> address? (For a fact I know that UEFI is only guarantee 4KB
>> alignment).
> There is a check in setup_initial_pagetables:
>       BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);

It is not very obvious that this check is to confirm that Xen is properly 
aligned. I would suggest adding a comment.

Also, you might want to use XEN_PT_LEVEL_SIZE(..) to make it more 
obvious what sort of alignment you are trying to enforce.

>>> +        la      sp, cpu0_boot_stack
>>> +        li      t0, STACK_SIZE
>>> +        add     sp, sp, t0
>>> +
>>> +        /*
>>> +         * Re-init an address of exception handler as it was
>>> overwritten  with
>>> +         * the address of the .L_mmu_is_enabled label.
>>> +         * Before jump to trap_init save return address of
>>> enable_mmu() to
>>> +         * know where we should back after enable_mmu() will be
>>> finished.
>>> +         */
>>> +        mv      s0, ra
>>> +        lla     t0, trap_init
>>> +        jalr    ra, t0
>>> +
>>> +        /*
>>> +         * Re-calculate the return address of enable_mmu()
>>> function for case
>>> +         * when linker start address isn't equal to load start
>>> address
>>> +         */
>>> +        add     s0, s0, t1
>>> +        mv      ra, s0
>>> +
>>> +        ret
>>
>> Missing ENDPROC?
> I haven't seen the usage of ENDPROC for RISC-V so it looks like it is
> not necessary.

Ok. Would objdump be able to report the function properly then? I 
know that on Arm, it was necessary in order to report assembly functions properly.

>>
>>> diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
>>> index eed457c492..e4ac4e84b6 100644
>>> --- a/xen/arch/riscv/xen.lds.S
>>> +++ b/xen/arch/riscv/xen.lds.S
>>> @@ -179,3 +179,5 @@ SECTIONS
>>>    
>>>    ASSERT(!SIZEOF(.got),      ".got non-empty")
>>>    ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
>>> +
>>> +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot
>>> assumptions")
>>
> ~ Oleksii
> 

Cheers,
Oleksii Kurochko March 23, 2023, 11:18 a.m. UTC | #10
Hi Julien,

On Wed, 2023-03-22 at 14:21 +0000, Julien Grall wrote:
...
> 
> > > > +    unsigned long page_addr;
> > > > +
> > > > +    // /* align start addresses */
> > > > +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > > > +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > > 
> > > They should be switched to ASSERT() or BUG_ON().
> > Sure. Thanks. I'll update.
> > > 
> > > > +
> > > > +    page_addr = map_start;
> > > > +    while ( page_addr < map_end )
> > > 
> > > This loop is continue to assume that only the mapping can first
> > > in
> > > 2MB
> > > section (or less if the start is not 2MB aligned).
> > > 
> > > I am OK if you want to assume it, but I think this should be
> > > documented
> > > (with words and ASSERT()/BUG_ON()) to avoid any mistake.
> > I add a check in setup_initial_pagetables:
> >       BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> > Probably this is not a correct place and should be moved to
> > setup_initial_mapping() instead of setup_initial_pagetables()
> 
> Yes it should be moved in setup_initial_mapping().
Then I'll moved it to setup_initial_mapping()
> 
> > > 
> > > > +    {
> > > > +        unsigned long index2 = pt_index(2, page_addr);
> > > > +        unsigned long index1 = pt_index(1, page_addr);
> > > > +        unsigned long index0 = pt_index(0, page_addr);
> > > > +
> > > > +        /* Setup level2 table */
> > > > +        second[index2] = paddr_to_pte((unsigned long)first);
> > > > +        second[index2].pte |= PTE_TABLE;
> > > > +
> > > > +        /* Setup level1 table */
> > > > +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
> > > > +        first[index1].pte |= PTE_TABLE;
> > > > +
> > > > +
> > > 
> > > NIT: Spurious line?
> > Yeah, should be removed. Thanks.
> > > 
> > > > +        /* Setup level0 table */
> > > > +        if ( !pte_is_valid(&zeroeth[index0]) )
> > > 
> > > On the previous version, you said it should be checked for each
> > > level.
> > I had a terrible internet connection, and my message wasn't sent.
> 
> No worries.
> 
> > 
> > I decided not to check that l2 and l1 are used only for referring
> > to
> > the next page table but not leaf PTE. So it is safe to overwrite it
> > each time (the addresses of page tables are the same all the time)
> 
> You are letting the caller to decide which page-table to use for each
> level. So you are at the mercy that caller will do the right thing.
> 
> IHMO, this is a pretty bad idea because debugging page-tables error
> are 
> difficult. So it is better to have safety in place. This is not
> worth...
> 
>   and
> > probably it will be better from optimization point of view to
> > ignore if
> > clauses.
> 
> ... the optimization in particular when this is at boot time.
I didn't consider that the caller might not always do the right thing, so
I will add the check.
> 
> > 
> > And it is needed in case of L0 because it is possible that some
> > addressed were marked with specific flag ( execution, read, write )
> > and
> > so not to overwrite the flags set before the check is needed.
> In which case you should really report an error because the caller
> may 
> have decide to set execution flag and you don't honor. So when the
> code 
> is executed, you will receive a fault and this may be hard to find
> out 
> what happen.

Right now, it is an expected situation that the caller will try to change
the execution flag during the setup of the initial page tables.

It is possible in the currently implemented logic of the setup of
initial page tables.

Let me explain what I mean.

The first step of setup_initial_pagetables() is to map .text, .init,
.rodata with necessary flags RX, RX, R.
The remaining sections will have RW flags, and to map them,
setup_initial_mapping() is called for the whole range [linker_start,
linker_end] rather than mapping them one by one. Thereby, during this step,
setup_initial_mapping() will try to remap address ranges which
overlap with .text, .init, and .rodata with RW flags, but it should leave
them with the previously set flags.
> 
> > 
> > > the next page table but not leaf PTE.But here you still only
> > > check
> > > for a single level.
> > > 
> > > Furthermore, I would strongly suggest to also check the valid PTE
> > > is
> > > the
> > > same as you intend to write to catch any override (they are a
> > > pain to
> > > debug).
> > but if load addresses and linker addresses don't overlap is it
> > possible
> > situation that valid PTE will be overridden?
> 
> A bug in the code. In fact, if you add the check you would have
> notice 
> that your existing code is buggy (see below).
> 
> > > 
> > > > +        {
> > > > +            /* Update level0 table */
> > > > +            zeroeth[index0] = paddr_to_pte((page_addr -
> > > > map_start)
> > > > + pa_start);
> > > > +            zeroeth[index0].pte |= flags;
> > > > +        }
> > > > +
> > > > +        /* Point to next page */
> > > > +        page_addr += XEN_PT_LEVEL_SIZE(0);
> > > > +    }
> > > > +}
> > > > +
> > > > +/*
> > > > + * setup_initial_pagetables:
> > > > + *
> > > > + * Build the page tables for Xen that map the following:
> > > > + *   load addresses to linker addresses
> 
> I would suggest to expand because this is not entirely what you
> exactly 
> are doing. In fact...
> 
> > > > + */
> > > > +void __init setup_initial_pagetables(void)
> > > > +{
> > > > +    pte_t *second;
> > > > +    pte_t *first;
> > > > +    pte_t *zeroeth;
> > > > +
> > > > +    unsigned long load_addr_start   = boot_info.load_start;
> > > > +    unsigned long load_addr_end     = boot_info.load_end;
> > > > +    unsigned long linker_addr_start = boot_info.linker_start;
> > > > +    unsigned long linker_addr_end   = boot_info.linker_end;
> > > > +
> > > > +    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> > > > +    if (load_addr_start != linker_addr_start)
> > > > +        BUG_ON((linker_addr_end > load_addr_start &&
> > > > load_addr_end
> > > > > linker_addr_start));
> > > 
> > > I would suggest to switch to a panic() with an error message as
> > > this
> > > would help the user understanding what this is breaking.
> > > 
> > > Alternatively, you could document what this check is for.
> > I think I will document it for now as panic() isn't ready for use
> > now.
> > > 
> > > > +
> > > > +    /* Get the addresses where the page tables were loaded */
> > > > +    second  = (pte_t *)(&xen_second_pagetable);
> > > > +    first   = (pte_t *)(&xen_first_pagetable);
> > > > +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
> > > > +
> > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&_stext),
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&_etext),
> > > > +                          (unsigned long)&_stext,
> > > > +                          PTE_LEAF_DEFAULT);
> > > > +
> > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&__init_begin),
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&__init_end),
> > > > +                          (unsigned long)&__init_begin,
> > > > +                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
> > > > +
> > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&_srodata),
> > > > +                          LOAD_TO_LINK((unsigned
> > > > long)&_erodata),
> > > > +                          (unsigned long)(&_srodata),
> > > > +                          PTE_VALID | PTE_READABLE);
> > > > +
> > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > +                          linker_addr_start,
> > > > +                          linker_addr_end,
> > > > +                          load_addr_start,
> > > > +                          PTE_LEAF_DEFAULT | PTE_READABLE);
> 
> ... this is not cover above. AFAIU, this is the one for the 1:1
> mapping.
But there is no 1:1 mapping.
I thought that 1:1 mapping means that if the physical address is equal to
0x10020, then after the MMU is enabled the virtual address will be the same,
0x10020, and mapped to 0x10020.

Probably I missed something but this code maps all linker addresses
to correspondent load addresses and it will be 1:1 only in case when
load address is equal to linker address.
> 
> > > 
> > > As I said in v1, you need to use a different set of page-table
> > > here.
> > If I understand you correctly I have to use a different set of
> > page-
> > table in case when it is possible that size of Xen will be larger
> > than
> > PAGE_SIZE. So I added to xen.lds.S a check to be sure that the size
> > fits into PAGE_SIZE.
> 
> This is not what I was referring to. I was pointing out that second, 
> first, zeroeth are exactly the same for all the callers. You want to 
> introduce a second set of zeroeth table. You will want to do the same
> for first but it is not always used.
> 
> Otherwise, this is not going to work if Xen is loaded at a different 
> address than the runtime.
Ok.

I understand what you mean in general but still I am not sure that I
understand a particular case when I am sure that the size of Xen is no
bigger than 2MB and the load address is aligned on a 2MB boundary.

The size of one l0 page table is 512 (1 << 9 ) entries which covers 4K
(1 << 12) * 512 = 2 MB of memory so it should be fine.
All the callers in my case are trying to map addresses from
[linker_start, linker_end] which is not bigger than 2 MB and is aligned
on 2 MB boundary.

> 
> That said, when I spoke with Andrew yesterday, he mentioned that your
> initial goal is to support the case where Xen is loaded at the
> runtime 
> address. I understand this simplifies a lot the code and I told him I
> was OK with that. However, it would be good to document what are your
> goals in each series (this is not always clear what you skip on
> purpose).
> 
> > 
> > > Also, where do you guarantee that Xen will be loaded at a 2MB
> > > aligned
> > > address? (For a fact I know that UEFI is only guarantee 4KB
> > > alignment).
> > There is a check in setup_initial_pagetables:
> >       BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
> 
> This is not very obvious the check is to confirm that Xen is probably
> aligned. I would suggest to add a comment.
> 
> Also, you might want to use XEN_PT_LEVEL_SIZE(..) to make it more 
> obvious what sort of alignment you are trying to enforce.
I will add the comment and use XEN_PT_LEVEL_SIZE(...) instead.
> 
> > > > +        la      sp, cpu0_boot_stack
> > > > +        li      t0, STACK_SIZE
> > > > +        add     sp, sp, t0
> > > > +
> > > > +        /*
> > > > +         * Re-init an address of exception handler as it was
> > > > overwritten  with
> > > > +         * the address of the .L_mmu_is_enabled label.
> > > > +         * Before jump to trap_init save return address of
> > > > enable_mmu() to
> > > > +         * know where we should back after enable_mmu() will
> > > > be
> > > > finished.
> > > > +         */
> > > > +        mv      s0, ra
> > > > +        lla     t0, trap_init
> > > > +        jalr    ra, t0
> > > > +
> > > > +        /*
> > > > +         * Re-calculate the return address of enable_mmu()
> > > > function for case
> > > > +         * when linker start address isn't equal to load start
> > > > address
> > > > +         */
> > > > +        add     s0, s0, t1
> > > > +        mv      ra, s0
> > > > +
> > > > +        ret
> > > 
> > > Missing ENDPROC?
> > I haven't seen the usage of ENDPROC for RISC-V so it looks like it
> > is
> > not necessary.
> 
> Ok. Would the objdump be able to report the function properly then? I
> know that on Arm, it was necessary report assembly function properly.
It is fine for RISC-V:

Disassembly of section .text:

0000000000200000 <_start>:
  200000:       10401073                csrw    sie,zero
  200004:       6299                    lui     t0,0x6
...
  20003c:       00000013                nop

0000000000200040 <enable_mmu>:
  200040:       0003a297                auipc   t0,0x3a
...
  20009e:       941a                    add     s0,s0,t1
  2000a0:       80a2                    mv      ra,s0
  2000a2:       8082                    ret
        ...

00000000002000b0 <__bitmap_empty>:
  2000b0:       1141                    addi    sp,sp,-16
  2000b2:       e422                    sd      s0,8(sp)
  2000b4:       0800                    addi    s0,sp,16
...

> 
> > > 
> > > > diff --git a/xen/arch/riscv/xen.lds.S
> > > > b/xen/arch/riscv/xen.lds.S
> > > > index eed457c492..e4ac4e84b6 100644
> > > > --- a/xen/arch/riscv/xen.lds.S
> > > > +++ b/xen/arch/riscv/xen.lds.S
> > > > @@ -179,3 +179,5 @@ SECTIONS
> > > >    
> > > >    ASSERT(!SIZEOF(.got),      ".got non-empty")
> > > >    ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
> > > > +
> > > > +ASSERT(_end - _start <= MB(2), "Xen too large for early-boot
> > > > assumptions")
> > > 
> > ~ Oleksii
> > 
> 
~ Oleksii
Julien Grall March 23, 2023, 11:57 a.m. UTC | #11
On 23/03/2023 11:18, Oleksii wrote:
> Hi Julien,

Hi Oleksii,

> On Wed, 2023-03-22 at 14:21 +0000, Julien Grall wrote:
> ...
>>
>>>>> +    unsigned long page_addr;
>>>>> +
>>>>> +    // /* align start addresses */
>>>>> +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>>>> +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>>>
>>>> They should be switched to ASSERT() or BUG_ON().
>>> Sure. Thanks. I'll update.
>>>>
>>>>> +
>>>>> +    page_addr = map_start;
>>>>> +    while ( page_addr < map_end )
>>>>
>>>> This loop is continue to assume that only the mapping can first
>>>> in
>>>> 2MB
>>>> section (or less if the start is not 2MB aligned).
>>>>
>>>> I am OK if you want to assume it, but I think this should be
>>>> documented
>>>> (with words and ASSERT()/BUG_ON()) to avoid any mistake.
>>> I add a check in setup_initial_pagetables:
>>>        BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
>>> Probably this is not a correct place and should be moved to
>>> setup_initial_mapping() instead of setup_initial_pagetables()
>>
>> Yes it should be moved in setup_initial_mapping().
> Then I'll moved it to setup_initial_mapping()
>>
>>>>
>>>>> +    {
>>>>> +        unsigned long index2 = pt_index(2, page_addr);
>>>>> +        unsigned long index1 = pt_index(1, page_addr);
>>>>> +        unsigned long index0 = pt_index(0, page_addr);
>>>>> +
>>>>> +        /* Setup level2 table */
>>>>> +        second[index2] = paddr_to_pte((unsigned long)first);
>>>>> +        second[index2].pte |= PTE_TABLE;
>>>>> +
>>>>> +        /* Setup level1 table */
>>>>> +        first[index1] = paddr_to_pte((unsigned long)zeroeth);
>>>>> +        first[index1].pte |= PTE_TABLE;
>>>>> +
>>>>> +
>>>>
>>>> NIT: Spurious line?
>>> Yeah, should be removed. Thanks.
>>>>
>>>>> +        /* Setup level0 table */
>>>>> +        if ( !pte_is_valid(&zeroeth[index0]) )
>>>>
>>>> On the previous version, you said it should be checked for each
>>>> level.
>>> I had a terrible internet connection, and my message wasn't sent.
>>
>> No worries.
>>
>>>
>>> I decided not to check that l2 and l1 are used only for referring
>>> to
>>> the next page table but not leaf PTE. So it is safe to overwrite it
>>> each time (the addresses of page tables are the same all the time)
>>
>> You are letting the caller to decide which page-table to use for each
>> level. So you are at the mercy that caller will do the right thing.
>>
>> IHMO, this is a pretty bad idea because debugging page-tables error
>> are
>> difficult. So it is better to have safety in place. This is not
>> worth...
>>
>>    and
>>> probably it will be better from optimization point of view to
>>> ignore if
>>> clauses.
>>
>> ... the optimization in particular when this is at boot time.
> I didn't think about that caller will do always the right thing so
> I will add the check.
>>
>>>
>>> And it is needed in case of L0 because it is possible that some
>>> addresses were marked with a specific flag ( execution, read, write )
>>> and
>>> so not to overwrite the flags set before the check is needed.
>> In which case you should really report an error because the caller
>> may
>> have decide to set execution flag and you don't honor. So when the
>> code
>> is executed, you will receive a fault and this may be hard to find
>> out
>> what happen.
> 
> Right now, it is expected situation that the caller will try to change
> execution flag during the setup of initial page tables. >
> It is possible in the currently implemented logic of the setup of
> initial page tables.

This sounds like a bug in your caller implementation. You should not try 
to work around this in your code updating the page-tables.

> Let me explain what I mean.
> 
> The first step of setup_initial_pagetables() is to map .text, .init,
> .rodata with necessary flags RX, RX, R.
> Remaining sections will have RW flags, and to map them,
> setup_initial_mapping() is called for the whole range of [linker_start,
> linker_end] not to map them one by one thereby during this step
> setup_initial_mapping() will try to remap addresses ranges which
> overlap with .text, .init, .rodata with RW flags but it should leave
> with the previously set flags.
Why do you need to call setup_init_mapping() with the whole range? In 
fact the only reason I can think this is useful is when you when to 
create a 1:1 mapping when the linker and load address is different. But...

>>>>> +
>>>>> +    /* Get the addresses where the page tables were loaded */
>>>>> +    second  = (pte_t *)(&xen_second_pagetable);
>>>>> +    first   = (pte_t *)(&xen_first_pagetable);
>>>>> +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
>>>>> +
>>>>> +    setup_initial_mapping(second, first, zeroeth,
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&_stext),
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&_etext),
>>>>> +                          (unsigned long)&_stext,
>>>>> +                          PTE_LEAF_DEFAULT);
>>>>> +
>>>>> +    setup_initial_mapping(second, first, zeroeth,
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&__init_begin),
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&__init_end),
>>>>> +                          (unsigned long)&__init_begin,
>>>>> +                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
>>>>> +
>>>>> +    setup_initial_mapping(second, first, zeroeth,
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&_srodata),
>>>>> +                          LOAD_TO_LINK((unsigned
>>>>> long)&_erodata),
>>>>> +                          (unsigned long)(&_srodata),
>>>>> +                          PTE_VALID | PTE_READABLE);
>>>>> +
>>>>> +    setup_initial_mapping(second, first, zeroeth,
>>>>> +                          linker_addr_start,
>>>>> +                          linker_addr_end,
>>>>> +                          load_addr_start,
>>>>> +                          PTE_LEAF_DEFAULT | PTE_READABLE);
>>
>> ... this is not cover above. AFAIU, this is the one for the 1:1
>> mapping.
> But there is no 1:1 mapping.
> I thought that 1:1 mapping is when the physical address is equal to
> 0x10020 then after MMU is enabled the virtual address will be the same
> 0x10020 and mapped to 0x10020.
> 
> Probably I missed something but this code maps all linker addresses
> to correspondent load addresses and it will be 1:1 only in case when
> load address is equal to linker address.

... here you say this is not the purpose.

Also, if you don't intend to deal with load != link address yet, then 
the following BUG_ON() needs to be updated:

+    if (load_addr_start != linker_addr_start)
+        BUG_ON((linker_addr_end > load_addr_start && load_addr_end > 
linker_addr_start));

>>
>>>>
>>>> As I said in v1, you need to use a different set of page-table
>>>> here.
>>> If I understand you correctly I have to use a different set of
>>> page-
>>> table in case when it is possible that size of Xen will be larger
>>> than
>>> PAGE_SIZE. So I added to xen.lds.S a check to be sure that the size
>>> fits into PAGE_SIZE.
>>
>> This is not what I was referring to. I was pointing out that second,
>> first, zeroeth are exactly the same for all the callers. You want to
>> introduce a second set of zeroeth table. You will want to do the same
>> for first but it is not always used.
>>
>> Otherwise, this is not going to work if Xen is loaded at a different
>> address than the runtime.
> Ok.
> 
> I understand what do you mean in general but still I am not sure that I
> understand a particular case when I am sure that the size of Xen is no
> bigger then 2MB and load address is aligned on 2Mb boundary.
> 
> The size of one l0 page table is 512 (1 << 9 ) entries which covers 4K
> (1 << 12) * 512 = 2 Mb of memory so it should be fine.
> All the callers in my case are trying to map addresses from
> [linker_start, linker_end] which is not bigger then 2 MB and is aligned
> on 2 MB boundary.

I interpreted that your last call was meant to be for the 1:1 mapping 
when load != link address. It looks like I was wrong, so the same 
page-table should be OK.

> 
>>
>> That said, when I spoke with Andrew yesterday, he mentioned that your
>> initial goal is to support the case where Xen is loaded at the
>> runtime
>> address. I understand this simplifies a lot the code and I told him I
>> was OK with that. However, it would be good to document what are your
>> goals in each series (this is not always clear what you skip on
>> purpose).
>>
>>>
>>>> Also, where do you guarantee that Xen will be loaded at a 2MB
>>>> aligned
>>>> address? (For a fact I know that UEFI is only guarantee 4KB
>>>> alignment).
>>> There is a check in setup_initial_pagetables:
>>>        BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
>>
>> This is not very obvious the check is to confirm that Xen is probably
>> aligned. I would suggest to add a comment.
>>
>> Also, you might want to use XEN_PT_LEVEL_SIZE(..) to make it more
>> obvious what sort of alignment you are trying to enforce.
> I will add the comment and use XEN_PT_LEVEL_SIZE(...) instead.
>>
>>>>> +        la      sp, cpu0_boot_stack
>>>>> +        li      t0, STACK_SIZE
>>>>> +        add     sp, sp, t0
>>>>> +
>>>>> +        /*
>>>>> +         * Re-init an address of exception handler as it was
>>>>> overwritten  with
>>>>> +         * the address of the .L_mmu_is_enabled label.
>>>>> +         * Before jump to trap_init save return address of
>>>>> enable_mmu() to
>>>>> +         * know where we should back after enable_mmu() will
>>>>> be
>>>>> finished.
>>>>> +         */
>>>>> +        mv      s0, ra
>>>>> +        lla     t0, trap_init
>>>>> +        jalr    ra, t0
>>>>> +
>>>>> +        /*
>>>>> +         * Re-calculate the return address of enable_mmu()
>>>>> function for case
>>>>> +         * when linker start address isn't equal to load start
>>>>> address
>>>>> +         */
>>>>> +        add     s0, s0, t1
>>>>> +        mv      ra, s0
>>>>> +
>>>>> +        ret
>>>>
>>>> Missing ENDPROC?
>>> I haven't seen the usage of ENDPROC for RISC-V so it looks like it
>>> is
>>> not necessary.
>>
>> Ok. Would the objdump be able to report the function properly then? I
>> know that on Arm, it was necessary report assembly function properly.
> It is fine for RISC-V:
> 
> Disassembly of section .text:
> 
> 0000000000200000 <_start>:
>    200000:       10401073                csrw    sie,zero
>    200004:       6299                    lui     t0,0x6
> ...
>    20003c:       00000013                nop
> 
> 0000000000200040 <enable_mmu>:
>    200040:       0003a297                auipc   t0,0x3a
> ...
>    20009e:       941a                    add     s0,s0,t1
>    2000a0:       80a2                    mv      ra,s0
>    2000a2:       8082                    ret
>          ...
> 
> 00000000002000b0 <__bitmap_empty>:
>    2000b0:       1141                    addi    sp,sp,-16
>    2000b2:       e422                    sd      s0,8(sp)
>    2000b4:       0800                    addi    s0,sp,16
> ...

Perfect thanks!

Cheers,
Oleksii Kurochko March 23, 2023, 12:30 p.m. UTC | #12
On Thu, 2023-03-23 at 11:57 +0000, Julien Grall wrote:
> On 23/03/2023 11:18, Oleksii wrote:
> > Hi Julien,
> 
> Hi Oleksii,
> 
> > On Wed, 2023-03-22 at 14:21 +0000, Julien Grall wrote:
> > ...
> > > 
> > > > > > +    unsigned long page_addr;
> > > > > > +
> > > > > > +    // /* align start addresses */
> > > > > > +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > > > > > +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
> > > > > 
> > > > > They should be switched to ASSERT() or BUG_ON().
> > > > Sure. Thanks. I'll update.
> > > > > 
> > > > > > +
> > > > > > +    page_addr = map_start;
> > > > > > +    while ( page_addr < map_end )
> > > > > 
> > > > > This loop is continue to assume that only the mapping can
> > > > > first
> > > > > in
> > > > > 2MB
> > > > > section (or less if the start is not 2MB aligned).
> > > > > 
> > > > > I am OK if you want to assume it, but I think this should be
> > > > > documented
> > > > > (with words and ASSERT()/BUG_ON()) to avoid any mistake.
> > > > I add a check in setup_initial_pagetables:
> > > >        BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) !=
> > > > 0);
> > > > Probably this is not a correct place and should be moved to
> > > > setup_initial_mapping() instead of setup_initial_pagetables()
> > > 
> > > Yes it should be moved in setup_initial_mapping().
> > Then I'll moved it to setup_initial_mapping()
> > > 
> > > > > 
> > > > > > +    {
> > > > > > +        unsigned long index2 = pt_index(2, page_addr);
> > > > > > +        unsigned long index1 = pt_index(1, page_addr);
> > > > > > +        unsigned long index0 = pt_index(0, page_addr);
> > > > > > +
> > > > > > +        /* Setup level2 table */
> > > > > > +        second[index2] = paddr_to_pte((unsigned
> > > > > > long)first);
> > > > > > +        second[index2].pte |= PTE_TABLE;
> > > > > > +
> > > > > > +        /* Setup level1 table */
> > > > > > +        first[index1] = paddr_to_pte((unsigned
> > > > > > long)zeroeth);
> > > > > > +        first[index1].pte |= PTE_TABLE;
> > > > > > +
> > > > > > +
> > > > > 
> > > > > NIT: Spurious line?
> > > > Yeah, should be removed. Thanks.
> > > > > 
> > > > > > +        /* Setup level0 table */
> > > > > > +        if ( !pte_is_valid(&zeroeth[index0]) )
> > > > > 
> > > > > On the previous version, you said it should be checked for
> > > > > each
> > > > > level.
> > > > I had a terrible internet connection, and my message wasn't
> > > > sent.
> > > 
> > > No worries.
> > > 
> > > > 
> > > > I decided not to check that l2 and l1 are used only for
> > > > referring
> > > > to
> > > > the next page table but not leaf PTE. So it is safe to
> > > > overwrite it
> > > > each time (the addresses of page tables are the same all the
> > > > time)
> > > 
> > > You are letting the caller to decide which page-table to use for
> > > each
> > > level. So you are at the mercy that caller will do the right
> > > thing.
> > > 
> > > IHMO, this is a pretty bad idea because debugging page-tables
> > > error
> > > are
> > > difficult. So it is better to have safety in place. This is not
> > > worth...
> > > 
> > >    and
> > > > probably it will be better from optimization point of view to
> > > > ignore if
> > > > clauses.
> > > 
> > > ... the optimization in particular when this is at boot time.
> > I didn't think about that caller will do always the right thing so
> > I will add the check.
> > > 
> > > > 
> > > > And it is needed in case of L0 because it is possible that some
> > > > addressed were marked with specific flag ( execution, read,
> > > > write )
> > > > and
> > > > so not to overwrite the flags set before the check is needed.
> > > In which case you should really report an error because the
> > > caller
> > > may
> > > have decide to set execution flag and you don't honor. So when
> > > the
> > > code
> > > is executed, you will receive a fault and this may be hard to
> > > find
> > > out
> > > what happen.
> > 
> > Right now, it is expected situation that the caller will try to
> > change
> > execution flag during the setup of initial page tables. >
> > It is possible in the currently implemented logic of the setup of
> > initial page tables.
> 
> This sounds like a bug in your caller implementation. You should not
> try 
> to work around this in your code updating the page-tables.
> 
> > Let me explain what I mean.
> > 
> > The first step of setup_initial_pagetables() is to map .text,
> > .init,
> > .rodata with necessary flags RX, RX, R.
> > Remaining sections will have RW flags, and to map them,
> > setup_initial_mapping() is called for the whole range of
> > [linker_start,
> > linker_end] not to map them one by one thereby during this step
> > setup_initial_mapping() will try to remap addresses ranges which
> > overlap with .text, .init, .rodata with RW flags but it should
> > leave
> > with the previously set flags.
> Why do you need to call setup_init_mapping() with the whole range? In
> fact the only reason I can think this is useful is when you when to 
> create a 1:1 mapping when the linker and load address is different.
It is needed to avoid mapping each section separately one by one, as most of
the sections have the same PTE_FLAGS (Read, Write, eXecutable, etc.)

So it was decided to map the following sections separately as they have
'unique' flags:
 - .text -> RX
 - .rodata -> R
 - .init.text -> RX

All other sections are RW and could be covered by one call of
setup_init_mapping() for the whole range:
 - .data.ro_after_init
 - .data.read_mostly 
 - .data 
 - .init.data 
 - .bss
So some ranges ( .text, .rodata, .init.text ) from the whole range will
be skipped as already mapped and the rest sections will be mapped
during one call of setup_init_mapping().

> But...
> 
> > > > > > +
> > > > > > +    /* Get the addresses where the page tables were loaded
> > > > > > */
> > > > > > +    second  = (pte_t *)(&xen_second_pagetable);
> > > > > > +    first   = (pte_t *)(&xen_first_pagetable);
> > > > > > +    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
> > > > > > +
> > > > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&_stext),
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&_etext),
> > > > > > +                          (unsigned long)&_stext,
> > > > > > +                          PTE_LEAF_DEFAULT);
> > > > > > +
> > > > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&__init_begin),
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&__init_end),
> > > > > > +                          (unsigned long)&__init_begin,
> > > > > > +                          PTE_LEAF_DEFAULT |
> > > > > > PTE_WRITABLE);
> > > > > > +
> > > > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&_srodata),
> > > > > > +                          LOAD_TO_LINK((unsigned
> > > > > > long)&_erodata),
> > > > > > +                          (unsigned long)(&_srodata),
> > > > > > +                          PTE_VALID | PTE_READABLE);
> > > > > > +
> > > > > > +    setup_initial_mapping(second, first, zeroeth,
> > > > > > +                          linker_addr_start,
> > > > > > +                          linker_addr_end,
> > > > > > +                          load_addr_start,
> > > > > > +                          PTE_LEAF_DEFAULT |
> > > > > > PTE_READABLE);
> > > 
> > > ... this is not cover above. AFAIU, this is the one for the 1:1
> > > mapping.
> > But there is no 1:1 mapping.
> > I thought that 1:1 mapping is when the physical address is equal to
> > 0x10020 then after MMU is enabled the virtual address will be the
> > same
> > 0x10020 and mapped to 0x10020.
> > 
> > Probably I missed something but this code maps all linker addresses
> > to correspondent load addresses and it will be 1:1 only in case
> > when
> > load address is equal to linker address.
> 
> ... here you say this is not the purpose.
> 
> Also, if you don't intend to deal with load != link address yet, then
> the following BUG_ON() needs to be updated:
> 
> +    if (load_addr_start != linker_addr_start)
> +        BUG_ON((linker_addr_end > load_addr_start && load_addr_end >
> linker_addr_start));
I think that I'll cover it in the new patch series as I clarified all
necessary information and I don't expect that it will be too hard to
add from the start.
> 
> > > 
> > > > > 
> > > > > As I said in v1, you need to use a different set of page-
> > > > > table
> > > > > here.
> > > > If I understand you correctly I have to use a different set of
> > > > page-
> > > > table in case when it is possible that size of Xen will be
> > > > larger
> > > > then
> > > > PAGE_SIZE. So I added to xen.lds.S a check to be sure that the
> > > > size
> > > > fits into PAGE_SIZE.
> > > 
> > > This is not what I was referring to. I was pointing out that
> > > second,
> > > first, zeroeth are exactly the same for all the callers. You want
> > > to
> > > introduce a second set of zeroeth table. You will want to do the
> > > same
> > > for first but it is not always used.
> > > 
> > > Otherwise, this is not going to work if Xen is loaded at a
> > > different
> > > address than the runtime.
> > Ok.
> > 
> > I understand what do you mean in general but still I am not sure
> > that I
> > understand a particular case when I am sure that the size of Xen is
> > no
> > bigger than 2MB and load address is aligned on a 2MB boundary.
> > 
> > The size of one l0 page table is 512 (1 << 9 ) entries which covers
> > 4K
> > (1 << 12) * 512 = 2 Mb of memory so it should be fine.
> > All the callers in my case are trying to map addresses from
> > [linker_start, linker_end] which is not bigger then 2 MB and is
> > aligned
> > on 2 MB boundary.
> 
> I interpreted that your last call was meant to be for the 1:1 mapping
> when load != link address. It looks like I was wrong, so the same 
> page-table should be OK.
Thanks. Now we are on the same page.

But actually I like an idea to have more page tables and remove a
limitation to be aligned on 2MB boundary from the start.

I experimented with adding an additional table (if it is necessary)
so it shouldn't be hard to add that code too.
> 
> > 
> > > 
> > > That said, when I spoke with Andrew yesterday, he mentioned that
> > > your
> > > initial goal is to support the case where Xen is loaded at the
> > > runtime
> > > address. I understand this simplifies a lot the code and I told
> > > him I
> > > was OK with that. However, it would be good to document what are
> > > your
> > > goals in each series (this is not always clear what you skip on
> > > purpose).
> > > 
> > > > 
> > > > > Also, where do you guarantee that Xen will be loaded at a 2MB
> > > > > aligned
> > > > > address? (For a fact I know that UEFI is only guarantee 4KB
> > > > > alignment).
> > > > There is a check in setup_initial_pagetables:
> > > >        BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) !=
> > > > 0);
> > > 
> > > This is not very obvious the check is to confirm that Xen is
> > > probably
> > > aligned. I would suggest to add a comment.
> > > 
> > > Also, you might want to use XEN_PT_LEVEL_SIZE(..) to make it more
> > > obvious what sort of alignment you are trying to enforce.
> > I will update add the comment and use XEN_PT_LEVEL_SIZE(...)
> > instead.
> > > 
> > > > > > +        la      sp, cpu0_boot_stack
> > > > > > +        li      t0, STACK_SIZE
> > > > > > +        add     sp, sp, t0
> > > > > > +
> > > > > > +        /*
> > > > > > +         * Re-init an address of exception handler as it
> > > > > > was
> > > > > > overwritten  with
> > > > > > +         * the address of the .L_mmu_is_enabled label.
> > > > > > +         * Before jump to trap_init save return address of
> > > > > > enable_mmu() to
> > > > > > +         * know where we should back after enable_mmu()
> > > > > > will
> > > > > > be
> > > > > > finished.
> > > > > > +         */
> > > > > > +        mv      s0, ra
> > > > > > +        lla     t0, trap_init
> > > > > > +        jalr    ra, t0
> > > > > > +
> > > > > > +        /*
> > > > > > +         * Re-calculate the return address of enable_mmu()
> > > > > > function for case
> > > > > > +         * when linker start address isn't equal to load
> > > > > > start
> > > > > > address
> > > > > > +         */
> > > > > > +        add     s0, s0, t1
> > > > > > +        mv      ra, s0
> > > > > > +
> > > > > > +        ret
> > > > > 
> > > > > Missing ENDPROC?
> > > > I haven't seen the usage of ENDPROC for RISC-V so it looks like
> > > > it
> > > > is
> > > > not necessary.
> > > 
> > > Ok. Would the objdump be able to report the function properly
> > > then? I
> > > know that on Arm, it was necessary report assembly function
> > > properly.
> > It is fine for RISC-V:
> > 
> > Disassembly of section .text:
> > 
> > 0000000000200000 <_start>:
> >    200000:       10401073                csrw    sie,zero
> >    200004:       6299                    lui     t0,0x6
> > ...
> >    20003c:       00000013                nop
> > 
> > 0000000000200040 <enable_mmu>:
> >    200040:       0003a297                auipc   t0,0x3a
> > ...
> >    20009e:       941a                    add     s0,s0,t1
> >    2000a0:       80a2                    mv      ra,s0
> >    2000a2:       8082                    ret
> >          ...
> > 
> > 00000000002000b0 <__bitmap_empty>:
> >    2000b0:       1141                    addi    sp,sp,-16
> >    2000b2:       e422                    sd      s0,8(sp)
> >    2000b4:       0800                    addi    s0,sp,16
> > ...
> 
> Perfect thanks!
> 
~ Oleksii
Julien Grall March 23, 2023, 12:44 p.m. UTC | #13
Hi Oleksii,

On 23/03/2023 12:30, Oleksii wrote:
> On Thu, 2023-03-23 at 11:57 +0000, Julien Grall wrote:
>> On 23/03/2023 11:18, Oleksii wrote:
>>> Hi Julien,
>>
>> Hi Oleksii,
>>
>>> On Wed, 2023-03-22 at 14:21 +0000, Julien Grall wrote:
>>> ...
>>>>
>>>>>>> +    unsigned long page_addr;
>>>>>>> +
>>>>>>> +    // /* align start addresses */
>>>>>>> +    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>>>>>> +    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
>>>>>>
>>>>>> They should be switched to ASSERT() or BUG_ON().
>>>>> Sure. Thanks. I'll update.
>>>>>>
>>>>>>> +
>>>>>>> +    page_addr = map_start;
>>>>>>> +    while ( page_addr < map_end )
>>>>>>
>>>>>> This loop is continue to assume that only the mapping can
>>>>>> first
>>>>>> in
>>>>>> 2MB
>>>>>> section (or less if the start is not 2MB aligned).
>>>>>>
>>>>>> I am OK if you want to assume it, but I think this should be
>>>>>> documented
>>>>>> (with words and ASSERT()/BUG_ON()) to avoid any mistake.
>>>>> I add a check in setup_initial_pagetables:
>>>>>         BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) !=
>>>>> 0);
>>>>> Probably this is not a correct place and should be moved to
>>>>> setup_initial_mapping() instead of setup_initial_pagetables()
>>>>
>>>> Yes it should be moved in setup_initial_mapping().
>>> Then I'll moved it to setup_initial_mapping()
>>>>
>>>>>>
>>>>>>> +    {
>>>>>>> +        unsigned long index2 = pt_index(2, page_addr);
>>>>>>> +        unsigned long index1 = pt_index(1, page_addr);
>>>>>>> +        unsigned long index0 = pt_index(0, page_addr);
>>>>>>> +
>>>>>>> +        /* Setup level2 table */
>>>>>>> +        second[index2] = paddr_to_pte((unsigned
>>>>>>> long)first);
>>>>>>> +        second[index2].pte |= PTE_TABLE;
>>>>>>> +
>>>>>>> +        /* Setup level1 table */
>>>>>>> +        first[index1] = paddr_to_pte((unsigned
>>>>>>> long)zeroeth);
>>>>>>> +        first[index1].pte |= PTE_TABLE;
>>>>>>> +
>>>>>>> +
>>>>>>
>>>>>> NIT: Spurious line?
>>>>> Yeah, should be removed. Thanks.
>>>>>>
>>>>>>> +        /* Setup level0 table */
>>>>>>> +        if ( !pte_is_valid(&zeroeth[index0]) )
>>>>>>
>>>>>> On the previous version, you said it should be checked for
>>>>>> each
>>>>>> level.
>>>>> I had a terrible internet connection, and my message wasn't
>>>>> sent.
>>>>
>>>> No worries.
>>>>
>>>>>
>>>>> I decided not to check that l2 and l1 are used only for
>>>>> referring
>>>>> to
>>>>> the next page table but not leaf PTE. So it is safe to
>>>>> overwrite it
>>>>> each time (the addresses of page tables are the same all the
>>>>> time)
>>>>
>>>> You are letting the caller to decide which page-table to use for
>>>> each
>>>> level. So you are at the mercy that caller will do the right
>>>> thing.
>>>>
>>>> IHMO, this is a pretty bad idea because debugging page-tables
>>>> error
>>>> are
>>>> difficult. So it is better to have safety in place. This is not
>>>> worth...
>>>>
>>>>     and
>>>>> probably it will be better from optimization point of view to
>>>>> ignore if
>>>>> clauses.
>>>>
>>>> ... the optimization in particular when this is at boot time.
>>> I didn't think about that caller will do always the right thing so
>>> I will add the check.
>>>>
>>>>>
>>>>> And it is needed in case of L0 because it is possible that some
>>>>> addressed were marked with specific flag ( execution, read,
>>>>> write )
>>>>> and
>>>>> so not to overwrite the flags set before the check is needed.
>>>> In which case you should really report an error because the
>>>> caller
>>>> may
>>>> have decide to set execution flag and you don't honor. So when
>>>> the
>>>> code
>>>> is executed, you will receive a fault and this may be hard to
>>>> find
>>>> out
>>>> what happen.
>>>
>>> Right now, it is expected situation that the caller will try to
>>> change
>>> execution flag during the setup of initial page tables. >
>>> It is possible in the currently implemented logic of the setup of
>>> initial page tables.
>>
>> This sounds like a bug in your caller implementation. You should not
>> try
>> to workaround this in your code updating the page-tables.
>>
>>> Let me explain what I mean.
>>>
>>> The first step of setup_initial_pagetables() is to map .text,
>>> .init,
>>> .rodata with necessary flags RX, RX, R.
>>> Remaining sections will have RW flags, and to map them,
>>> setup_initial_mapping() is called for the whole range of
>>> [linker_start,
>>> linker_end] not to map them one by one thereby during this step
>>> setup_initial_mapping() will try to remap addresses ranges which
>>> overlap with .text, .init, .rodata with RW flags but it should
>>> leave
>>> with the previously set flags.
>> Why do you need to call setup_init_mapping() with the whole range? In
>> fact the only reason I can think this is useful is when you when to
>> create a 1:1 mapping when the linker and load address is different.
> It is needed to not map each section separately one by one as most of
> the sections have the same PTE_FLAGS (Read, Write, eXecutable, etc.)
> 
> So it was decided to map the following sections separately as they have
> 'unique' flags:
>   - .text -> RX
>   - .rodata -> R
>   - .init.text -> RX
> 
> All other sections are RW and could be covered by one call of
> setup_init_mapping() for the whole range:
>   - .data.ro_after_init
>   - .data.read_mostly
>   - .data
>   - .init.data
>   - .bss
> So some ranges ( .text, .rodata, .init.text ) from the whole range will
> be skipped as already mapped and the rest sections will be mapped
> during one call of setup_init_mapping().

This approach is very fragile and not scalable because:
  * You can't use setup_initial_mapping() to change the permission flags.
  * You can't catch caller mistakes (e.g. imagine you end up to use a 
different physical region).

I can see two solutions:

   1) Loop through the region page by page and check it has the permission 
you want (see the loop in setup_pagetables() for Arm).
   2) Re-order the calls so you first map all of Xen and then update the 
permission flags as appropriate.

I don't have a strong preference between the two options here.

Cheers,
Oleksii Kurochko March 27, 2023, 10:50 a.m. UTC | #14
Hello Julien,

On Tue, 2023-03-21 at 17:58 +0000, Julien Grall wrote:
> > +        /* Setup level0 table */
> > +        if ( !pte_is_valid(&zeroeth[index0]) )
> 
> On the previous version, you said it should be checked for each
> level. 
> But here you still only check for a single level.
> 
> Furthermore, I would strongly suggest to also check the valid PTE is
> the 
> same as you intend to write to catch any override (they are a pain to
> debug).
> 
Do you mean that I have to get a virtual address, extract the page
table index bits from it, traverse the page table hierarchy to locate
the PTE for the virtual address and the compare the gotten PTE address
with zeroeh[index0]?

> > +        {
> > +            /* Update level0 table */
> > +            zeroeth[index0] = paddr_to_pte((page_addr - map_start)
> > + pa_start);
> > +            zeroeth[index0].pte |= flags;
> > +        }
> > +
> > +        /* Point to next page */
> > +        page_addr += XEN_PT_LEVEL_SIZE(0);
> > +    }
> > +}

~ Oleksii
Julien Grall March 27, 2023, 10:54 a.m. UTC | #15
On 27/03/2023 11:50, Oleksii wrote:
> Hello Julien,

Hi,

> On Tue, 2023-03-21 at 17:58 +0000, Julien Grall wrote:
>>> +        /* Setup level0 table */
>>> +        if ( !pte_is_valid(&zeroeth[index0]) )
>>
>> On the previous version, you said it should be checked for each
>> level.
>> But here you still only check for a single level.
>>
>> Furthermore, I would strongly suggest to also check the valid PTE is
>> the
>> same as you intend to write to catch any override (they are a pain to
>> debug).
>>
> Do you mean that I have to get a virtual address, extract the page
> table index bits from it, traverse the page table hierarchy to locate
> the PTE for the virtual address and the compare the gotten PTE address
> with zeroeh[index0]?

No. I was suggesting to check that the PTE you intend to write is exactly 
the same as the one you are overwriting.

This could potentially be relaxed to allow permission flags to be changed.

Cheers,
diff mbox series

Patch

diff --git a/xen/arch/riscv/Makefile b/xen/arch/riscv/Makefile
index 443f6bf15f..956ceb02df 100644
--- a/xen/arch/riscv/Makefile
+++ b/xen/arch/riscv/Makefile
@@ -1,5 +1,6 @@ 
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-y += entry.o
+obj-y += mm.o
 obj-$(CONFIG_RISCV_64) += riscv64/
 obj-y += sbi.o
 obj-y += setup.o
diff --git a/xen/arch/riscv/include/asm/mm.h b/xen/arch/riscv/include/asm/mm.h
new file mode 100644
index 0000000000..3cc98fe45b
--- /dev/null
+++ b/xen/arch/riscv/include/asm/mm.h
@@ -0,0 +1,8 @@ 
+#ifndef _ASM_RISCV_MM_H
+#define _ASM_RISCV_MM_H
+
+void setup_initial_pagetables(void);
+
+extern void enable_mmu(void);
+
+#endif /* _ASM_RISCV_MM_H */
diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
new file mode 100644
index 0000000000..fb8329a191
--- /dev/null
+++ b/xen/arch/riscv/include/asm/page.h
@@ -0,0 +1,67 @@ 
+#ifndef _ASM_RISCV_PAGE_H
+#define _ASM_RISCV_PAGE_H
+
+#include <xen/const.h>
+#include <xen/types.h>
+
+#define PAGE_ENTRIES                (1 << PAGETABLE_ORDER)
+#define VPN_MASK                    ((unsigned long)(PAGE_ENTRIES - 1))
+
+#define PAGE_ORDER                  (12)
+
+#ifdef CONFIG_RISCV_64
+#define PAGETABLE_ORDER             (9)
+#else /* CONFIG_RISCV_32 */
+#define PAGETABLE_ORDER             (10)
+#endif
+
+#define LEVEL_ORDER(lvl)            (lvl * PAGETABLE_ORDER)
+#define LEVEL_SHIFT(lvl)            (LEVEL_ORDER(lvl) + PAGE_ORDER)
+#define LEVEL_SIZE(lvl)             (_AT(paddr_t, 1) << LEVEL_SHIFT(lvl))
+
+#define XEN_PT_LEVEL_SHIFT(lvl)     LEVEL_SHIFT(lvl)
+#define XEN_PT_LEVEL_ORDER(lvl)     LEVEL_ORDER(lvl)
+#define XEN_PT_LEVEL_SIZE(lvl)      LEVEL_SIZE(lvl)
+#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) - 1))
+#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK << XEN_PT_LEVEL_SHIFT(lvl))
+
+#define PTE_SHIFT                   10
+
+#define PTE_VALID                   BIT(0, UL)
+#define PTE_READABLE                BIT(1, UL)
+#define PTE_WRITABLE                BIT(2, UL)
+#define PTE_EXECUTABLE              BIT(3, UL)
+#define PTE_USER                    BIT(4, UL)
+#define PTE_GLOBAL                  BIT(5, UL)
+#define PTE_ACCESSED                BIT(6, UL)
+#define PTE_DIRTY                   BIT(7, UL)
+#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
+
+#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE | PTE_EXECUTABLE)
+#define PTE_TABLE                   (PTE_VALID)
+
+/* Calculate the offsets into the pagetables for a given VA */
+#define pt_linear_offset(lvl, va)   ((va) >> XEN_PT_LEVEL_SHIFT(lvl))
+
+#define pt_index(lvl, va)   pt_linear_offset(lvl, (va) & XEN_PT_LEVEL_MASK(lvl))
+
+/* Page Table entry */
+typedef struct {
+    uint64_t pte;
+} pte_t;
+
+/* Shift the VPN[x] or PPN[x] fields of a virtual or physical address
+ * to become the shifted PPN[x] fields of a page table entry */
+#define addr_to_ppn(x) (((x) >> PAGE_SHIFT) << PTE_SHIFT)
+
+static inline pte_t paddr_to_pte(const unsigned long paddr)
+{
+    return (pte_t) { .pte = addr_to_ppn(paddr) };
+}
+
+static inline bool pte_is_valid(const pte_t *p)
+{
+    return p->pte & PTE_VALID;
+}
+
+#endif /* _ASM_RISCV_PAGE_H */
diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
new file mode 100644
index 0000000000..0df6b47441
--- /dev/null
+++ b/xen/arch/riscv/mm.c
@@ -0,0 +1,121 @@ 
+#include <xen/compiler.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
+#include <xen/page-size.h>
+
+#include <asm/boot-info.h>
+#include <asm/config.h>
+#include <asm/csr.h>
+#include <asm/mm.h>
+#include <asm/page.h>
+#include <asm/traps.h>
+
+/*
+ * xen_second_pagetable is indexed with the VPN[2] page table entry field
+ * xen_first_pagetable is accessed from the VPN[1] page table entry field
+ * xen_zeroeth_pagetable is accessed from the VPN[0] page table entry field
+ */
+pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
+    xen_second_pagetable[PAGE_ENTRIES];
+static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
+    xen_first_pagetable[PAGE_ENTRIES];
+static pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
+    xen_zeroeth_pagetable[PAGE_ENTRIES];
+
+extern unsigned long __init_begin[];
+extern unsigned long __init_end[];
+extern unsigned char cpu0_boot_stack[STACK_SIZE];
+
+static void __init
+setup_initial_mapping(pte_t *second, pte_t *first, pte_t *zeroeth,
+                      unsigned long map_start,
+                      unsigned long map_end,
+                      unsigned long pa_start,
+                      unsigned long flags)
+{
+    unsigned long page_addr;
+
+    // /* align start addresses */
+    // map_start &= XEN_PT_LEVEL_MAP_MASK(0);
+    // pa_start &= XEN_PT_LEVEL_MAP_MASK(0);
+
+    page_addr = map_start;
+    while ( page_addr < map_end )
+    {
+        unsigned long index2 = pt_index(2, page_addr);
+        unsigned long index1 = pt_index(1, page_addr);
+        unsigned long index0 = pt_index(0, page_addr);
+
+        /* Setup level2 table */
+        second[index2] = paddr_to_pte((unsigned long)first);
+        second[index2].pte |= PTE_TABLE;
+
+        /* Setup level1 table */
+        first[index1] = paddr_to_pte((unsigned long)zeroeth);
+        first[index1].pte |= PTE_TABLE;
+
+
+        /* Setup level0 table */
+        if ( !pte_is_valid(&zeroeth[index0]) )
+        {
+            /* Update level0 table */
+            zeroeth[index0] = paddr_to_pte((page_addr - map_start) + pa_start);
+            zeroeth[index0].pte |= flags;
+        }
+
+        /* Point to next page */
+        page_addr += XEN_PT_LEVEL_SIZE(0);
+    }
+}
+
+/*
+ * setup_initial_pagetables:
+ *
+ * Build the page tables for Xen that map the following:
+ *   load addresses to linker addresses
+ */
+void __init setup_initial_pagetables(void)
+{
+    pte_t *second;
+    pte_t *first;
+    pte_t *zeroeth;
+
+    unsigned long load_addr_start   = boot_info.load_start;
+    unsigned long load_addr_end     = boot_info.load_end;
+    unsigned long linker_addr_start = boot_info.linker_start;
+    unsigned long linker_addr_end   = boot_info.linker_end;
+
+    BUG_ON(load_addr_start % (PAGE_ENTRIES * PAGE_SIZE) != 0);
+    if (load_addr_start != linker_addr_start)
+        BUG_ON((linker_addr_end > load_addr_start && load_addr_end > linker_addr_start));
+
+    /* Get the addresses where the page tables were loaded */
+    second  = (pte_t *)(&xen_second_pagetable);
+    first   = (pte_t *)(&xen_first_pagetable);
+    zeroeth = (pte_t *)(&xen_zeroeth_pagetable);
+
+    setup_initial_mapping(second, first, zeroeth,
+                          LOAD_TO_LINK((unsigned long)&_stext),
+                          LOAD_TO_LINK((unsigned long)&_etext),
+                          (unsigned long)&_stext,
+                          PTE_LEAF_DEFAULT);
+
+    setup_initial_mapping(second, first, zeroeth,
+                          LOAD_TO_LINK((unsigned long)&__init_begin),
+                          LOAD_TO_LINK((unsigned long)&__init_end),
+                          (unsigned long)&__init_begin,
+                          PTE_LEAF_DEFAULT | PTE_WRITABLE);
+
+    setup_initial_mapping(second, first, zeroeth,
+                          LOAD_TO_LINK((unsigned long)&_srodata),
+                          LOAD_TO_LINK((unsigned long)&_erodata),
+                          (unsigned long)(&_srodata),
+                          PTE_VALID | PTE_READABLE);
+
+    setup_initial_mapping(second, first, zeroeth,
+                          linker_addr_start,
+                          linker_addr_end,
+                          load_addr_start,
+                          PTE_LEAF_DEFAULT | PTE_READABLE);
+}
diff --git a/xen/arch/riscv/riscv64/head.S b/xen/arch/riscv/riscv64/head.S
index 8887f0cbd4..f4a0582727 100644
--- a/xen/arch/riscv/riscv64/head.S
+++ b/xen/arch/riscv/riscv64/head.S
@@ -1,4 +1,5 @@ 
 #include <asm/asm.h>
+#include <asm/asm-offsets.h>
 #include <asm/riscv_encoding.h>
 
         .section .text.header, "ax", %progbits
@@ -32,3 +33,67 @@  ENTRY(start)
         add     sp, sp, t0
 
         tail    start_xen
+
+ENTRY(enable_mmu)
+        /* Calculate physical offset between linker and load addresses */
+        la      t0, boot_info
+        REG_L   t1, BI_LINKER_START(t0)
+        REG_L   t2, BI_LOAD_START(t0)
+        sub     t1, t1, t2
+
+        /*
+         * Calculate and update a linker time address of the .L_mmu_is_enabled
+         * label and update CSR_STVEC with it.
+         * MMU is configured in a way where linker addresses are mapped
+         * on load addresses so case when linker addresses are not equal to
+         * load addresses, and thereby, after MMU is enabled, it will cause
+         * an exception and jump to linker time addresses
+         */
+        la      t3, .L_mmu_is_enabled
+        add     t3, t3, t1
+        csrw    CSR_STVEC, t3
+
+        /* Calculate a value for SATP register */
+        li      t5, SATP_MODE_SV39
+        li      t6, SATP_MODE_SHIFT
+        sll     t5, t5, t6
+
+        la      t4, xen_second_pagetable
+        srl     t4, t4, PAGE_SHIFT
+        or      t4, t4, t5
+        sfence.vma
+        csrw    CSR_SATP, t4
+
+        .align 2
+.L_mmu_is_enabled:
+        /*
+         * Stack should be re-inited as:
+         * 1. Right now an address of the stack is relative to load time
+         *    addresses what will cause an issue in case of load start address
+         *    isn't equal to linker start address.
+         * 2. Addresses in stack are all load time relative which can be an
+         *    issue in case when load start address isn't equal to linker
+         *    start address.
+         */
+        la      sp, cpu0_boot_stack
+        li      t0, STACK_SIZE
+        add     sp, sp, t0
+
+        /*
+         * Re-init an address of exception handler as it was overwritten  with
+         * the address of the .L_mmu_is_enabled label.
+         * Before jump to trap_init save return address of enable_mmu() to
+         * know where we should back after enable_mmu() will be finished.
+         */
+        mv      s0, ra
+        lla     t0, trap_init
+        jalr    ra, t0
+
+        /*
+         * Re-calculate the return address of enable_mmu() function for case
+         * when linker start address isn't equal to load start address
+         */
+        add     s0, s0, t1
+        mv      ra, s0
+
+        ret
diff --git a/xen/arch/riscv/xen.lds.S b/xen/arch/riscv/xen.lds.S
index eed457c492..e4ac4e84b6 100644
--- a/xen/arch/riscv/xen.lds.S
+++ b/xen/arch/riscv/xen.lds.S
@@ -179,3 +179,5 @@  SECTIONS
 
 ASSERT(!SIZEOF(.got),      ".got non-empty")
 ASSERT(!SIZEOF(.got.plt),  ".got.plt non-empty")
+
+ASSERT(_end - _start <= MB(2), "Xen too large for early-boot assumptions")