
[v2,5/8] xen/riscv: introduce asm/pmap.h header

Message ID c7331e4c2f481f069571976a708c4aba04d2c0e4.1720799926.git.oleksii.kurochko@gmail.com (mailing list archive)
State Superseded
Series RISCV device tree mapping

Commit Message

Oleksii Kurochko July 12, 2024, 4:22 p.m. UTC
Introduce arch_pmap_{un}map() functions and select HAS_PMAP
for CONFIG_RISCV.

Additionally it was necessary to introduce the following functions:
 - mfn_to_xen_entry
 - mfn_to_pte

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V2:
 - newly introduced patch
---
 xen/arch/riscv/Kconfig            |  1 +
 xen/arch/riscv/include/asm/page.h |  2 ++
 xen/arch/riscv/include/asm/pmap.h | 28 ++++++++++++++++++++++++++++
 xen/arch/riscv/mm.c               | 14 ++++++++++++++
 4 files changed, 45 insertions(+)
 create mode 100644 xen/arch/riscv/include/asm/pmap.h

Comments

Julien Grall July 21, 2024, 8:51 a.m. UTC | #1
Hi Oleksii,

On 12/07/2024 17:22, Oleksii Kurochko wrote:
> Introduces arch_pmap_{un}map functions and select HAS_PMAP
> for CONFIG_RISCV.
> 
> Additionaly it was necessary to introduce functions:
>   - mfn_to_xen_entry
>   - mfn_to_pte
> 
> Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> ---
> Changes in V2:
>   - newly introduced patch
> ---
>   xen/arch/riscv/Kconfig            |  1 +
>   xen/arch/riscv/include/asm/page.h |  2 ++
>   xen/arch/riscv/include/asm/pmap.h | 28 ++++++++++++++++++++++++++++
>   xen/arch/riscv/mm.c               | 14 ++++++++++++++
>   4 files changed, 45 insertions(+)
>   create mode 100644 xen/arch/riscv/include/asm/pmap.h
> 
> diff --git a/xen/arch/riscv/Kconfig b/xen/arch/riscv/Kconfig
> index 259eea8d3b..0112aa8778 100644
> --- a/xen/arch/riscv/Kconfig
> +++ b/xen/arch/riscv/Kconfig
> @@ -3,6 +3,7 @@ config RISCV
>   	select FUNCTION_ALIGNMENT_16B
>   	select GENERIC_BUG_FRAME
>   	select HAS_DEVICE_TREE
> +	select HAS_PMAP
>   
>   config RISCV_64
>   	def_bool y
> diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
> index cbbf3656d1..339074d502 100644
> --- a/xen/arch/riscv/include/asm/page.h
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -51,6 +51,8 @@ typedef struct {
>   #endif
>   } pte_t;
>   
> +pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr);
> +
>   static inline pte_t paddr_to_pte(paddr_t paddr,
>                                    unsigned int permissions)
>   {
> diff --git a/xen/arch/riscv/include/asm/pmap.h b/xen/arch/riscv/include/asm/pmap.h
> new file mode 100644
> index 0000000000..eb4c48515c
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/pmap.h
> @@ -0,0 +1,28 @@
> +#ifndef __ASM_PMAP_H__
> +#define __ASM_PMAP_H__
> +
> +#include <xen/bug.h>
> +#include <xen/mm.h>
> +
> +#include <asm/fixmap.h>
> +
> +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> +{
> +    pte_t *entry = &xen_fixmap[slot];
> +    pte_t pte;
> +
> +    ASSERT(!pte_is_valid(*entry));
> +
> +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> +    pte.pte |= PTE_LEAF_DEFAULT;
> +    write_pte(entry, pte);
> +}
> +
> +static inline void arch_pmap_unmap(unsigned int slot)
> +{
> +    pte_t pte = {};
> +
> +    write_pte(&xen_fixmap[slot], pte);
> +}
> +
> +#endif /* __ASM_PMAP_H__ */
> diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
> index d69a174b5d..445319af08 100644
> --- a/xen/arch/riscv/mm.c
> +++ b/xen/arch/riscv/mm.c
> @@ -370,3 +370,17 @@ int map_pages_to_xen(unsigned long virt,
>       BUG_ON("unimplemented");
>       return -1;
>   }
> +
> +static inline pte_t mfn_to_pte(mfn_t mfn)
> +{
> +    unsigned long pte = mfn_x(mfn) << PTE_PPN_SHIFT;
> +    return (pte_t){ .pte = pte};
> +}
> +
> +inline pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
> +{
> +    /* there is no attr field in RISC-V's pte */
> +    (void) attr;

Surely you have a way to say indicate whether an entry is readable/writable?

> +
> +    return mfn_to_pte(mfn);
> +}

Cheers,
Jan Beulich July 22, 2024, 12:54 p.m. UTC | #2
On 12.07.2024 18:22, Oleksii Kurochko wrote:
> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/pmap.h
> @@ -0,0 +1,28 @@
> +#ifndef __ASM_PMAP_H__
> +#define __ASM_PMAP_H__
> +
> +#include <xen/bug.h>
> +#include <xen/mm.h>
> +
> +#include <asm/fixmap.h>
> +
> +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> +{
> +    pte_t *entry = &xen_fixmap[slot];
> +    pte_t pte;
> +
> +    ASSERT(!pte_is_valid(*entry));
> +
> +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> +    pte.pte |= PTE_LEAF_DEFAULT;
> +    write_pte(entry, pte);
> +}
> +
> +static inline void arch_pmap_unmap(unsigned int slot)
> +{
> +    pte_t pte = {};
> +
> +    write_pte(&xen_fixmap[slot], pte);
> +}

Why are these not using set_fixmap() / clear_fixmap() respectively?

> --- a/xen/arch/riscv/mm.c
> +++ b/xen/arch/riscv/mm.c
> @@ -370,3 +370,17 @@ int map_pages_to_xen(unsigned long virt,
>      BUG_ON("unimplemented");
>      return -1;
>  }
> +
> +static inline pte_t mfn_to_pte(mfn_t mfn)

This name suggests (to me) that you're getting _the_ (single) PTE for
a given MFN. However, what the function is doing is make a PTE using
the given MFN. On x86 at least the common way to name such a function
would be pte_from_mfn().

> +{
> +    unsigned long pte = mfn_x(mfn) << PTE_PPN_SHIFT;
> +    return (pte_t){ .pte = pte};

Nit: Blank missing.

Jan
Jan Beulich July 22, 2024, 12:58 p.m. UTC | #3
On 21.07.2024 10:51, Julien Grall wrote:
> On 12/07/2024 17:22, Oleksii Kurochko wrote:
>> +inline pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
>> +{
>> +    /* there is no attr field in RISC-V's pte */
>> +    (void) attr;
> 
> Surely you have a way to say indicate whether an entry is readable/writable?

I'm puzzled by this question. The sole outlier in Arm code is pmap.h, in
passing PAGE_HYPERVISOR_RW to this function when all others pass memory
types (MT_*). Afaics only the low three bits are then used in the
function, discarding access control bits altogether. R/W access appears
to be implied.

Jan
Oleksii Kurochko July 22, 2024, 2:40 p.m. UTC | #4
Hi Julien,

On Sun, 2024-07-21 at 09:51 +0100, Julien Grall wrote:
> Hi Oleksii,
> 
> On 12/07/2024 17:22, Oleksii Kurochko wrote:
> > Introduces arch_pmap_{un}map functions and select HAS_PMAP
> > for CONFIG_RISCV.
> > 
> > Additionaly it was necessary to introduce functions:
> >   - mfn_to_xen_entry
> >   - mfn_to_pte
> > 
> > Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> > ---
> > Changes in V2:
> >   - newly introduced patch
> > ---
> >   xen/arch/riscv/Kconfig            |  1 +
> >   xen/arch/riscv/include/asm/page.h |  2 ++
> >   xen/arch/riscv/include/asm/pmap.h | 28
> > ++++++++++++++++++++++++++++
> >   xen/arch/riscv/mm.c               | 14 ++++++++++++++
> >   4 files changed, 45 insertions(+)
> >   create mode 100644 xen/arch/riscv/include/asm/pmap.h
> > 
> > diff --git a/xen/arch/riscv/Kconfig b/xen/arch/riscv/Kconfig
> > index 259eea8d3b..0112aa8778 100644
> > --- a/xen/arch/riscv/Kconfig
> > +++ b/xen/arch/riscv/Kconfig
> > @@ -3,6 +3,7 @@ config RISCV
> >   	select FUNCTION_ALIGNMENT_16B
> >   	select GENERIC_BUG_FRAME
> >   	select HAS_DEVICE_TREE
> > +	select HAS_PMAP
> >   
> >   config RISCV_64
> >   	def_bool y
> > diff --git a/xen/arch/riscv/include/asm/page.h
> > b/xen/arch/riscv/include/asm/page.h
> > index cbbf3656d1..339074d502 100644
> > --- a/xen/arch/riscv/include/asm/page.h
> > +++ b/xen/arch/riscv/include/asm/page.h
> > @@ -51,6 +51,8 @@ typedef struct {
> >   #endif
> >   } pte_t;
> >   
> > +pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr);
> > +
> >   static inline pte_t paddr_to_pte(paddr_t paddr,
> >                                    unsigned int permissions)
> >   {
> > diff --git a/xen/arch/riscv/include/asm/pmap.h
> > b/xen/arch/riscv/include/asm/pmap.h
> > new file mode 100644
> > index 0000000000..eb4c48515c
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/pmap.h
> > @@ -0,0 +1,28 @@
> > +#ifndef __ASM_PMAP_H__
> > +#define __ASM_PMAP_H__
> > +
> > +#include <xen/bug.h>
> > +#include <xen/mm.h>
> > +
> > +#include <asm/fixmap.h>
> > +
> > +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> > +{
> > +    pte_t *entry = &xen_fixmap[slot];
> > +    pte_t pte;
> > +
> > +    ASSERT(!pte_is_valid(*entry));
> > +
> > +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> > +    pte.pte |= PTE_LEAF_DEFAULT;
> > +    write_pte(entry, pte);
> > +}
> > +
> > +static inline void arch_pmap_unmap(unsigned int slot)
> > +{
> > +    pte_t pte = {};
> > +
> > +    write_pte(&xen_fixmap[slot], pte);
> > +}
> > +
> > +#endif /* __ASM_PMAP_H__ */
> > diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
> > index d69a174b5d..445319af08 100644
> > --- a/xen/arch/riscv/mm.c
> > +++ b/xen/arch/riscv/mm.c
> > @@ -370,3 +370,17 @@ int map_pages_to_xen(unsigned long virt,
> >       BUG_ON("unimplemented");
> >       return -1;
> >   }
> > +
> > +static inline pte_t mfn_to_pte(mfn_t mfn)
> > +{
> > +    unsigned long pte = mfn_x(mfn) << PTE_PPN_SHIFT;
> > +    return (pte_t){ .pte = pte};
> > +}
> > +
> > +inline pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
> > +{
> > +    /* there is no attr field in RISC-V's pte */
> > +    (void) attr;
> 
> Surely you have a way to say indicate whether an entry is
> readable/writable?
Sure, there is a way. But I probably misinterpreted attr for Arm and
decided not to use it here. By that I mean that Arm has MT_NORMAL and
MT_DEVICE, which RISC-V doesn't have.

If it is about readable/writable then, for sure, I will start to use
attr.

~ Oleksii

> 
> > +
> > +    return mfn_to_pte(mfn);
> > +}
> 
> Cheers,
>
Oleksii Kurochko July 22, 2024, 2:44 p.m. UTC | #5
On Mon, 2024-07-22 at 14:54 +0200, Jan Beulich wrote:
> On 12.07.2024 18:22, Oleksii Kurochko wrote:
> > --- /dev/null
> > +++ b/xen/arch/riscv/include/asm/pmap.h
> > @@ -0,0 +1,28 @@
> > +#ifndef __ASM_PMAP_H__
> > +#define __ASM_PMAP_H__
> > +
> > +#include <xen/bug.h>
> > +#include <xen/mm.h>
> > +
> > +#include <asm/fixmap.h>
> > +
> > +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> > +{
> > +    pte_t *entry = &xen_fixmap[slot];
> > +    pte_t pte;
> > +
> > +    ASSERT(!pte_is_valid(*entry));
> > +
> > +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> > +    pte.pte |= PTE_LEAF_DEFAULT;
> > +    write_pte(entry, pte);
> > +}
> > +
> > +static inline void arch_pmap_unmap(unsigned int slot)
> > +{
> > +    pte_t pte = {};
> > +
> > +    write_pte(&xen_fixmap[slot], pte);
> > +}
> 
> Why are these not using set_fixmap() / clear_fixmap() respectively?
They haven't been introduced yet. And I thought that these functions are
used only in pmap_{un}map(), which is the reason why I decided not to
introduce them. But while writing the answer to another comment, I
found other places where set_fixmap() / clear_fixmap() are used, so I
will introduce them and reuse them here.

> 
> > --- a/xen/arch/riscv/mm.c
> > +++ b/xen/arch/riscv/mm.c
> > @@ -370,3 +370,17 @@ int map_pages_to_xen(unsigned long virt,
> >      BUG_ON("unimplemented");
> >      return -1;
> >  }
> > +
> > +static inline pte_t mfn_to_pte(mfn_t mfn)
> 
> This name suggests (to me) that you're getting _the_ (single) PTE for
> a given MFN. However, what the function is doing is make a PTE using
> the given MFN. On x86 at least the common way to name such a function
> would be pte_from_mfn().
If it is a common way then I will rename it. Thanks.

~ Oleksii


> 
> > +{
> > +    unsigned long pte = mfn_x(mfn) << PTE_PPN_SHIFT;
> > +    return (pte_t){ .pte = pte};
> 
> Nit: Blank missing.
> 
> Jan
Julien Grall July 22, 2024, 2:48 p.m. UTC | #6
Hi,

On 22/07/2024 15:44, Oleksii Kurochko wrote:
> On Mon, 2024-07-22 at 14:54 +0200, Jan Beulich wrote:
>> On 12.07.2024 18:22, Oleksii Kurochko wrote:
>>> --- /dev/null
>>> +++ b/xen/arch/riscv/include/asm/pmap.h
>>> @@ -0,0 +1,28 @@
>>> +#ifndef __ASM_PMAP_H__
>>> +#define __ASM_PMAP_H__
>>> +
>>> +#include <xen/bug.h>
>>> +#include <xen/mm.h>
>>> +
>>> +#include <asm/fixmap.h>
>>> +
>>> +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
>>> +{
>>> +    pte_t *entry = &xen_fixmap[slot];
>>> +    pte_t pte;
>>> +
>>> +    ASSERT(!pte_is_valid(*entry));
>>> +
>>> +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
>>> +    pte.pte |= PTE_LEAF_DEFAULT;
>>> +    write_pte(entry, pte);
>>> +}
>>> +
>>> +static inline void arch_pmap_unmap(unsigned int slot)
>>> +{
>>> +    pte_t pte = {};
>>> +
>>> +    write_pte(&xen_fixmap[slot], pte);
>>> +}
>>
>> Why are these not using set_fixmap() / clear_fixmap() respectively?
> They haven't been introduced yet. And I thought that these fucntion are
> used only in pmap_{un}map() and that is the reason why I decided to not
> introduce them. But while writing the answer on another comment, I
> found other places where set_fixmap() / clear_fixmap() are used, so I
> will introduce them and reuse here.

I am guessing you are going to implement set_fixmap()/clear_fixmap() 
using map_pages_to_xen(). If so, for early boot you are going to end up 
in a circular loop because map_pages_to_xen() will likely use pmap() 
which will call set_fixmap().

There is a big comment in common/pmap.c which explains why arch_pmap_*
was introduced rather than calling *_fixmap() directly:

     /*
      * We cannot use set_fixmap() here. We use PMAP when the domain map
      * page infrastructure is not yet initialized, so map_pages_to_xen()
      * called by set_fixmap() needs to map pages on demand, which then
      * calls pmap() again, resulting in a loop. Modify the PTEs directly
      * instead. The same is true for pmap_unmap().
      */
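
Put differently, the call chain to avoid during early boot would be
(illustration only):

     set_fixmap()
       -> map_pages_to_xen()      /* has to map page tables on demand */
           -> pmap()
               -> arch_pmap_map()
                   -> set_fixmap()
                       -> ...     /* loop */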

Cheers,
Julien Grall July 22, 2024, 2:57 p.m. UTC | #7
On 22/07/2024 13:58, Jan Beulich wrote:
> On 21.07.2024 10:51, Julien Grall wrote:
>> On 12/07/2024 17:22, Oleksii Kurochko wrote:
>>> +inline pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
>>> +{
>>> +    /* there is no attr field in RISC-V's pte */
>>> +    (void) attr;
>>
>> Surely you have a way to say indicate whether an entry is readable/writable?
> 
> I'm puzzled by this question. The sole outlier in Arm code is pmap.h, in
> passing PAGE_HYPERVISOR_RW to this function when all others pass memory
> > types (MT_*). Afaics only the low three bits are then used in the
> function, discarding access control bits altogether. R/W access appears
> to be implied.

Arm is not exempt from odd interfaces. However, from just this patch, it
is not clear why RISC-V would continue to use the same approach if the
attributes don't exist.

Looking at the rest of the series, it seems to be because we wanted to
have generic page-table code. If there is a desire to continue in this
direction, then I would assume we would need an arch-specific way to set
the read/write bit.

At which point it makes a lot more sense to push setting the access bits
into mfn_to_xen_entry().

Even if we don't do any code consolidation, I think it is odd for Arm
that a caller has to assume mfn_to_xen_entry() will always return a
read-writable page and then update it as necessary. It would be nicer to
push this decision into mfn_to_xen_entry().
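
To illustrate what I mean (only a sketch; the exact flag handling is an
assumption, not a request for this precise interface):

     pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
     {
         pte_t pte = { .pte = mfn_x(mfn) << PTE_PPN_SHIFT };

         /*
          * Let the caller's attributes decide validity and access, e.g.
          * PAGE_HYPERVISOR_RW expanding to PTE_VALID | PTE_READABLE |
          * PTE_WRITABLE (names assumed for the sake of the example).
          */
         pte.pte |= attr;

         return pte;
     }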

Cheers,
Oleksii Kurochko July 22, 2024, 5:09 p.m. UTC | #8
On Mon, 2024-07-22 at 15:48 +0100, Julien Grall wrote:
> Hi,
> 
> On 22/07/2024 15:44, Oleksii Kurochko wrote:
> > On Mon, 2024-07-22 at 14:54 +0200, Jan Beulich wrote:
> > > On 12.07.2024 18:22, Oleksii Kurochko wrote:
> > > > --- /dev/null
> > > > +++ b/xen/arch/riscv/include/asm/pmap.h
> > > > @@ -0,0 +1,28 @@
> > > > +#ifndef __ASM_PMAP_H__
> > > > +#define __ASM_PMAP_H__
> > > > +
> > > > +#include <xen/bug.h>
> > > > +#include <xen/mm.h>
> > > > +
> > > > +#include <asm/fixmap.h>
> > > > +
> > > > +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> > > > +{
> > > > +    pte_t *entry = &xen_fixmap[slot];
> > > > +    pte_t pte;
> > > > +
> > > > +    ASSERT(!pte_is_valid(*entry));
> > > > +
> > > > +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> > > > +    pte.pte |= PTE_LEAF_DEFAULT;
> > > > +    write_pte(entry, pte);
> > > > +}
> > > > +
> > > > +static inline void arch_pmap_unmap(unsigned int slot)
> > > > +{
> > > > +    pte_t pte = {};
> > > > +
> > > > +    write_pte(&xen_fixmap[slot], pte);
> > > > +}
> > > 
> > > Why are these not using set_fixmap() / clear_fixmap()
> > > respectively?
> > They haven't been introduced yet. And I thought that these fucntion
> > are
> > used only in pmap_{un}map() and that is the reason why I decided to
> > not
> > introduce them. But while writing the answer on another comment, I
> > found other places where set_fixmap() / clear_fixmap() are used, so
> > I
> > will introduce them and reuse here.
> 
> I am guessing you are going to implement set_fixmap()/clear_fixmap() 
> using map_pages_to_xen(). If so, for early boot you are going to end
> up 
> in a circular loop because map_pages_to_xen() will likely use pmap() 
> which will call set_fixmap().
I am going to implement it in the following way, as I ran into the issue
you describe the first time I tried to implement it using
map_pages_to_xen():
   /* Map a 4k page in a fixmap entry */
   void set_fixmap(unsigned map, mfn_t mfn, unsigned int flags)
   {
       pte_t pte;
   
       pte = mfn_to_xen_entry(mfn, flags);
       pte.pte |= PTE_LEAF_DEFAULT;
       write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
   }
   
   /* Remove a mapping from a fixmap entry */
   void clear_fixmap(unsigned map)
   {
       pte_t pte = {0};
       write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
   }
   
~ Oleksii

> 
> There is a big comment in common/pmap.c which explain why arch_pmap_*
> was introduced rather than calling *_fixmap() directly:
> 
>      /*
>       * We cannot use set_fixmap() here. We use PMAP when the domain map
>       * page infrastructure is not yet initialized, so map_pages_to_xen()
>       * called by set_fixmap() needs to map pages on demand, which then
>       * calls pmap() again, resulting in a loop. Modify the PTEs directly
>       * instead. The same is true for pmap_unmap().
>       */
> 
> Cheers,
>
Julien Grall July 22, 2024, 5:21 p.m. UTC | #9
Hi,

On 22/07/2024 18:09, Oleksii Kurochko wrote:
> On Mon, 2024-07-22 at 15:48 +0100, Julien Grall wrote:
>> Hi,
>>
>> On 22/07/2024 15:44, Oleksii Kurochko wrote:
>>> On Mon, 2024-07-22 at 14:54 +0200, Jan Beulich wrote:
>>>> On 12.07.2024 18:22, Oleksii Kurochko wrote:
>>>>> --- /dev/null
>>>>> +++ b/xen/arch/riscv/include/asm/pmap.h
>>>>> @@ -0,0 +1,28 @@
>>>>> +#ifndef __ASM_PMAP_H__
>>>>> +#define __ASM_PMAP_H__
>>>>> +
>>>>> +#include <xen/bug.h>
>>>>> +#include <xen/mm.h>
>>>>> +
>>>>> +#include <asm/fixmap.h>
>>>>> +
>>>>> +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
>>>>> +{
>>>>> +    pte_t *entry = &xen_fixmap[slot];
>>>>> +    pte_t pte;
>>>>> +
>>>>> +    ASSERT(!pte_is_valid(*entry));
>>>>> +
>>>>> +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
>>>>> +    pte.pte |= PTE_LEAF_DEFAULT;
>>>>> +    write_pte(entry, pte);
>>>>> +}
>>>>> +
>>>>> +static inline void arch_pmap_unmap(unsigned int slot)
>>>>> +{
>>>>> +    pte_t pte = {};
>>>>> +
>>>>> +    write_pte(&xen_fixmap[slot], pte);
>>>>> +}
>>>>
>>>> Why are these not using set_fixmap() / clear_fixmap()
>>>> respectively?
>>> They haven't been introduced yet. And I thought that these fucntion
>>> are
>>> used only in pmap_{un}map() and that is the reason why I decided to
>>> not
>>> introduce them. But while writing the answer on another comment, I
>>> found other places where set_fixmap() / clear_fixmap() are used, so
>>> I
>>> will introduce them and reuse here.
>>
>> I am guessing you are going to implement set_fixmap()/clear_fixmap()
>> using map_pages_to_xen(). If so, for early boot you are going to end
>> up
>> in a circular loop because map_pages_to_xen() will likely use pmap()
>> which will call set_fixmap().
> I am going to implement that in the following way as I faced the
> described by you issue when I first time tried to implement it using
> map_pages_to_xen():
What's wrong with keeping the arch_pmap_*() as-is and calling
map_pages_to_xen() in the fixmap helpers?

At least this would be consistent with what other architectures do and
less risky (see below).

>     /* Map a 4k page in a fixmap entry */
>     void set_fixmap(unsigned map, mfn_t mfn, unsigned int flags)
>     {
>         pte_t pte;
>     
>         pte = mfn_to_xen_entry(mfn, flags);
>         pte.pte |= PTE_LEAF_DEFAULT;
>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);

It would be saner to check if you are not overwriting any existing 
mapping as otherwise you will probably need a TLB flush.

>     }
>     
>     /* Remove a mapping from a fixmap entry */
>     void clear_fixmap(unsigned map)
>     {
>         pte_t pte = {0};
>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);

Don't you need a TLB flush?

Cheers,
Oleksii Kurochko July 23, 2024, 8:02 a.m. UTC | #10
Hi Julien,

On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org> wrote:

> Hi,
>
> >> On 22/07/2024 15:44, Oleksii Kurochko wrote:
> >>> On Mon, 2024-07-22 at 14:54 +0200, Jan Beulich wrote:
> >>>> On 12.07.2024 18:22, Oleksii Kurochko wrote:
> >>>>> --- /dev/null
> >>>>> +++ b/xen/arch/riscv/include/asm/pmap.h
> >>>>> @@ -0,0 +1,28 @@
> >>>>> +#ifndef __ASM_PMAP_H__
> >>>>> +#define __ASM_PMAP_H__
> >>>>> +
> >>>>> +#include <xen/bug.h>
> >>>>> +#include <xen/mm.h>
> >>>>> +
> >>>>> +#include <asm/fixmap.h>
> >>>>> +
> >>>>> +static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
> >>>>> +{
> >>>>> +    pte_t *entry = &xen_fixmap[slot];
> >>>>> +    pte_t pte;
> >>>>> +
> >>>>> +    ASSERT(!pte_is_valid(*entry));
> >>>>> +
> >>>>> +    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
> >>>>> +    pte.pte |= PTE_LEAF_DEFAULT;
> >>>>> +    write_pte(entry, pte);
> >>>>> +}
> >>>>> +
> >>>>> +static inline void arch_pmap_unmap(unsigned int slot)
> >>>>> +{
> >>>>> +    pte_t pte = {};
> >>>>> +
> >>>>> +    write_pte(&xen_fixmap[slot], pte);
> >>>>> +}
> >>>>
> >>>> Why are these not using set_fixmap() / clear_fixmap()
> >>>> respectively?
> >>> They haven't been introduced yet. And I thought that these fucntion
> >>> are
> >>> used only in pmap_{un}map() and that is the reason why I decided to
> >>> not
> >>> introduce them. But while writing the answer on another comment, I
> >>> found other places where set_fixmap() / clear_fixmap() are used, so
> >>> I
> >>> will introduce them and reuse here.
> >>
> >> I am guessing you are going to implement set_fixmap()/clear_fixmap()
> >> using map_pages_to_xen(). If so, for early boot you are going to end
> >> up
> >> in a circular loop because map_pages_to_xen() will likely use pmap()
> >> which will call set_fixmap().
> > I am going to implement that in the following way as I faced the
> > described by you issue when I first time tried to implement it using
> > map_pages_to_xen():
> What's wrong with keeping the arch_pmap_*() as-is and call
> map_pages_to_xen() in the fixmap?
>
> At least this would be consistent with what other architectures does and
> less risky (see below).
>
Then I misunderstood you; if {set/clear}_fixmap() is not to be used in
arch_pmap_*(), then everything should be fine.
In that case I think a comment should also be added above the arch_pmap_*()
functions explaining why {set/clear}_fixmap() isn't used inside
(or the commit message should be updated).


>
> >     /* Map a 4k page in a fixmap entry */
> >     void set_fixmap(unsigned map, mfn_t mfn, unsigned int flags)
> >     {
> >         pte_t pte;
> >
> >         pte = mfn_to_xen_entry(mfn, flags);
> >         pte.pte |= PTE_LEAF_DEFAULT;
> >         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
>
> It would be saner to check if you are not overwriting any existing
> mapping as otherwise you will probably need a TLB flush.
>
> >     }
> >
> >     /* Remove a mapping from a fixmap entry */
> >     void clear_fixmap(unsigned map)
> >     {
> >         pte_t pte = {0};
> >         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
>
> Don't you need a TLB flush?
>
Inside write_pte() there is "sfence.vma".

But probably it would be better to add flush_xen_tlb_range_va_local() or
something similar here, in case someone decides to update write_pte() later.
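
For reference, the write_pte() I am talking about is roughly the following
(a sketch from memory, not a verbatim quote of the header):

   /* Write a pagetable entry; ordering/flush comes from sfence.vma. */
   static inline void write_pte(pte_t *p, pte_t pte)
   {
       *p = pte;
       asm volatile ( "sfence.vma" ::: "memory" );
   }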

~ Oleksii
Jan Beulich July 23, 2024, 8:36 a.m. UTC | #11
On 23.07.2024 10:02, Oleksii Kurochko wrote:
> On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org> wrote:
>>>> On 22/07/2024 15:44, Oleksii Kurochko wrote:
>>>     /* Map a 4k page in a fixmap entry */
>>>     void set_fixmap(unsigned map, mfn_t mfn, unsigned int flags)
>>>     {
>>>         pte_t pte;
>>>
>>>         pte = mfn_to_xen_entry(mfn, flags);
>>>         pte.pte |= PTE_LEAF_DEFAULT;
>>>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
>>
>> It would be saner to check if you are not overwriting any existing
>> mapping as otherwise you will probably need a TLB flush.
>>
>>>     }
>>>
>>>     /* Remove a mapping from a fixmap entry */
>>>     void clear_fixmap(unsigned map)
>>>     {
>>>         pte_t pte = {0};
>>>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
>>
>> Don't you need a TLB flush?
>>
> Inside write_pte() there is "sfence.vma".

That's just a fence though, not a TLB flush.

Jan
Oleksii Kurochko July 23, 2024, 8:55 a.m. UTC | #12
On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
> On 23.07.2024 10:02, Oleksii Kurochko wrote:
> > On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org>
> > wrote:
> > > > > On 22/07/2024 15:44, Oleksii Kurochko wrote:
> > > >     /* Map a 4k page in a fixmap entry */
> > > >     void set_fixmap(unsigned map, mfn_t mfn, unsigned int
> > > > flags)
> > > >     {
> > > >         pte_t pte;
> > > > 
> > > >         pte = mfn_to_xen_entry(mfn, flags);
> > > >         pte.pte |= PTE_LEAF_DEFAULT;
> > > >         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))],
> > > > pte);
> > > 
> > > It would be saner to check if you are not overwriting any
> > > existing
> > > mapping as otherwise you will probably need a TLB flush.
> > > 
> > > >     }
> > > > 
> > > >     /* Remove a mapping from a fixmap entry */
> > > >     void clear_fixmap(unsigned map)
> > > >     {
> > > >         pte_t pte = {0};
> > > >         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))],
> > > > pte);
> > > 
> > > Don't you need a TLB flush?
> > > 
> > Inside write_pte() there is "sfence.vma".
> 
> That's just a fence though, not a TLB flush.
From the privileged doc:
   ```
   SFENCE.VMA is also used to invalidate entries in the
   address-translation cache associated with a hart (see Section 4.3.2). 
   ...
   The SFENCE.VMA is used to flush any local hardware caches related to
   address translation.
   It is specified as a fence rather than a TLB flush to provide cleaner
   semantics with respect to
   which instructions are affected by the flush operation and to support a
   wider variety of dynamic
   caching structures and memory-management schemes. SFENCE.VMA is also
   used by higher
   privilege levels to synchronize page table writes and the address
   translation hardware.
   ...
   ```
I read this as meaning SFENCE.VMA is used not only for ordering of
loads/stores, but also to flush the TLB (the TLB being one kind of
address-translation cache, IIUC).

Also, Linux kernel uses sfence.vma to flush TLB:
https://elixir.bootlin.com/linux/v6.0/source/arch/riscv/include/asm/tlbflush.h#L23
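
In other words, a local, single-address flush is just an sfence.vma with
the virtual address as operand, something like (illustrative sketch, the
name is not final):

   static inline void flush_tlb_page_local(unsigned long va)
   {
       /* Flush the local hart's address-translation cache for one page. */
       asm volatile ( "sfence.vma %0" :: "r" (va) : "memory" );
   }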

~ Oleksii
Jan Beulich July 23, 2024, 10:02 a.m. UTC | #13
On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
> On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
>> On 23.07.2024 10:02, Oleksii Kurochko wrote:
>>> On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org>
>>> wrote:
>>>>>> On 22/07/2024 15:44, Oleksii Kurochko wrote:
>>>>>     /* Map a 4k page in a fixmap entry */
>>>>>     void set_fixmap(unsigned map, mfn_t mfn, unsigned int
>>>>> flags)
>>>>>     {
>>>>>         pte_t pte;
>>>>>
>>>>>         pte = mfn_to_xen_entry(mfn, flags);
>>>>>         pte.pte |= PTE_LEAF_DEFAULT;
>>>>>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))],
>>>>> pte);
>>>>
>>>> It would be saner to check if you are not overwriting any
>>>> existing
>>>> mapping as otherwise you will probably need a TLB flush.
>>>>
>>>>>     }
>>>>>
>>>>>     /* Remove a mapping from a fixmap entry */
>>>>>     void clear_fixmap(unsigned map)
>>>>>     {
>>>>>         pte_t pte = {0};
>>>>>         write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))],
>>>>> pte);
>>>>
>>>> Don't you need a TLB flush?
>>>>
>>> Inside write_pte() there is "sfence.vma".
>>
>> That's just a fence though, not a TLB flush.
> From the privileged doc:
>    ```
>    SFENCE.VMA is also used to invalidate entries in the
>    address-translation cache associated with a hart (see Section 4.3.2). 
>    ...
>    The SFENCE.VMA is used to flush any local hardware caches related to
>    address translation.
>    It is specified as a fence rather than a TLB flush to provide cleaner
>    semantics with respect to
>    which instructions are affected by the flush operation and to support a
>    wider variety of dynamic
>    caching structures and memory-management schemes. SFENCE.VMA is also
>    used by higher
>    privilege levels to synchronize page table writes and the address
>    translation hardware.
>    ...
>    ```
> I read this as SFENCE.VMA is used not only for ordering of load/stores,
> but also to flush TLB ( which is a type of more general term as
> address-translation cache, IIUIC ).

Oh, I see. Kind of unexpected for an instruction of that name. Yet note
how they talk about the local hart only. You need a wider scope TLB
flush here.

Jan
Oleksii Kurochko July 23, 2024, 3:36 p.m. UTC | #14
On Tue, 2024-07-23 at 12:02 +0200, Jan Beulich wrote:
> On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
> > On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
> > > On 23.07.2024 10:02, Oleksii Kurochko wrote:
> > > > On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org>
> > > > wrote:
> > > > > > > On 22/07/2024 15:44, Oleksii Kurochko wrote:
> > > > > >     /* Map a 4k page in a fixmap entry */
> > > > > >     void set_fixmap(unsigned map, mfn_t mfn, unsigned int
> > > > > > flags)
> > > > > >     {
> > > > > >         pte_t pte;
> > > > > > 
> > > > > >         pte = mfn_to_xen_entry(mfn, flags);
> > > > > >         pte.pte |= PTE_LEAF_DEFAULT;
> > > > > >         write_pte(&xen_fixmap[pt_index(0,
> > > > > > FIXMAP_ADDR(map))],
> > > > > > pte);
> > > > > 
> > > > > It would be saner to check if you are not overwriting any
> > > > > existing
> > > > > mapping as otherwise you will probably need a TLB flush.
> > > > > 
> > > > > >     }
> > > > > > 
> > > > > >     /* Remove a mapping from a fixmap entry */
> > > > > >     void clear_fixmap(unsigned map)
> > > > > >     {
> > > > > >         pte_t pte = {0};
> > > > > >         write_pte(&xen_fixmap[pt_index(0,
> > > > > > FIXMAP_ADDR(map))],
> > > > > > pte);
> > > > > 
> > > > > Don't you need a TLB flush?
> > > > > 
> > > > Inside write_pte() there is "sfence.vma".
> > > 
> > > That's just a fence though, not a TLB flush.
> > From the privileged doc:
> >    ```
> >    SFENCE.VMA is also used to invalidate entries in the
> >    address-translation cache associated with a hart (see Section
> > 4.3.2). 
> >    ...
> >    The SFENCE.VMA is used to flush any local hardware caches
> > related to
> >    address translation.
> >    It is specified as a fence rather than a TLB flush to provide
> > cleaner
> >    semantics with respect to
> >    which instructions are affected by the flush operation and to
> > support a
> >    wider variety of dynamic
> >    caching structures and memory-management schemes. SFENCE.VMA is
> > also
> >    used by higher
> >    privilege levels to synchronize page table writes and the
> > address
> >    translation hardware.
> >    ...
> >    ```
> > I read this as SFENCE.VMA is used not only for ordering of
> > load/stores,
> > but also to flush TLB ( which is a type of more general term as
> > address-translation cache, IIUIC ).
> 
> Oh, I see. Kind of unexpected for an instruction of that name. Yet
> note
> how they talk about the local hart only. You need a wider scope TLB
> flush here.
Could you please clarify why a wider flush is needed?

Arm Xen flushes only the local TLB.
The RISC-V Linux kernel also uses local_flush_tlb_page() for the fixmap.

~ Oleksii
Julien Grall July 23, 2024, 3:49 p.m. UTC | #15
Hi Oleksii,

On 23/07/2024 16:36, oleksii.kurochko@gmail.com wrote:
> On Tue, 2024-07-23 at 12:02 +0200, Jan Beulich wrote:
>> On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
>>> On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
>>>> On 23.07.2024 10:02, Oleksii Kurochko wrote:
>>>>> On Mon, Jul 22, 2024 at 7:27 PM Julien Grall <julien@xen.org>
>>>>> wrote:
>>>>>>>> On 22/07/2024 15:44, Oleksii Kurochko wrote:
>>>>>>>      /* Map a 4k page in a fixmap entry */
>>>>>>>      void set_fixmap(unsigned map, mfn_t mfn, unsigned int
>>>>>>> flags)
>>>>>>>      {
>>>>>>>          pte_t pte;
>>>>>>>
>>>>>>>          pte = mfn_to_xen_entry(mfn, flags);
>>>>>>>          pte.pte |= PTE_LEAF_DEFAULT;
>>>>>>>          write_pte(&xen_fixmap[pt_index(0,
>>>>>>> FIXMAP_ADDR(map))],
>>>>>>> pte);
>>>>>>
>>>>>> It would be saner to check if you are not overwriting any
>>>>>> existing
>>>>>> mapping as otherwise you will probably need a TLB flush.
>>>>>>
>>>>>>>      }
>>>>>>>
>>>>>>>      /* Remove a mapping from a fixmap entry */
>>>>>>>      void clear_fixmap(unsigned map)
>>>>>>>      {
>>>>>>>          pte_t pte = {0};
>>>>>>>          write_pte(&xen_fixmap[pt_index(0,
>>>>>>> FIXMAP_ADDR(map))],
>>>>>>> pte);
>>>>>>
>>>>>> Don't you need a TLB flush?
>>>>>>
>>>>> Inside write_pte() there is "sfence.vma".
>>>>
>>>> That's just a fence though, not a TLB flush.
>>>  From the privileged doc:
>>>     ```
>>>     SFENCE.VMA is also used to invalidate entries in the
>>>     address-translation cache associated with a hart (see Section
>>> 4.3.2).
>>>     ...
>>>     The SFENCE.VMA is used to flush any local hardware caches
>>> related to
>>>     address translation.
>>>     It is specified as a fence rather than a TLB flush to provide
>>> cleaner
>>>     semantics with respect to
>>>     which instructions are affected by the flush operation and to
>>> support a
>>>     wider variety of dynamic
>>>     caching structures and memory-management schemes. SFENCE.VMA is
>>> also
>>>     used by higher
>>>     privilege levels to synchronize page table writes and the
>>> address
>>>     translation hardware.
>>>     ...
>>>     ```
>>> I read this as SFENCE.VMA is used not only for ordering of
>>> load/stores,
>>> but also to flush TLB ( which is a type of more general term as
>>> address-translation cache, IIUIC ).
I have to admit, I am a little bit concerned about calling sfence.vma
in write_pte() (this may only be because I am not very familiar with
RISC-V).

We have cases where multiple entries will be written in a single
map_pages_to_xen() call. So wouldn't this mean that the local TLBs
would be nuked for every write rather than once?


>>
>> Oh, I see. Kind of unexpected for an instruction of that name. Yet
>> note
>> how they talk about the local hart only. You need a wider scope TLB
>> flush here.
> Could you please clarify why it is needed wider?
> 
> Arm Xen flushed only local TLB.

Which code are you looking at? set_fixmap() will propagate the TLB flush
to all inner-shareable CPUs.

The PMAP interface will do a local TLB flush because the interface can
only be used during early boot, when there is a single CPU running.

> RISC-V Linux kernel for fixmap also uses: local_flush_tlb_page().

I don't know how Linux is using set_fixmap(). But what matters is how 
Xen is using set_fixmap(). We have a couple of places in Xen where the 
fixmap needs to be accessed by all the CPUs.

Given this is a common interface in Xen, I think it makes sense to 
follow the same approach to avoid any confusion.

Cheers,
Oleksii Kurochko July 23, 2024, 5:25 p.m. UTC | #16
On Tue, 2024-07-23 at 16:49 +0100, Julien Grall wrote:
> Hi Oleksii,
> 
> On 23/07/2024 16:36, oleksii.kurochko@gmail.com wrote:
> > On Tue, 2024-07-23 at 12:02 +0200, Jan Beulich wrote:
> > > On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
> > > > On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
> > > > > On 23.07.2024 10:02, Oleksii Kurochko wrote:
> > > > > > On Mon, Jul 22, 2024 at 7:27 PM Julien Grall
> > > > > > <julien@xen.org>
> > > > > > wrote:
> > > > > > > > > On 22/07/2024 15:44, Oleksii Kurochko wrote:
> > > > > > > >      /* Map a 4k page in a fixmap entry */
> > > > > > > >      void set_fixmap(unsigned map, mfn_t mfn, unsigned
> > > > > > > > int
> > > > > > > > flags)
> > > > > > > >      {
> > > > > > > >          pte_t pte;
> > > > > > > > 
> > > > > > > >          pte = mfn_to_xen_entry(mfn, flags);
> > > > > > > >          pte.pte |= PTE_LEAF_DEFAULT;
> > > > > > > >          write_pte(&xen_fixmap[pt_index(0,
> > > > > > > > FIXMAP_ADDR(map))],
> > > > > > > > pte);
> > > > > > > 
> > > > > > > It would be saner to check if you are not overwriting any
> > > > > > > existing
> > > > > > > mapping as otherwise you will probably need a TLB flush.
> > > > > > > 
> > > > > > > >      }
> > > > > > > > 
> > > > > > > >      /* Remove a mapping from a fixmap entry */
> > > > > > > >      void clear_fixmap(unsigned map)
> > > > > > > >      {
> > > > > > > >          pte_t pte = {0};
> > > > > > > >          write_pte(&xen_fixmap[pt_index(0,
> > > > > > > > FIXMAP_ADDR(map))],
> > > > > > > > pte);
> > > > > > > 
> > > > > > > Don't you need a TLB flush?
> > > > > > > 
> > > > > > Inside write_pte() there is "sfence.vma".
> > > > > 
> > > > > That's just a fence though, not a TLB flush.
> > > >  From the privileged doc:
> > > >     ```
> > > >     SFENCE.VMA is also used to invalidate entries in the
> > > >     address-translation cache associated with a hart (see
> > > > Section
> > > > 4.3.2).
> > > >     ...
> > > >     The SFENCE.VMA is used to flush any local hardware caches
> > > > related to
> > > >     address translation.
> > > >     It is specified as a fence rather than a TLB flush to
> > > > provide
> > > > cleaner
> > > >     semantics with respect to
> > > >     which instructions are affected by the flush operation and
> > > > to
> > > > support a
> > > >     wider variety of dynamic
> > > >     caching structures and memory-management schemes.
> > > > SFENCE.VMA is
> > > > also
> > > >     used by higher
> > > >     privilege levels to synchronize page table writes and the
> > > > address
> > > >     translation hardware.
> > > >     ...
> > > >     ```
> > > > I read this as SFENCE.VMA is used not only for ordering of
> > > > load/stores,
> > > > but also to flush TLB ( which is a type of more general term as
> > > > address-translation cache, IIUIC ).
> I have to admit, I am a little because concerned with calling
> sfence.vma 
> in write_pte() (this may only be because I am not very familiar with 
> RISC-V).
> 
> We have cases where multiple entry will be written in a single 
> map_pages_to_xen() call. So wouldn't this means that the local TLBs 
> would be nuked for every write rather than once?
Yes, it will be nuked. It is bad from a performance point of view.
I just wanted to be sure that I wouldn't miss putting sfence.vma where it
is necessary, and planned to rework that a little bit afterwards. But it
seems it would be better not to call sfence.vma in write_pte() from the
start.


> 
> 
> > > 
> > > Oh, I see. Kind of unexpected for an instruction of that name.
> > > Yet
> > > note
> > > how they talk about the local hart only. You need a wider scope
> > > TLB
> > > flush here.
> > Could you please clarify why it is needed wider?
> > 
> > Arm Xen flushed only local TLB.
> 
> Which code are you looking at? set_fixmap() will propagate the TLB
> flush 
> to all innershareable CPUs.
Yes, here I agree that set_fixmap() uses map_pages_to_xen(), which
somewhere inside uses flush_xen_tlb_range_va() (not
flush_xen_tlb_range_va_local()), so the TLB flush will happen for all
inner-shareable CPUs.
> 
> The PMAP interface will do a local TLB flush because the interface
> can 
> only be used during early boot where there is a single CPU running.

Yes, I am looking at PMAP:
   static inline void arch_pmap_unmap(unsigned int slot)
   {
       lpae_t pte = {};
   
       write_pte(&xen_fixmap[slot], pte);
   
       flush_xen_tlb_range_va(FIXMAP_ADDR(slot), PAGE_SIZE);
   }
IIUC, originally Jan was talking about the arch_pmap_unmap() case, so that
is why I decided to clarify additionally.

> 
> > RISC-V Linux kernel for fixmap also uses: local_flush_tlb_page().
> 
> I don't know how Linux is using set_fixmap(). But what matters is how
> Xen is using set_fixmap(). We have a couple of places in Xen where
> the 
> fixmap needs to be accessed by all the CPUs.
> 
> Given this is a common interface in Xen, I think it makes sense to 
> follow the same approach to avoid any confusion.
Sure. Thanks for clarifying. I will use flush_xen_tlb_range_va() for
set_fixmap().
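
So the reworked helpers would look roughly like this (a sketch, assuming a
RISC-V counterpart of Arm's flush_xen_tlb_range_va() gets introduced):

   /* Map a 4k page in a fixmap entry */
   void set_fixmap(unsigned int map, mfn_t mfn, unsigned int flags)
   {
       pte_t pte = mfn_to_xen_entry(mfn, flags);

       pte.pte |= PTE_LEAF_DEFAULT;
       write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);

       /* Propagate the new mapping to all CPUs, not only the local hart. */
       flush_xen_tlb_range_va(FIXMAP_ADDR(map), PAGE_SIZE);
   }

   /* Remove a mapping from a fixmap entry */
   void clear_fixmap(unsigned int map)
   {
       pte_t pte = {};

       write_pte(&xen_fixmap[pt_index(0, FIXMAP_ADDR(map))], pte);
       flush_xen_tlb_range_va(FIXMAP_ADDR(map), PAGE_SIZE);
   }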


~ Oleksii
Oleksii Kurochko July 23, 2024, 5:28 p.m. UTC | #17
On Tue, 2024-07-23 at 19:25 +0200, oleksii.kurochko@gmail.com wrote:
> On Tue, 2024-07-23 at 16:49 +0100, Julien Grall wrote:
> > Hi Oleksii,
> > 
> > On 23/07/2024 16:36, oleksii.kurochko@gmail.com wrote:
> > > On Tue, 2024-07-23 at 12:02 +0200, Jan Beulich wrote:
> > > > On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
> > > > > On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
> > > > > > On 23.07.2024 10:02, Oleksii Kurochko wrote:
> > > > > > > On Mon, Jul 22, 2024 at 7:27 PM Julien Grall
> > > > > > > <julien@xen.org>
> > > > > > > wrote:
> > > > > > > > > > On 22/07/2024 15:44, Oleksii Kurochko wrote:
> > > > > > > > >      /* Map a 4k page in a fixmap entry */
> > > > > > > > >      void set_fixmap(unsigned map, mfn_t mfn,
> > > > > > > > > unsigned
> > > > > > > > > int
> > > > > > > > > flags)
> > > > > > > > >      {
> > > > > > > > >          pte_t pte;
> > > > > > > > > 
> > > > > > > > >          pte = mfn_to_xen_entry(mfn, flags);
> > > > > > > > >          pte.pte |= PTE_LEAF_DEFAULT;
> > > > > > > > >          write_pte(&xen_fixmap[pt_index(0,
> > > > > > > > > FIXMAP_ADDR(map))],
> > > > > > > > > pte);
> > > > > > > > 
> > > > > > > > It would be saner to check if you are not overwriting
> > > > > > > > any
> > > > > > > > existing
> > > > > > > > mapping as otherwise you will probably need a TLB
> > > > > > > > flush.
> > > > > > > > 
> > > > > > > > >      }
> > > > > > > > > 
> > > > > > > > >      /* Remove a mapping from a fixmap entry */
> > > > > > > > >      void clear_fixmap(unsigned map)
> > > > > > > > >      {
> > > > > > > > >          pte_t pte = {0};
> > > > > > > > >          write_pte(&xen_fixmap[pt_index(0,
> > > > > > > > > FIXMAP_ADDR(map))],
> > > > > > > > > pte);
> > > > > > > > 
> > > > > > > > Don't you need a TLB flush?
> > > > > > > > 
> > > > > > > Inside write_pte() there is "sfence.vma".
> > > > > > 
> > > > > > That's just a fence though, not a TLB flush.
> > > > >  From the privileged doc:
> > > > >     ```
> > > > >     SFENCE.VMA is also used to invalidate entries in the
> > > > >     address-translation cache associated with a hart (see
> > > > > Section
> > > > > 4.3.2).
> > > > >     ...
> > > > >     The SFENCE.VMA is used to flush any local hardware caches
> > > > > related to
> > > > >     address translation.
> > > > >     It is specified as a fence rather than a TLB flush to
> > > > > provide
> > > > > cleaner
> > > > >     semantics with respect to
> > > > >     which instructions are affected by the flush operation
> > > > > and
> > > > > to
> > > > > support a
> > > > >     wider variety of dynamic
> > > > >     caching structures and memory-management schemes.
> > > > > SFENCE.VMA is
> > > > > also
> > > > >     used by higher
> > > > >     privilege levels to synchronize page table writes and the
> > > > > address
> > > > >     translation hardware.
> > > > >     ...
> > > > >     ```
> > > > > I read this as SFENCE.VMA is used not only for ordering of
> > > > > load/stores,
> > > > > but also to flush TLB ( which is a type of more general term
> > > > > as
> > > > > address-translation cache, IIUIC ).
> > I have to admit, I am a little because concerned with calling
> > sfence.vma 
> > in write_pte() (this may only be because I am not very familiar
> > with 
> > RISC-V).
> > 
> > We have cases where multiple entry will be written in a single 
> > map_pages_to_xen() call. So wouldn't this means that the local TLBs
> > would be nuked for every write rather than once?
> Yes, it will be nuked. It is bad from perfomance point of view.
> I just wanted to be sure that I won't miss to put sfence.vma when it
> is
> necessary and then reworked that a little bit after. But it seems it
> would be better not to call sfence.vma in write_pte() just from the
> start.
> 
> 
> > 
> > 
> > > > 
> > > > Oh, I see. Kind of unexpected for an instruction of that name.
> > > > Yet
> > > > note
> > > > how they talk about the local hart only. You need a wider scope
> > > > TLB
> > > > flush here.
> > > Could you please clarify why it is needed wider?
> > > 
> > > Arm Xen flushed only local TLB.
> > 
> > Which code are you looking at? set_fixmap() will propagate the TLB
> > flush 
> > to all innershareable CPUs.
> Yes, here I agree that set_fixmap() uses map_pages_to_xen which
> somewhere inside uses flush_xen_tlb_range_va() ( not
> flush_xen_tlb_range_va() ) so TLB flush will happen for all
> innershareable CPUs.
> > 
> > The PMAP interface will do a local TLB flush because the interface
> > can 
> > only be used during early boot where there is a single CPU running.
> 
> Yes, I am looking at PMAP:
>    static inline void arch_pmap_unmap(unsigned int slot)
>    {
>        lpae_t pte = {};
>    
>        write_pte(&xen_fixmap[slot], pte);
>    
>        flush_xen_tlb_range_va(FIXMAP_ADDR(slot), PAGE_SIZE);
>    }
> IIUC, originaly Jan told about arch_pmap_unmap() case so that is why
> I
> decided to clarify additionaly.

Julien,
I have a question related to Arm's version of arch_pmap_map():

   static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
   {
       lpae_t *entry = &xen_fixmap[slot];
       lpae_t pte;
   
       ASSERT(!lpae_is_valid(*entry));
   
       pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
       pte.pt.table = 1;
       write_pte(entry, pte);
       /*
        * The new entry will be used very soon after arch_pmap_map() returns.
        * So ensure the DSB in write_pte() has completed before continuing.
        */
       isb();
   }
   
Is the code above isb() correct? Is it to ensure the DSB, not the ISB?
And isn't a TLB flush needed here?

~ Oleksii


> 
> > 
> > > RISC-V Linux kernel for fixmap also uses: local_flush_tlb_page().
> > 
> > I don't know how Linux is using set_fixmap(). But what matters is
> > how
> > Xen is using set_fixmap(). We have a couple of places in Xen where
> > the 
> > fixmap needs to be accessed by all the CPUs.
> > 
> > Given this is a common interface in Xen, I think it makes sense to 
> > follow the same approach to avoid any confusion.
> Sure. Thanks for claryfying. I will flush_xen_tlb_range_va() for
> set_fixmap().
> 
> 
> ~ Oleksii
Julien Grall July 23, 2024, 6:44 p.m. UTC | #18
On 23/07/2024 18:28, oleksii.kurochko@gmail.com wrote:
> On Tue, 2024-07-23 at 19:25 +0200, oleksii.kurochko@gmail.com wrote:
>> On Tue, 2024-07-23 at 16:49 +0100, Julien Grall wrote:
>>> Hi Oleksii,
>>>
>>> On 23/07/2024 16:36, oleksii.kurochko@gmail.com wrote:
>>>> On Tue, 2024-07-23 at 12:02 +0200, Jan Beulich wrote:
>>>>> On 23.07.2024 10:55, oleksii.kurochko@gmail.com wrote:
>>>>>> On Tue, 2024-07-23 at 10:36 +0200, Jan Beulich wrote:
>>>>>>> On 23.07.2024 10:02, Oleksii Kurochko wrote:
>>>>>>>> On Mon, Jul 22, 2024 at 7:27 PM Julien Grall
>>>>>>>> <julien@xen.org>
>>>>>>>> wrote:
>>>>>>>>>>> On 22/07/2024 15:44, Oleksii Kurochko wrote:
>>>>>>>>>>       /* Map a 4k page in a fixmap entry */
>>>>>>>>>>       void set_fixmap(unsigned map, mfn_t mfn,
>>>>>>>>>> unsigned
>>>>>>>>>> int
>>>>>>>>>> flags)
>>>>>>>>>>       {
>>>>>>>>>>           pte_t pte;
>>>>>>>>>>
>>>>>>>>>>           pte = mfn_to_xen_entry(mfn, flags);
>>>>>>>>>>           pte.pte |= PTE_LEAF_DEFAULT;
>>>>>>>>>>           write_pte(&xen_fixmap[pt_index(0,
>>>>>>>>>> FIXMAP_ADDR(map))],
>>>>>>>>>> pte);
>>>>>>>>>
>>>>>>>>> It would be saner to check if you are not overwriting
>>>>>>>>> any
>>>>>>>>> existing
>>>>>>>>> mapping as otherwise you will probably need a TLB
>>>>>>>>> flush.
>>>>>>>>>
>>>>>>>>>>       }
>>>>>>>>>>
>>>>>>>>>>       /* Remove a mapping from a fixmap entry */
>>>>>>>>>>       void clear_fixmap(unsigned map)
>>>>>>>>>>       {
>>>>>>>>>>           pte_t pte = {0};
>>>>>>>>>>           write_pte(&xen_fixmap[pt_index(0,
>>>>>>>>>> FIXMAP_ADDR(map))],
>>>>>>>>>> pte);
>>>>>>>>>
>>>>>>>>> Don't you need a TLB flush?
>>>>>>>>>
>>>>>>>> Inside write_pte() there is "sfence.vma".
>>>>>>>
>>>>>>> That's just a fence though, not a TLB flush.
>>>>>>   From the privileged doc:
>>>>>>      ```
>>>>>>      SFENCE.VMA is also used to invalidate entries in the
>>>>>>      address-translation cache associated with a hart (see
>>>>>> Section
>>>>>> 4.3.2).
>>>>>>      ...
>>>>>>      The SFENCE.VMA is used to flush any local hardware caches
>>>>>> related to
>>>>>>      address translation.
>>>>>>      It is specified as a fence rather than a TLB flush to
>>>>>> provide
>>>>>> cleaner
>>>>>>      semantics with respect to
>>>>>>      which instructions are affected by the flush operation
>>>>>> and
>>>>>> to
>>>>>> support a
>>>>>>      wider variety of dynamic
>>>>>>      caching structures and memory-management schemes.
>>>>>> SFENCE.VMA is
>>>>>> also
>>>>>>      used by higher
>>>>>>      privilege levels to synchronize page table writes and the
>>>>>> address
>>>>>>      translation hardware.
>>>>>>      ...
>>>>>>      ```
>>>>>> I read this as SFENCE.VMA is used not only for ordering of
>>>>>> load/stores,
>>>>>> but also to flush TLB ( which is a type of more general term
>>>>>> as
>>>>>> address-translation cache, IIUIC ).
>>> I have to admit, I am a little because concerned with calling
>>> sfence.vma
>>> in write_pte() (this may only be because I am not very familiar
>>> with
>>> RISC-V).
>>>
>>> We have cases where multiple entry will be written in a single
>>> map_pages_to_xen() call. So wouldn't this means that the local TLBs
>>> would be nuked for every write rather than once?
>> Yes, it will be nuked. It is bad from perfomance point of view.
>> I just wanted to be sure that I won't miss to put sfence.vma when it
>> is
>> necessary and then reworked that a little bit after. But it seems it
>> would be better not to call sfence.vma in write_pte() just from the
>> start.
>>
>>
>>>
>>>
>>>>>
>>>>> Oh, I see. Kind of unexpected for an instruction of that name.
>>>>> Yet
>>>>> note
>>>>> how they talk about the local hart only. You need a wider scope
>>>>> TLB
>>>>> flush here.
>>>> Could you please clarify why it is needed wider?
>>>>
>>>> Arm Xen flushed only local TLB.
>>>
>>> Which code are you looking at? set_fixmap() will propagate the TLB
>>> flush
>>> to all innershareable CPUs.
>> Yes, here I agree that set_fixmap() uses map_pages_to_xen which
>> somewhere inside uses flush_xen_tlb_range_va() ( not
>> flush_xen_tlb_range_va() ) so TLB flush will happen for all
>> innershareable CPUs.
>>>
>>> The PMAP interface will do a local TLB flush because the interface
>>> can
>>> only be used during early boot where there is a single CPU running.
>>
>> Yes, I am looking at PMAP:
>>     static inline void arch_pmap_unmap(unsigned int slot)
>>     {
>>         lpae_t pte = {};
>>     
>>         write_pte(&xen_fixmap[slot], pte);
>>     
>>         flush_xen_tlb_range_va(FIXMAP_ADDR(slot), PAGE_SIZE);
>>     }
>> IIUC, originaly Jan told about arch_pmap_unmap() case so that is why
>> I
>> decided to clarify additionaly.
> 
> Julien,
> I have a questation related to Arm's version of arch_pmap_map():
> 
>     static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
>     {
>         lpae_t *entry = &xen_fixmap[slot];
>         lpae_t pte;
>     
>         ASSERT(!lpae_is_valid(*entry));
>     
>         pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
>         pte.pt.table = 1;
>         write_pte(entry, pte);
>         /*
>          * The new entry will be used very soon after arch_pmap_map()
>     returns.
>          * So ensure the DSB in write_pte() has completed before
>     continuing.
>          */
>         isb();
>     }
>     
> Is the code above isb() is correct? is it insure the DSB not ISB?

I guess you mean the comment. If so, yes it is correct. write_pte() has a
dsb() just after writing the PTE. This is to guarantee that implicit memory
accesses (instruction fetches or hardware translation table walks) will
only happen *after* the dsb() has completed.

I am not 100% sure whether the isb() is required for arch_pmap_map(). 
But I took the safest approach when implementing it.

> And isn't need to do TLB flush here?

It is not. On Arm, we have a strict policy that every unmap will be
followed by a TLB flush and you can't modify an existing entry (see the
ASSERT a few lines above). So the TLBs will not contain an entry for the
mapping.

This policy was mainly dictated by the Arm Arm: when modifying the
output address you need to follow the break-before-make sequence,
which means you have to transition to an invalid mapping and flush the
TLBs before the new entry is added. For simplicity, we decided to just
not bother with trying to implement break-before-make for the hypervisor
page tables. But we do for the P2M.

Note that newer revisions of the Armv8 spec relaxed the requirement (see
FEAT_BBM).
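
For completeness, the break-before-make sequence referred to above is, in
outline (sketch only, using generic names rather than the actual Xen code):

     pte_t empty = {};

     /* 1. Break: replace the live entry with an invalid one. */
     write_pte(entry, empty);

     /* 2. Flush the TLBs so no stale translation can be used. */
     flush_xen_tlb_range_va(va, PAGE_SIZE);

     /* 3. Make: install the new entry with the new output address. */
     write_pte(entry, new_pte);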

Cheers,

Patch

diff --git a/xen/arch/riscv/Kconfig b/xen/arch/riscv/Kconfig
index 259eea8d3b..0112aa8778 100644
--- a/xen/arch/riscv/Kconfig
+++ b/xen/arch/riscv/Kconfig
@@ -3,6 +3,7 @@  config RISCV
 	select FUNCTION_ALIGNMENT_16B
 	select GENERIC_BUG_FRAME
 	select HAS_DEVICE_TREE
+	select HAS_PMAP
 
 config RISCV_64
 	def_bool y
diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
index cbbf3656d1..339074d502 100644
--- a/xen/arch/riscv/include/asm/page.h
+++ b/xen/arch/riscv/include/asm/page.h
@@ -51,6 +51,8 @@  typedef struct {
 #endif
 } pte_t;
 
+pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr);
+
 static inline pte_t paddr_to_pte(paddr_t paddr,
                                  unsigned int permissions)
 {
diff --git a/xen/arch/riscv/include/asm/pmap.h b/xen/arch/riscv/include/asm/pmap.h
new file mode 100644
index 0000000000..eb4c48515c
--- /dev/null
+++ b/xen/arch/riscv/include/asm/pmap.h
@@ -0,0 +1,28 @@ 
+#ifndef __ASM_PMAP_H__
+#define __ASM_PMAP_H__
+
+#include <xen/bug.h>
+#include <xen/mm.h>
+
+#include <asm/fixmap.h>
+
+static inline void arch_pmap_map(unsigned int slot, mfn_t mfn)
+{
+    pte_t *entry = &xen_fixmap[slot];
+    pte_t pte;
+
+    ASSERT(!pte_is_valid(*entry));
+
+    pte = mfn_to_xen_entry(mfn, PAGE_HYPERVISOR_RW);
+    pte.pte |= PTE_LEAF_DEFAULT;
+    write_pte(entry, pte);
+}
+
+static inline void arch_pmap_unmap(unsigned int slot)
+{
+    pte_t pte = {};
+
+    write_pte(&xen_fixmap[slot], pte);
+}
+
+#endif /* __ASM_PMAP_H__ */
diff --git a/xen/arch/riscv/mm.c b/xen/arch/riscv/mm.c
index d69a174b5d..445319af08 100644
--- a/xen/arch/riscv/mm.c
+++ b/xen/arch/riscv/mm.c
@@ -370,3 +370,17 @@  int map_pages_to_xen(unsigned long virt,
     BUG_ON("unimplemented");
     return -1;
 }
+
+static inline pte_t mfn_to_pte(mfn_t mfn)
+{
+    unsigned long pte = mfn_x(mfn) << PTE_PPN_SHIFT;
+    return (pte_t){ .pte = pte};
+}
+
+inline pte_t mfn_to_xen_entry(mfn_t mfn, unsigned int attr)
+{
+    /* there is no attr field in RISC-V's pte */
+    (void) attr;
+
+    return mfn_to_pte(mfn);
+}