diff mbox series

[RESEND,v2] x86: introduce ioremap_wc()

Message ID c56aadc8-e568-4625-5d6f-bd2b8981f3ae@suse.com (mailing list archive)
State New, archived
Headers show
Series [RESEND,v2] x86: introduce ioremap_wc() | expand

Commit Message

Jan Beulich Feb. 17, 2022, 11:01 a.m. UTC
In order for a to-be-introduced ERMS form of memcpy() to not regress
boot performance on certain systems when video output is active, we
first need to stop depending on firmware having set up the MTRRs in a
way that we can actually modify further. On many
systems, due to the continuously growing amounts of installed memory,
MTRRs get configured with at least one huge WB range, and with MMIO
ranges below 4GB then forced to UC via overlapping MTRRs. mtrr_add(), as
it is today, can't deal with such a setup. Hence on such systems we
presently leave the frame buffer mapped UC, leading to significantly
reduced performance when using REP STOSB / REP MOVSB.

On post-PentiumII hardware (i.e. any that's capable of running 64-bit
code), an effective memory type of WC can be achieved without MTRRs, by
simply referencing the respective PAT entry from the PTEs. While this
will leave the switch to ERMS forms of memset() and memcpy() with
largely unchanged performance, the change here on its own improves
performance on affected systems quite significantly: Measuring just the
individual affected memcpy() invocations yielded a speedup by a factor
of over 250 on my initial (Skylake) test system. memset() isn't getting
improved by as much there, but still by a factor of about 20.

While adding {__,}PAGE_HYPERVISOR_WC, also add {__,}PAGE_HYPERVISOR_WT
to, at the very least, make clear what PTE flags this memory type uses.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
REPOST (in isolation) upon Roger's request. The header location change I
don't really consider a "re-base".

v2: Mark ioremap_wc() __init.
---
TBD: If the VGA range is WC in the fixed-range MTRRs, reusing the low
     1st MB mapping (like ioremap() does) would be an option.

Comments

Roger Pau Monné Feb. 17, 2022, 2:47 p.m. UTC | #1
On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
> In order for a to-be-introduced ERMS form of memcpy() to not regress
> boot performance on certain systems when video output is active, we
> first need to arrange for avoiding further dependency on firmware
> setting up MTRRs in a way we can actually further modify. On many
> systems, due to the continuously growing amounts of installed memory,
> MTRRs get configured with at least one huge WB range, and with MMIO
> ranges below 4Gb then forced to UC via overlapping MTRRs. mtrr_add(), as
> it is today, can't deal with such a setup. Hence on such systems we
> presently leave the frame buffer mapped UC, leading to significantly
> reduced performance when using REP STOSB / REP MOVSB.
> 
> On post-PentiumII hardware (i.e. any that's capable of running 64-bit
> code), an effective memory type of WC can be achieved without MTRRs, by
> simply referencing the respective PAT entry from the PTEs. While this
> will leave the switch to ERMS forms of memset() and memcpy() with
> largely unchanged performance, the change here on its own improves
> performance on affected systems quite significantly: Measuring just the
> individual affected memcpy() invocations yielded a speedup by a factor
> of over 250 on my initial (Skylake) test system. memset() isn't getting
> improved by as much there, but still by a factor of about 20.
> 
> While adding {__,}PAGE_HYPERVISOR_WC, also add {__,}PAGE_HYPERVISOR_WT
> to, at the very least, make clear what PTE flags this memory type uses.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> REPOST (in isolation) upon Roger's request. The header location change I
> don't really consider a "re-base".
> 
> v2: Mark ioremap_wc() __init.
> ---
> TBD: If the VGA range is WC in the fixed range MTRRs, reusing the low
>      1st Mb mapping (like ioremap() does) would be an option.
> 
> --- a/xen/arch/x86/include/asm/mm.h
> +++ b/xen/arch/x86/include/asm/mm.h
> @@ -602,6 +602,8 @@ void destroy_perdomain_mapping(struct do
>                                 unsigned int nr);
>  void free_perdomain_mappings(struct domain *);
>  
> +void __iomem *ioremap_wc(paddr_t, size_t);
> +
>  extern int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm);
>  
>  void domain_set_alloc_bitsize(struct domain *d);
> --- a/xen/arch/x86/include/asm/page.h
> +++ b/xen/arch/x86/include/asm/page.h
> @@ -349,8 +349,10 @@ void efi_update_l4_pgtable(unsigned int
>  #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
>  #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
>                                     _PAGE_DIRTY | _PAGE_RW)
> +#define __PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR | _PAGE_PWT)
>  #define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
>  #define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
> +#define __PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR | _PAGE_PAT)
>  #define __PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
>  
>  #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
> --- a/xen/arch/x86/include/asm/x86_64/page.h
> +++ b/xen/arch/x86/include/asm/x86_64/page.h
> @@ -152,6 +152,10 @@ static inline intpte_t put_pte_flags(uns
>                                   _PAGE_GLOBAL | _PAGE_NX)
>  #define PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR_UC | \
>                                   _PAGE_GLOBAL | _PAGE_NX)
> +#define PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR_WC | \
> +                                 _PAGE_GLOBAL | _PAGE_NX)
> +#define PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR_WT | \
> +                                 _PAGE_GLOBAL | _PAGE_NX)
>  
>  #endif /* __X86_64_PAGE_H__ */
>  
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -5895,6 +5895,20 @@ void __iomem *ioremap(paddr_t pa, size_t
>      return (void __force __iomem *)va;
>  }
>  
> +void __iomem *__init ioremap_wc(paddr_t pa, size_t len)
> +{
> +    mfn_t mfn = _mfn(PFN_DOWN(pa));
> +    unsigned int offs = pa & (PAGE_SIZE - 1);
> +    unsigned int nr = PFN_UP(offs + len);
> +    void *va;
> +
> +    WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
> +
> +    va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_WC, VMAP_DEFAULT);
> +
> +    return (void __force __iomem *)(va + offs);
> +}
> +
>  int create_perdomain_mapping(struct domain *d, unsigned long va,
>                               unsigned int nr, l1_pgentry_t **pl1tab,
>                               struct page_info **ppg)
> --- a/xen/drivers/video/vesa.c
> +++ b/xen/drivers/video/vesa.c
> @@ -9,9 +9,9 @@
>  #include <xen/param.h>
>  #include <xen/xmalloc.h>
>  #include <xen/kernel.h>
> +#include <xen/mm.h>
>  #include <xen/vga.h>
>  #include <asm/io.h>
> -#include <asm/page.h>
>  #include "font.h"
>  #include "lfb.h"
>  
> @@ -103,7 +103,7 @@ void __init vesa_init(void)
>      lfbp.text_columns = vlfb_info.width / font->width;
>      lfbp.text_rows = vlfb_info.height / font->height;
>  
> -    lfbp.lfb = lfb = ioremap(lfb_base(), vram_remap);
> +    lfbp.lfb = lfb = ioremap_wc(lfb_base(), vram_remap);
>      if ( !lfb )
>          return;
>  
> @@ -179,8 +179,7 @@ void __init vesa_mtrr_init(void)
>  
>  static void lfb_flush(void)
>  {
> -    if ( vesa_mtrr == 3 )
> -        __asm__ __volatile__ ("sfence" : : : "memory");
> +    __asm__ __volatile__ ("sfence" : : : "memory");

Now that the cache attribute is forced to WC using PAT, don't we need
to drop vesa_mtrr_init and vesa_mtrr? All the more so since the option
is fully undocumented.

>  }
>  
>  void __init vesa_endboot(bool_t keep)
> --- a/xen/drivers/video/vga.c
> +++ b/xen/drivers/video/vga.c
> @@ -79,7 +79,7 @@ void __init video_init(void)
>      {
>      case XEN_VGATYPE_TEXT_MODE_3:
>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
>              return;
>          outw(0x200a, 0x3d4); /* disable cursor */
>          columns = vga_console_info.u.text_mode_3.columns;
> @@ -164,7 +164,11 @@ void __init video_endboot(void)
>      {
>      case XEN_VGATYPE_TEXT_MODE_3:
>          if ( !vgacon_keep )
> +        {
>              memset(video, 0, columns * lines * 2);
> +            iounmap(video);
> +            video = ZERO_BLOCK_PTR;
> +        }
>          break;
>      case XEN_VGATYPE_VESA_LFB:
>      case XEN_VGATYPE_EFI_LFB:

I think in vesa_endboot you also need to iounmap the framebuffer
iomem?

I would assume this was also required before your change, yet I'm not
finding any iounmap call that would do it.

Thanks, Roger.
Jan Beulich Feb. 17, 2022, 3:02 p.m. UTC | #2
On 17.02.2022 15:47, Roger Pau Monné wrote:
> On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
>> @@ -179,8 +179,7 @@ void __init vesa_mtrr_init(void)
>>  
>>  static void lfb_flush(void)
>>  {
>> -    if ( vesa_mtrr == 3 )
>> -        __asm__ __volatile__ ("sfence" : : : "memory");
>> +    __asm__ __volatile__ ("sfence" : : : "memory");
> 
> Now that the cache attribute is forced to WC using PAT don't we need
> to drop vesa_mtrr_init and vesa_mtrr? The more that the option is
> fully undocumented.

Yes indeed. You did ask to re-send this patch in isolation. This removal
is part of the full series.

>> --- a/xen/drivers/video/vga.c
>> +++ b/xen/drivers/video/vga.c
>> @@ -79,7 +79,7 @@ void __init video_init(void)
>>      {
>>      case XEN_VGATYPE_TEXT_MODE_3:
>>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
>> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
>> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
>>              return;
>>          outw(0x200a, 0x3d4); /* disable cursor */
>>          columns = vga_console_info.u.text_mode_3.columns;
>> @@ -164,7 +164,11 @@ void __init video_endboot(void)
>>      {
>>      case XEN_VGATYPE_TEXT_MODE_3:
>>          if ( !vgacon_keep )
>> +        {
>>              memset(video, 0, columns * lines * 2);
>> +            iounmap(video);
>> +            video = ZERO_BLOCK_PTR;
>> +        }
>>          break;
>>      case XEN_VGATYPE_VESA_LFB:
>>      case XEN_VGATYPE_EFI_LFB:
> 
> I think in vesa_endboot you also need to iounmap the framebuffer
> iomem?

Again part of the full series. I guess I was a little inconsistent
with leaving the VGA unmap in here, but breaking out the VESA part.
It's been a long time, but I guess I did so because the VESA part
needs to touch two files.

> I would assume this was also required before your change, yet I'm not
> finding any iounmap call that would do it.

Indeed, this has been missing all the time.

Jan
Roger Pau Monné Feb. 17, 2022, 3:50 p.m. UTC | #3
On Thu, Feb 17, 2022 at 04:02:39PM +0100, Jan Beulich wrote:
> On 17.02.2022 15:47, Roger Pau Monné wrote:
> > On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
> >> @@ -179,8 +179,7 @@ void __init vesa_mtrr_init(void)
> >>  
> >>  static void lfb_flush(void)
> >>  {
> >> -    if ( vesa_mtrr == 3 )
> >> -        __asm__ __volatile__ ("sfence" : : : "memory");
> >> +    __asm__ __volatile__ ("sfence" : : : "memory");
> > 
> > Now that the cache attribute is forced to WC using PAT don't we need
> > to drop vesa_mtrr_init and vesa_mtrr? The more that the option is
> > fully undocumented.
> 
> Yes indeed. You did ask to re-send this patch in isolation. This removal
> is part of the full series.
> 
> >> --- a/xen/drivers/video/vga.c
> >> +++ b/xen/drivers/video/vga.c
> >> @@ -79,7 +79,7 @@ void __init video_init(void)
> >>      {
> >>      case XEN_VGATYPE_TEXT_MODE_3:
> >>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
> >> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
> >> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
> >>              return;
> >>          outw(0x200a, 0x3d4); /* disable cursor */
> >>          columns = vga_console_info.u.text_mode_3.columns;
> >> @@ -164,7 +164,11 @@ void __init video_endboot(void)
> >>      {
> >>      case XEN_VGATYPE_TEXT_MODE_3:
> >>          if ( !vgacon_keep )
> >> +        {
> >>              memset(video, 0, columns * lines * 2);
> >> +            iounmap(video);
> >> +            video = ZERO_BLOCK_PTR;
> >> +        }
> >>          break;
> >>      case XEN_VGATYPE_VESA_LFB:
> >>      case XEN_VGATYPE_EFI_LFB:
> > 
> > I think in vesa_endboot you also need to iounmap the framebuffer
> > iomem?
> 
> Again part of the full series. I guess I was a little inconsistent
> with leaving the VGA unmap in here, but breaking out the VESA part.
> It's been a long time, but I guess I did so because the VESA part
> needs to touch two files.

Are you hesitant to include the chunks for the above items? (Or
maybe I'm not properly accounting for their complexity.)

Thanks, Roger.
Jan Beulich Feb. 17, 2022, 3:57 p.m. UTC | #4
On 17.02.2022 16:50, Roger Pau Monné wrote:
> On Thu, Feb 17, 2022 at 04:02:39PM +0100, Jan Beulich wrote:
>> On 17.02.2022 15:47, Roger Pau Monné wrote:
>>> On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
>>>> --- a/xen/drivers/video/vga.c
>>>> +++ b/xen/drivers/video/vga.c
>>>> @@ -79,7 +79,7 @@ void __init video_init(void)
>>>>      {
>>>>      case XEN_VGATYPE_TEXT_MODE_3:
>>>>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
>>>> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
>>>> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
>>>>              return;
>>>>          outw(0x200a, 0x3d4); /* disable cursor */
>>>>          columns = vga_console_info.u.text_mode_3.columns;
>>>> @@ -164,7 +164,11 @@ void __init video_endboot(void)
>>>>      {
>>>>      case XEN_VGATYPE_TEXT_MODE_3:
>>>>          if ( !vgacon_keep )
>>>> +        {
>>>>              memset(video, 0, columns * lines * 2);
>>>> +            iounmap(video);
>>>> +            video = ZERO_BLOCK_PTR;
>>>> +        }
>>>>          break;
>>>>      case XEN_VGATYPE_VESA_LFB:
>>>>      case XEN_VGATYPE_EFI_LFB:
>>>
>>> I think in vesa_endboot you also need to iounmap the framebuffer
>>> iomem?
>>
>> Again part of the full series. I guess I was a little inconsistent
>> with leaving the VGA unmap in here, but breaking out the VESA part.
>> It's been a long time, but I guess I did so because the VESA part
>> needs to touch two files.
> 
> I think you are hesitant to include the chunks for the above items? (or
> maybe I'm not properly accounting for their complexity).

There's no complexity, it's really just that the zapping of the pointer
needs to be done in a different place from where the unmap is. See below.

Jan

video/vesa: unmap frame buffer when relinquishing console

There's no point in keeping the VA space occupied when no further output
will occur.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- unstable.orig/xen/drivers/video/lfb.c
+++ unstable/xen/drivers/video/lfb.c
@@ -168,4 +168,5 @@ void lfb_free(void)
     xfree(lfb.lbuf);
     xfree(lfb.text_buf);
     xfree(lfb.line_len);
+    lfb.lfbp.lfb = ZERO_BLOCK_PTR;
 }
--- unstable.orig/xen/drivers/video/vesa.c
+++ unstable/xen/drivers/video/vesa.c
@@ -197,5 +197,7 @@ void __init vesa_endboot(bool_t keep)
                    vlfb_info.width * bpp);
         lfb_flush();
         lfb_free();
+        iounmap(lfb);
+        lfb = ZERO_BLOCK_PTR;
     }
 }
Roger Pau Monné Feb. 18, 2022, 9:09 a.m. UTC | #5
On Thu, Feb 17, 2022 at 04:57:41PM +0100, Jan Beulich wrote:
> On 17.02.2022 16:50, Roger Pau Monné wrote:
> > On Thu, Feb 17, 2022 at 04:02:39PM +0100, Jan Beulich wrote:
> >> On 17.02.2022 15:47, Roger Pau Monné wrote:
> >>> On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
> >>>> --- a/xen/drivers/video/vga.c
> >>>> +++ b/xen/drivers/video/vga.c
> >>>> @@ -79,7 +79,7 @@ void __init video_init(void)
> >>>>      {
> >>>>      case XEN_VGATYPE_TEXT_MODE_3:
> >>>>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
> >>>> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
> >>>> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
> >>>>              return;
> >>>>          outw(0x200a, 0x3d4); /* disable cursor */
> >>>>          columns = vga_console_info.u.text_mode_3.columns;
> >>>> @@ -164,7 +164,11 @@ void __init video_endboot(void)
> >>>>      {
> >>>>      case XEN_VGATYPE_TEXT_MODE_3:
> >>>>          if ( !vgacon_keep )
> >>>> +        {
> >>>>              memset(video, 0, columns * lines * 2);
> >>>> +            iounmap(video);
> >>>> +            video = ZERO_BLOCK_PTR;
> >>>> +        }
> >>>>          break;
> >>>>      case XEN_VGATYPE_VESA_LFB:
> >>>>      case XEN_VGATYPE_EFI_LFB:
> >>>
> >>> I think in vesa_endboot you also need to iounmap the framebuffer
> >>> iomem?
> >>
> >> Again part of the full series. I guess I was a little inconsistent
> >> with leaving the VGA unmap in here, but breaking out the VESA part.
> >> It's been a long time, but I guess I did so because the VESA part
> >> needs to touch two files.
> > 
> > I think you are hesitant to include the chunks for the above items? (or
> > maybe I'm not properly accounting for their complexity).
> 
> There's no complexity, it's really just that the zapping of the pointer
> needs to be done in a different place from where the unmap is. See below.
> 
> Jan
> 
> video/vesa: unmap frame buffer when relinquishing console
> 
> There's no point in keeping the VA space occupied when no further output
> will occur.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

For both patches, the one inline here and "x86: introduce
ioremap_wc()".

While at it, I think you should also push "video/vesa: drop
"vesa-mtrr" command line option".

Thanks, Roger.
Jan Beulich Feb. 18, 2022, 9:23 a.m. UTC | #6
On 18.02.2022 10:09, Roger Pau Monné wrote:
> On Thu, Feb 17, 2022 at 04:57:41PM +0100, Jan Beulich wrote:
>> On 17.02.2022 16:50, Roger Pau Monné wrote:
>>> On Thu, Feb 17, 2022 at 04:02:39PM +0100, Jan Beulich wrote:
>>>> On 17.02.2022 15:47, Roger Pau Monné wrote:
>>>>> On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
>>>>>> --- a/xen/drivers/video/vga.c
>>>>>> +++ b/xen/drivers/video/vga.c
>>>>>> @@ -79,7 +79,7 @@ void __init video_init(void)
>>>>>>      {
>>>>>>      case XEN_VGATYPE_TEXT_MODE_3:
>>>>>>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
>>>>>> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
>>>>>> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
>>>>>>              return;
>>>>>>          outw(0x200a, 0x3d4); /* disable cursor */
>>>>>>          columns = vga_console_info.u.text_mode_3.columns;
>>>>>> @@ -164,7 +164,11 @@ void __init video_endboot(void)
>>>>>>      {
>>>>>>      case XEN_VGATYPE_TEXT_MODE_3:
>>>>>>          if ( !vgacon_keep )
>>>>>> +        {
>>>>>>              memset(video, 0, columns * lines * 2);
>>>>>> +            iounmap(video);
>>>>>> +            video = ZERO_BLOCK_PTR;
>>>>>> +        }
>>>>>>          break;
>>>>>>      case XEN_VGATYPE_VESA_LFB:
>>>>>>      case XEN_VGATYPE_EFI_LFB:
>>>>>
>>>>> I think in vesa_endboot you also need to iounmap the framebuffer
>>>>> iomem?
>>>>
>>>> Again part of the full series. I guess I was a little inconsistent
>>>> with leaving the VGA unmap in here, but breaking out the VESA part.
>>>> It's been a long time, but I guess I did so because the VESA part
>>>> needs to touch two files.
>>>
>>> I think you are hesitant to include the chunks for the above items? (or
>>> maybe I'm not properly accounting for their complexity).
>>
>> There's no complexity, it's really just that the zapping of the pointer
>> needs to be done in a different place from where the unmap is. See below.
>>
>> Jan
>>
>> video/vesa: unmap frame buffer when relinquishing console
>>
>> There's no point in keeping the VA space occupied when no further output
>> will occur.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> For both patches, the one inline here and "x86: introduce
> ioremap_wc()".

Thanks. Actually, while looking back at the original thread, to re-check
what pending objections there might have been, I did find the reason for
the split: Without the VGA unmap, the patch here would have introduced a
new leak, while the other patch fixes an existing one.

> While at it, I think you should also push "video/vesa: drop
> "vesa-mtrr" command line option".

Yes, that one's merely dependent on the one here.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/include/asm/mm.h
+++ b/xen/arch/x86/include/asm/mm.h
@@ -602,6 +602,8 @@  void destroy_perdomain_mapping(struct do
                                unsigned int nr);
 void free_perdomain_mappings(struct domain *);
 
+void __iomem *ioremap_wc(paddr_t, size_t);
+
 extern int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm);
 
 void domain_set_alloc_bitsize(struct domain *d);
--- a/xen/arch/x86/include/asm/page.h
+++ b/xen/arch/x86/include/asm/page.h
@@ -349,8 +349,10 @@  void efi_update_l4_pgtable(unsigned int
 #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
 #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
                                    _PAGE_DIRTY | _PAGE_RW)
+#define __PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR | _PAGE_PWT)
 #define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
 #define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR | _PAGE_PAT)
 #define __PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
 
 #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
--- a/xen/arch/x86/include/asm/x86_64/page.h
+++ b/xen/arch/x86/include/asm/x86_64/page.h
@@ -152,6 +152,10 @@  static inline intpte_t put_pte_flags(uns
                                  _PAGE_GLOBAL | _PAGE_NX)
 #define PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR_UC | \
                                  _PAGE_GLOBAL | _PAGE_NX)
+#define PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR_WC | \
+                                 _PAGE_GLOBAL | _PAGE_NX)
+#define PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR_WT | \
+                                 _PAGE_GLOBAL | _PAGE_NX)
 
 #endif /* __X86_64_PAGE_H__ */
 
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5895,6 +5895,20 @@  void __iomem *ioremap(paddr_t pa, size_t
     return (void __force __iomem *)va;
 }
 
+void __iomem *__init ioremap_wc(paddr_t pa, size_t len)
+{
+    mfn_t mfn = _mfn(PFN_DOWN(pa));
+    unsigned int offs = pa & (PAGE_SIZE - 1);
+    unsigned int nr = PFN_UP(offs + len);
+    void *va;
+
+    WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
+
+    va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_WC, VMAP_DEFAULT);
+
+    return (void __force __iomem *)(va + offs);
+}
+
 int create_perdomain_mapping(struct domain *d, unsigned long va,
                              unsigned int nr, l1_pgentry_t **pl1tab,
                              struct page_info **ppg)
--- a/xen/drivers/video/vesa.c
+++ b/xen/drivers/video/vesa.c
@@ -9,9 +9,9 @@ 
 #include <xen/param.h>
 #include <xen/xmalloc.h>
 #include <xen/kernel.h>
+#include <xen/mm.h>
 #include <xen/vga.h>
 #include <asm/io.h>
-#include <asm/page.h>
 #include "font.h"
 #include "lfb.h"
 
@@ -103,7 +103,7 @@  void __init vesa_init(void)
     lfbp.text_columns = vlfb_info.width / font->width;
     lfbp.text_rows = vlfb_info.height / font->height;
 
-    lfbp.lfb = lfb = ioremap(lfb_base(), vram_remap);
+    lfbp.lfb = lfb = ioremap_wc(lfb_base(), vram_remap);
     if ( !lfb )
         return;
 
@@ -179,8 +179,7 @@  void __init vesa_mtrr_init(void)
 
 static void lfb_flush(void)
 {
-    if ( vesa_mtrr == 3 )
-        __asm__ __volatile__ ("sfence" : : : "memory");
+    __asm__ __volatile__ ("sfence" : : : "memory");
 }
 
 void __init vesa_endboot(bool_t keep)
--- a/xen/drivers/video/vga.c
+++ b/xen/drivers/video/vga.c
@@ -79,7 +79,7 @@  void __init video_init(void)
     {
     case XEN_VGATYPE_TEXT_MODE_3:
         if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
-             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
+             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
             return;
         outw(0x200a, 0x3d4); /* disable cursor */
         columns = vga_console_info.u.text_mode_3.columns;
@@ -164,7 +164,11 @@  void __init video_endboot(void)
     {
     case XEN_VGATYPE_TEXT_MODE_3:
         if ( !vgacon_keep )
+        {
             memset(video, 0, columns * lines * 2);
+            iounmap(video);
+            video = ZERO_BLOCK_PTR;
+        }
         break;
     case XEN_VGATYPE_VESA_LFB:
     case XEN_VGATYPE_EFI_LFB: