Message ID | 1493756885-29704-1-git-send-email-sgoel@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
[adding some /dev/mem fans to cc] On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: > Port architecture specific xlate and unxlate functions for /dev/mem > read/write. This sets up the mapping for a valid physical address if a > kernel direct mapping is not already present. > > This is a generic issue as a user space app should not be allowed to crash > the kernel. > This issue was observed when systemd tried to access performance > pointer record from the FPDT table. Why is it doing that? Is there not a way to get this via /sys? > Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem > read/write") > > Crash Signature: > Unable to handle kernel paging request at virtual address ffff800008ff0000 > pgd = ffff8007de8b2200 > [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000 > Internal error: Oops: 96000007 [#1] SMP > ................ > CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1 > task: ffff8007c0820000 task.stack: ffff8007c0900000 > PC is at __arch_copy_to_user+0xb4/0x280 > LR is at read_mem+0xc0/0x138 > pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>] > pstate: 80000145 > sp : ffff8007c0903d40 > .................... > x3 : ffff800800000000 x2 : 0000000000000008 > x1 : ffff800008ff0000 x0 : 0000fffff6fdac00 > .................... > Call trace: > Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0) > [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280 > [<ffff0000082454d0>] __vfs_read+0x48/0x130 > [<ffff0000082467dc>] vfs_read+0x8c/0x148 > [<ffff000008247a34>] SyS_pread64+0x94/0xa8 > [<ffff0000080833b0>] el0_svc_naked+0x24/0x28 So this certainly looks like a kernel bug, but I don't think your patch is the right way to fix it. > Code: a88120c7 d503201f d503201f 36180082 (f8408423) > > Signed-off-by: Sameer Goel <sgoel@codeaurora.org> > Tested-by: Shanker Donthineni <shankerd@codeaurora.org> > --- > arch/arm64/include/asm/io.h | 5 +++++ > arch/arm64/mm/ioremap.c | 31 +++++++++++++++++++++++++++++++ > 2 files changed, 36 insertions(+) > > diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h > index 0c00c87..c869ea4 100644 > --- a/arch/arm64/include/asm/io.h > +++ b/arch/arm64/include/asm/io.h > @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) > #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) > #define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) > > +extern void *xlate_dev_mem_ptr(phys_addr_t phys); > +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); > + > +#define xlate_dev_mem_ptr xlate_dev_mem_ptr > +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr > #include <asm-generic/io.h> > > /* > diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c > index c4c8cd4..ba7e63b 100644 > --- a/arch/arm64/mm/ioremap.c > +++ b/arch/arm64/mm/ioremap.c > @@ -24,6 +24,7 @@ > #include <linux/mm.h> > #include <linux/vmalloc.h> > #include <linux/io.h> > +#include <linux/memblock.h> > > #include <asm/fixmap.h> > #include <asm/tlbflush.h> > @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) > EXPORT_SYMBOL(ioremap_cache); > > /* > + * Convert a physical pointer to a virtual kernel pointer for /dev/mem > + * access > + */ > +void *xlate_dev_mem_ptr(phys_addr_t phys) > +{ > + unsigned long start = phys & PAGE_MASK; > + unsigned long offset = phys & ~PAGE_MASK; > + void *vaddr; > + > + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ > + if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys)) > + return __va(phys); > + > + vaddr = ioremap_cache(start, PAGE_SIZE); Blindly using ioremap like this looks unsafe, since we could accidentally set conflict with the attributes of a mapping used by something else (e.g. firmware running on another CPU). I'd like to understand more about the crash, so we can see work out how to fix this properly. Will
On 5/3/2017 5:26 AM, Will Deacon wrote: > [adding some /dev/mem fans to cc] > > On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: >> Port architecture specific xlate and unxlate functions for /dev/mem >> read/write. This sets up the mapping for a valid physical address if a >> kernel direct mapping is not already present. >> >> This is a generic issue as a user space app should not be allowed to crash >> the kernel. > >> This issue was observed when systemd tried to access performance >> pointer record from the FPDT table. > > Why is it doing that? Is there not a way to get this via /sys? There is no ACPI FPDT implementation in the kernel, so the userspace systemd code is getting the FPDT table contents from /sys and parsing the entries. The performance pointer record is a reserved address populated by UEFI and the userspace code tries to access it using /dev/mem. The physical address is valid, so cannot push back on the user space code. https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf 5.2.23 > >> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem >> read/write") >> >> Crash Signature: >> Unable to handle kernel paging request at virtual address ffff800008ff0000 >> pgd = ffff8007de8b2200 >> [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000 >> Internal error: Oops: 96000007 [#1] SMP >> ................ >> CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1 >> task: ffff8007c0820000 task.stack: ffff8007c0900000 >> PC is at __arch_copy_to_user+0xb4/0x280 >> LR is at read_mem+0xc0/0x138 >> pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>] >> pstate: 80000145 >> sp : ffff8007c0903d40 >> .................... >> x3 : ffff800800000000 x2 : 0000000000000008 >> x1 : ffff800008ff0000 x0 : 0000fffff6fdac00 >> .................... >> Call trace: >> Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0) >> [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280 >> [<ffff0000082454d0>] __vfs_read+0x48/0x130 >> [<ffff0000082467dc>] vfs_read+0x8c/0x148 >> [<ffff000008247a34>] SyS_pread64+0x94/0xa8 >> [<ffff0000080833b0>] el0_svc_naked+0x24/0x28 > > So this certainly looks like a kernel bug, but I don't think your patch is > the right way to fix it. I agree that the reserved regions are not meant to be accessed by the kernel as system ram. So, another option was to to return a NULL for this translation. Since, the same usage was working on other architectures I ported over the same code to highlight the issue. > >> Code: a88120c7 d503201f d503201f 36180082 (f8408423) >> >> Signed-off-by: Sameer Goel <sgoel@codeaurora.org> >> Tested-by: Shanker Donthineni <shankerd@codeaurora.org> >> --- >> arch/arm64/include/asm/io.h | 5 +++++ >> arch/arm64/mm/ioremap.c | 31 +++++++++++++++++++++++++++++++ >> 2 files changed, 36 insertions(+) >> >> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h >> index 0c00c87..c869ea4 100644 >> --- a/arch/arm64/include/asm/io.h >> +++ b/arch/arm64/include/asm/io.h >> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) >> #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) >> #define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) >> >> +extern void *xlate_dev_mem_ptr(phys_addr_t phys); >> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); >> + >> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr >> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr >> #include <asm-generic/io.h> >> >> /* >> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c >> index c4c8cd4..ba7e63b 100644 >> --- a/arch/arm64/mm/ioremap.c >> +++ b/arch/arm64/mm/ioremap.c >> @@ -24,6 +24,7 @@ >> #include <linux/mm.h> >> #include <linux/vmalloc.h> >> #include <linux/io.h> >> +#include <linux/memblock.h> >> >> #include <asm/fixmap.h> >> #include <asm/tlbflush.h> >> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) >> EXPORT_SYMBOL(ioremap_cache); >> >> /* >> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem >> + * access >> + */ >> +void *xlate_dev_mem_ptr(phys_addr_t phys) >> +{ >> + unsigned long start = phys & PAGE_MASK; >> + unsigned long offset = phys & ~PAGE_MASK; >> + void *vaddr; >> + >> + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ >> + if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys)) >> + return __va(phys); >> + >> + vaddr = ioremap_cache(start, PAGE_SIZE); > > Blindly using ioremap like this looks unsafe, since we could accidentally > set conflict with the attributes of a mapping used by something else (e.g. > firmware running on another CPU). > > I'd like to understand more about the crash, so we can see work out how to > fix this properly. > This does opens up access to any valid physical address. In the short term we can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP. Eventually we might need to add another memory type to make sure that it can be mapped. I have not though about the exact design here. Thanks, Sameer > Will >
On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: > On 5/3/2017 5:26 AM, Will Deacon wrote: > > [adding some /dev/mem fans to cc] > > > > On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: > >> Port architecture specific xlate and unxlate functions for /dev/mem > >> read/write. This sets up the mapping for a valid physical address if a > >> kernel direct mapping is not already present. > >> > >> This is a generic issue as a user space app should not be allowed to crash > >> the kernel. > > > >> This issue was observed when systemd tried to access performance > >> pointer record from the FPDT table. > > > > Why is it doing that? Is there not a way to get this via /sys? > > There is no ACPI FPDT implementation in the kernel, so the userspace > systemd code is getting the FPDT table contents from /sys > and parsing the entries. The performance pointer record is a > reserved address populated by UEFI and the userspace code tries to > access it using /dev/mem. The physical address is valid, so cannot > push back on the user space code. OK, so then we need to add support for parsing this table in the kernel and exposing the referred-to regions in a controllable fashion. Maybe something that belongs under /sys/firmware/efi (adding Matt), or maybe something that deserves its own driver. The only two use-cases for /dev/mem on arm64 are: - Implementing interfaces in the kernel takes up-front effort. - Being able to accidentally panic the kernel from userland. / Leif > https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c > http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf 5.2.23 > > > >> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem > >> read/write") > >> > >> Crash Signature: > >> Unable to handle kernel paging request at virtual address ffff800008ff0000 > >> pgd = ffff8007de8b2200 > >> [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000 > >> Internal error: Oops: 96000007 [#1] SMP > >> ................ > >> CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1 > >> task: ffff8007c0820000 task.stack: ffff8007c0900000 > >> PC is at __arch_copy_to_user+0xb4/0x280 > >> LR is at read_mem+0xc0/0x138 > >> pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>] > >> pstate: 80000145 > >> sp : ffff8007c0903d40 > >> .................... > >> x3 : ffff800800000000 x2 : 0000000000000008 > >> x1 : ffff800008ff0000 x0 : 0000fffff6fdac00 > >> .................... > >> Call trace: > >> Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0) > >> [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280 > >> [<ffff0000082454d0>] __vfs_read+0x48/0x130 > >> [<ffff0000082467dc>] vfs_read+0x8c/0x148 > >> [<ffff000008247a34>] SyS_pread64+0x94/0xa8 > >> [<ffff0000080833b0>] el0_svc_naked+0x24/0x28 > > > > So this certainly looks like a kernel bug, but I don't think your patch is > > the right way to fix it. > > I agree that the reserved regions are not meant to be accessed by the kernel as system > ram. So, another option was to to return a NULL for this translation. > > Since, the same usage was working on other architectures I ported over the same code to > highlight the issue. > > > > >> Code: a88120c7 d503201f d503201f 36180082 (f8408423) > >> > >> Signed-off-by: Sameer Goel <sgoel@codeaurora.org> > >> Tested-by: Shanker Donthineni <shankerd@codeaurora.org> > >> --- > >> arch/arm64/include/asm/io.h | 5 +++++ > >> arch/arm64/mm/ioremap.c | 31 +++++++++++++++++++++++++++++++ > >> 2 files changed, 36 insertions(+) > >> > >> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h > >> index 0c00c87..c869ea4 100644 > >> --- a/arch/arm64/include/asm/io.h > >> +++ b/arch/arm64/include/asm/io.h > >> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) > >> #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) > >> #define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) > >> > >> +extern void *xlate_dev_mem_ptr(phys_addr_t phys); > >> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); > >> + > >> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr > >> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr > >> #include <asm-generic/io.h> > >> > >> /* > >> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c > >> index c4c8cd4..ba7e63b 100644 > >> --- a/arch/arm64/mm/ioremap.c > >> +++ b/arch/arm64/mm/ioremap.c > >> @@ -24,6 +24,7 @@ > >> #include <linux/mm.h> > >> #include <linux/vmalloc.h> > >> #include <linux/io.h> > >> +#include <linux/memblock.h> > >> > >> #include <asm/fixmap.h> > >> #include <asm/tlbflush.h> > >> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) > >> EXPORT_SYMBOL(ioremap_cache); > >> > >> /* > >> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem > >> + * access > >> + */ > >> +void *xlate_dev_mem_ptr(phys_addr_t phys) > >> +{ > >> + unsigned long start = phys & PAGE_MASK; > >> + unsigned long offset = phys & ~PAGE_MASK; > >> + void *vaddr; > >> + > >> + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ > >> + if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys)) > >> + return __va(phys); > >> + > >> + vaddr = ioremap_cache(start, PAGE_SIZE); > > > > Blindly using ioremap like this looks unsafe, since we could accidentally > > set conflict with the attributes of a mapping used by something else (e.g. > > firmware running on another CPU). > > > > I'd like to understand more about the crash, so we can see work out how to > > fix this properly. > > > This does opens up access to any valid physical address. In the short term we > can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP. > > Eventually we might need to add another memory type to make sure that it can be mapped. > I have not though about the exact design here. > > Thanks, > Sameer > > > Will > > > > -- > Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
On 5/3/2017 2:18 PM, Leif Lindholm wrote: > On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: >> On 5/3/2017 5:26 AM, Will Deacon wrote: >>> [adding some /dev/mem fans to cc] >>> >>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: >>>> Port architecture specific xlate and unxlate functions for /dev/mem >>>> read/write. This sets up the mapping for a valid physical address if a >>>> kernel direct mapping is not already present. >>>> >>>> This is a generic issue as a user space app should not be allowed to crash >>>> the kernel. >>> >>>> This issue was observed when systemd tried to access performance >>>> pointer record from the FPDT table. >>> >>> Why is it doing that? Is there not a way to get this via /sys? >> >> There is no ACPI FPDT implementation in the kernel, so the userspace >> systemd code is getting the FPDT table contents from /sys >> and parsing the entries. The performance pointer record is a >> reserved address populated by UEFI and the userspace code tries to >> access it using /dev/mem. The physical address is valid, so cannot >> push back on the user space code. > > OK, so then we need to add support for parsing this table in the > kernel and exposing the referred-to regions in a controllable fashion. > Maybe something that belongs under /sys/firmware/efi (adding Matt), or > maybe something that deserves its own driver. > > The only two use-cases for /dev/mem on arm64 are: > - Implementing interfaces in the kernel takes up-front effort. > - Being able to accidentally panic the kernel from userland. > We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support. - Sameer > / > Leif > >> https://github.com/systemd/systemd/blob/master/src/shared/acpi-fpdt.c >> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf 5.2.23 >>> >>>> Ported from commit e045fb2a988a ("x86: PAT avoid aliasing in /dev/mem >>>> read/write") >>>> >>>> Crash Signature: >>>> Unable to handle kernel paging request at virtual address ffff800008ff0000 >>>> pgd = ffff8007de8b2200 >>>> [ffff800008ff0000] *pgd=0000000000000000, *pud=0000000000000000 >>>> Internal error: Oops: 96000007 [#1] SMP >>>> ................ >>>> CPU: 0 PID: 1 Comm: systemd Not tainted 4.10.0 #1 >>>> task: ffff8007c0820000 task.stack: ffff8007c0900000 >>>> PC is at __arch_copy_to_user+0xb4/0x280 >>>> LR is at read_mem+0xc0/0x138 >>>> pc : [<ffff0000084b3bb4>] lr : [<ffff00000869d178>] >>>> pstate: 80000145 >>>> sp : ffff8007c0903d40 >>>> .................... >>>> x3 : ffff800800000000 x2 : 0000000000000008 >>>> x1 : ffff800008ff0000 x0 : 0000fffff6fdac00 >>>> .................... >>>> Call trace: >>>> Exception stack(0xffff8007c0903b70 to 0xffff8007c0903ca0) >>>> [<ffff0000084b3bb4>] __arch_copy_to_user+0xb4/0x280 >>>> [<ffff0000082454d0>] __vfs_read+0x48/0x130 >>>> [<ffff0000082467dc>] vfs_read+0x8c/0x148 >>>> [<ffff000008247a34>] SyS_pread64+0x94/0xa8 >>>> [<ffff0000080833b0>] el0_svc_naked+0x24/0x28 >>> >>> So this certainly looks like a kernel bug, but I don't think your patch is >>> the right way to fix it. >> >> I agree that the reserved regions are not meant to be accessed by the kernel as system >> ram. So, another option was to to return a NULL for this translation. >> >> Since, the same usage was working on other architectures I ported over the same code to >> highlight the issue. >> >>> >>>> Code: a88120c7 d503201f d503201f 36180082 (f8408423) >>>> >>>> Signed-off-by: Sameer Goel <sgoel@codeaurora.org> >>>> Tested-by: Shanker Donthineni <shankerd@codeaurora.org> >>>> --- >>>> arch/arm64/include/asm/io.h | 5 +++++ >>>> arch/arm64/mm/ioremap.c | 31 +++++++++++++++++++++++++++++++ >>>> 2 files changed, 36 insertions(+) >>>> >>>> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h >>>> index 0c00c87..c869ea4 100644 >>>> --- a/arch/arm64/include/asm/io.h >>>> +++ b/arch/arm64/include/asm/io.h >>>> @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) >>>> #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) >>>> #define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) >>>> >>>> +extern void *xlate_dev_mem_ptr(phys_addr_t phys); >>>> +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); >>>> + >>>> +#define xlate_dev_mem_ptr xlate_dev_mem_ptr >>>> +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr >>>> #include <asm-generic/io.h> >>>> >>>> /* >>>> diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c >>>> index c4c8cd4..ba7e63b 100644 >>>> --- a/arch/arm64/mm/ioremap.c >>>> +++ b/arch/arm64/mm/ioremap.c >>>> @@ -24,6 +24,7 @@ >>>> #include <linux/mm.h> >>>> #include <linux/vmalloc.h> >>>> #include <linux/io.h> >>>> +#include <linux/memblock.h> >>>> >>>> #include <asm/fixmap.h> >>>> #include <asm/tlbflush.h> >>>> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) >>>> EXPORT_SYMBOL(ioremap_cache); >>>> >>>> /* >>>> + * Convert a physical pointer to a virtual kernel pointer for /dev/mem >>>> + * access >>>> + */ >>>> +void *xlate_dev_mem_ptr(phys_addr_t phys) >>>> +{ >>>> + unsigned long start = phys & PAGE_MASK; >>>> + unsigned long offset = phys & ~PAGE_MASK; >>>> + void *vaddr; >>>> + >>>> + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ >>>> + if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys)) >>>> + return __va(phys); >>>> + >>>> + vaddr = ioremap_cache(start, PAGE_SIZE); >>> >>> Blindly using ioremap like this looks unsafe, since we could accidentally >>> set conflict with the attributes of a mapping used by something else (e.g. >>> firmware running on another CPU). >>> >>> I'd like to understand more about the crash, so we can see work out how to >>> fix this properly. >>> >> This does opens up access to any valid physical address. In the short term we >> can block this crash by return NULL from this function if the memblock is MEMBLOCK_NOMAP. >> >> Eventually we might need to add another memory type to make sure that it can be mapped. >> I have not though about the exact design here. >> >> Thanks, >> Sameer >> >>> Will >>> >> >> -- >> Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project. >
On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote: > > > On 5/3/2017 2:18 PM, Leif Lindholm wrote: >> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: >>> On 5/3/2017 5:26 AM, Will Deacon wrote: >>>> [adding some /dev/mem fans to cc] >>>> >>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: >>>>> Port architecture specific xlate and unxlate functions for /dev/mem >>>>> read/write. This sets up the mapping for a valid physical address if a >>>>> kernel direct mapping is not already present. >>>>> >>>>> This is a generic issue as a user space app should not be allowed to crash >>>>> the kernel. >>>> >>>>> This issue was observed when systemd tried to access performance >>>>> pointer record from the FPDT table. >>>> >>>> Why is it doing that? Is there not a way to get this via /sys? >>> >>> There is no ACPI FPDT implementation in the kernel, so the userspace >>> systemd code is getting the FPDT table contents from /sys >>> and parsing the entries. The performance pointer record is a >>> reserved address populated by UEFI and the userspace code tries to >>> access it using /dev/mem. The physical address is valid, so cannot >>> push back on the user space code. >> >> OK, so then we need to add support for parsing this table in the >> kernel and exposing the referred-to regions in a controllable fashion. >> Maybe something that belongs under /sys/firmware/efi (adding Matt), or >> maybe something that deserves its own driver. >> >> The only two use-cases for /dev/mem on arm64 are: >> - Implementing interfaces in the kernel takes up-front effort. >> - Being able to accidentally panic the kernel from userland. >> > We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked > memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support. > I reported the same issue a couple of weeks ago [0]. So while we all agree that such accesses shouldn't oops the kernel, I think we may disagree on whether such accesses should be allowed in the first place, especially when using read/write on /dev/mem (as opposed to mmap()) One the UEFI/EDK2 side, there are two fundamental issues here, which we should resolve: - The use of EfiRuntimeServicesData for such regions: these tables have no significance to the firmware itself (not after ExitBootServices()) anyway, and so the only reason for choosing this memory type is that they are guaranteed to be left untouched by the OS. Also, using this type rather than something like 'ACPI Reclaim' results in the memory to be occupied regardless of whether you understand cq. are interested in its contents, which is something we usually try to avoid in UEFI. - The unconditional use of the EFI_MEMORY_RUNTIME attribute for EfiRuntimeServicesData regions, which requests the OS to create a runtime mapping for it in the OS page tables. We should be able to take this attribute as a cue that the firmware has no interest in its contents (given that it never requested a mapping for it) making it safe for the OS to map it with any attributes it likes. However, EDK2 currently does not provide this facility, i.e., the EFI_MEMORY_RUNTIME bit is always set. So modulo the feedback on my patch, I think that approach is preferred, and we should really look into cleaning this up further on the firmware side. For now, the userland fix is to use mmap() rather than read() (which is already documented in the code as something that should not be relied upon to remain available indefinitely). [0] http://marc.info/?l=linux-arm-kernel&m=149198561609050
On 5/4/2017 12:40 AM, Ard Biesheuvel wrote: > On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote: >> >> >> On 5/3/2017 2:18 PM, Leif Lindholm wrote: >>> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: >>>> On 5/3/2017 5:26 AM, Will Deacon wrote: >>>>> [adding some /dev/mem fans to cc] >>>>> >>>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: >>>>>> Port architecture specific xlate and unxlate functions for /dev/mem >>>>>> read/write. This sets up the mapping for a valid physical address if a >>>>>> kernel direct mapping is not already present. >>>>>> >>>>>> This is a generic issue as a user space app should not be allowed to crash >>>>>> the kernel. >>>>> >>>>>> This issue was observed when systemd tried to access performance >>>>>> pointer record from the FPDT table. >>>>> >>>>> Why is it doing that? Is there not a way to get this via /sys? >>>> >>>> There is no ACPI FPDT implementation in the kernel, so the userspace >>>> systemd code is getting the FPDT table contents from /sys >>>> and parsing the entries. The performance pointer record is a >>>> reserved address populated by UEFI and the userspace code tries to >>>> access it using /dev/mem. The physical address is valid, so cannot >>>> push back on the user space code. >>> >>> OK, so then we need to add support for parsing this table in the >>> kernel and exposing the referred-to regions in a controllable fashion. >>> Maybe something that belongs under /sys/firmware/efi (adding Matt), or >>> maybe something that deserves its own driver. >>> >>> The only two use-cases for /dev/mem on arm64 are: >>> - Implementing interfaces in the kernel takes up-front effort. >>> - Being able to accidentally panic the kernel from userland. >>> >> We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked >> memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support. >> > > I reported the same issue a couple of weeks ago [0]. So while we all > agree that such accesses shouldn't oops the kernel, I think we may > disagree on whether such accesses should be allowed in the first > place, especially when using read/write on /dev/mem (as opposed to > mmap()) > > One the UEFI/EDK2 side, there are two fundamental issues here, which > we should resolve: > - The use of EfiRuntimeServicesData for such regions: these tables > have no significance to the firmware itself (not after > ExitBootServices()) anyway, and so the only reason for choosing this > memory type is that they are guaranteed to be left untouched by the > OS. Also, using this type rather than something like 'ACPI Reclaim' > results in the memory to be occupied regardless of whether you > understand cq. are interested in its contents, which is something we > usually try to avoid in UEFI. > - The unconditional use of the EFI_MEMORY_RUNTIME attribute for > EfiRuntimeServicesData regions, which requests the OS to create a > runtime mapping for it in the OS page tables. We should be able to > take this attribute as a cue that the firmware has no interest in its > contents (given that it never requested a mapping for it) making it > safe for the OS to map it with any attributes it likes. However, EDK2 > currently does not provide this facility, i.e., the EFI_MEMORY_RUNTIME > bit is always set. > > So modulo the feedback on my patch, I think that approach is > preferred, and we should really look into cleaning this up further on > the firmware side. For now, the userland fix is to use mmap() rather > than read() (which is already documented in the code as something that > should not be relied upon to remain available indefinitely). > > > > > > [0] http://marc.info/?l=linux-arm-kernel&m=149198561609050 > Makes sense. I will pick up the patch mentioned in [0] for fixing my current issue. Thanks, Sameer
On Thu, May 04, 2017 at 07:40:50AM +0100, Ard Biesheuvel wrote: > On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote: > > > > > > On 5/3/2017 2:18 PM, Leif Lindholm wrote: > >> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: > >>> On 5/3/2017 5:26 AM, Will Deacon wrote: > >>>> [adding some /dev/mem fans to cc] > >>>> > >>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: > >>>>> Port architecture specific xlate and unxlate functions for /dev/mem > >>>>> read/write. This sets up the mapping for a valid physical address if a > >>>>> kernel direct mapping is not already present. > >>>>> > >>>>> This is a generic issue as a user space app should not be allowed to crash > >>>>> the kernel. > >>>> > >>>>> This issue was observed when systemd tried to access performance > >>>>> pointer record from the FPDT table. > >>>> > >>>> Why is it doing that? Is there not a way to get this via /sys? > >>> > >>> There is no ACPI FPDT implementation in the kernel, so the userspace > >>> systemd code is getting the FPDT table contents from /sys > >>> and parsing the entries. The performance pointer record is a > >>> reserved address populated by UEFI and the userspace code tries to > >>> access it using /dev/mem. The physical address is valid, so cannot > >>> push back on the user space code. > >> > >> OK, so then we need to add support for parsing this table in the > >> kernel and exposing the referred-to regions in a controllable fashion. > >> Maybe something that belongs under /sys/firmware/efi (adding Matt), or > >> maybe something that deserves its own driver. > >> > >> The only two use-cases for /dev/mem on arm64 are: > >> - Implementing interfaces in the kernel takes up-front effort. > >> - Being able to accidentally panic the kernel from userland. > >> > > We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked > > memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support. > > > > I reported the same issue a couple of weeks ago [0]. So while we all > agree that such accesses shouldn't oops the kernel, I think we may > disagree on whether such accesses should be allowed in the first > place, especially when using read/write on /dev/mem (as opposed to > mmap()) Did you plan to respin those patches to address Alex's comments? I agree that it would be good to close the oops, regardless of the rest of the discussion here. Will
> On 5 May 2017, at 15:55, Will Deacon <will.deacon@arm.com> wrote: > >> On Thu, May 04, 2017 at 07:40:50AM +0100, Ard Biesheuvel wrote: >>> On 3 May 2017 at 22:47, Goel, Sameer <sgoel@codeaurora.org> wrote: >>> >>> >>>> On 5/3/2017 2:18 PM, Leif Lindholm wrote: >>>>> On Wed, May 03, 2017 at 11:07:45AM -0600, Goel, Sameer wrote: >>>>>> On 5/3/2017 5:26 AM, Will Deacon wrote: >>>>>> [adding some /dev/mem fans to cc] >>>>>> >>>>>>> On Tue, May 02, 2017 at 02:28:05PM -0600, Sameer Goel wrote: >>>>>>> Port architecture specific xlate and unxlate functions for /dev/mem >>>>>>> read/write. This sets up the mapping for a valid physical address if a >>>>>>> kernel direct mapping is not already present. >>>>>>> >>>>>>> This is a generic issue as a user space app should not be allowed to crash >>>>>>> the kernel. >>>>>> >>>>>>> This issue was observed when systemd tried to access performance >>>>>>> pointer record from the FPDT table. >>>>>> >>>>>> Why is it doing that? Is there not a way to get this via /sys? >>>>> >>>>> There is no ACPI FPDT implementation in the kernel, so the userspace >>>>> systemd code is getting the FPDT table contents from /sys >>>>> and parsing the entries. The performance pointer record is a >>>>> reserved address populated by UEFI and the userspace code tries to >>>>> access it using /dev/mem. The physical address is valid, so cannot >>>>> push back on the user space code. >>>> >>>> OK, so then we need to add support for parsing this table in the >>>> kernel and exposing the referred-to regions in a controllable fashion. >>>> Maybe something that belongs under /sys/firmware/efi (adding Matt), or >>>> maybe something that deserves its own driver. >>>> >>>> The only two use-cases for /dev/mem on arm64 are: >>>> - Implementing interfaces in the kernel takes up-front effort. >>>> - Being able to accidentally panic the kernel from userland. >>>> >>> We will see this issue with any access using /dev/mem to a MEMBLOCK_NOMAP marked >>> memblock. The kernel crash issue has to be fixed irrespective of ACPI FPDT support. >>> >> >> I reported the same issue a couple of weeks ago [0]. So while we all >> agree that such accesses shouldn't oops the kernel, I think we may >> disagree on whether such accesses should be allowed in the first >> place, especially when using read/write on /dev/mem (as opposed to >> mmap()) > > Did you plan to respin those patches to address Alex's comments? I agree > that it would be good to close the oops, regardless of the rest of the > discussion here. Agreed. I will look into this after my vacation (back on May 15th)
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 0c00c87..c869ea4 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -183,6 +183,11 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define iowrite32be(v,p) ({ __iowmb(); __raw_writel((__force __u32)cpu_to_be32(v), p); }) #define iowrite64be(v,p) ({ __iowmb(); __raw_writeq((__force __u64)cpu_to_be64(v), p); }) +extern void *xlate_dev_mem_ptr(phys_addr_t phys); +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); + +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr #include <asm-generic/io.h> /* diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index c4c8cd4..ba7e63b 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -24,6 +24,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/io.h> +#include <linux/memblock.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> @@ -105,6 +106,36 @@ void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size) EXPORT_SYMBOL(ioremap_cache); /* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(phys_addr_t phys) +{ + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + void *vaddr; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT) && memblock_is_memory(phys)) + return __va(phys); + + vaddr = ioremap_cache(start, PAGE_SIZE); + /* Add the offset on success and return NULL if ioremap() failed */ + if (vaddr) + vaddr += offset; + + return vaddr; +} + +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); +} + +/* * Must be called after early_fixmap_init */ void __init early_ioremap_init(void)