Message ID | 20200317165409.469013-1-colin.king@canonical.com (mailing list archive) |
---|---|
State | Mainlined, archived |
Headers | show |
Series | [V2] ACPI: sysfs: copy ACPI data using io memory copying | expand |
On Tue, Mar 17, 2020 at 5:54 PM Colin King <colin.king@canonical.com> wrote: > > From: Colin Ian King <colin.king@canonical.com> > > Reading ACPI data on ARM64 at a non-aligned offset from > /sys/firmware/acpi/tables/data/BERT will cause a splat because > the data is I/O memory mapped and being read with just a memcpy. > Fix this by introducing an I/O variant of memory_read_from_buffer > and using I/O memory mapped copies instead. > > Fixes the following splat: > > [ 439.789355] Unable to handle kernel paging request at virtual address ffff800041ac0007 > [ 439.797275] Mem abort info: > [ 439.800078] ESR = 0x96000021 > [ 439.803131] EC = 0x25: DABT (current EL), IL = 32 bits > [ 439.808437] SET = 0, FnV = 0 > [ 439.811486] EA = 0, S1PTW = 0 > [ 439.814621] Data abort info: > [ 439.817489] ISV = 0, ISS = 0x00000021 > [ 439.821319] CM = 0, WnR = 0 > [ 439.824282] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000817fc000 > [ 439.830979] [ffff800041ac0007] pgd=000000bffcfff003, pud=0000009f27cee003, pmd=000000bf4b993003, pte=0068000080280703 > [ 439.841584] Internal error: Oops: 96000021 [#1] SMP > [ 439.846449] Modules linked in: nls_iso8859_1 dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua ipmi_ssif input_leds joydev ipmi_devintf ipmi_msghandler thunderx2_pmu sch_fq_codel ip_tables x_tables autofs4 btrfs zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor xor_neon raid6_pq libcrc32c raid1 raid0 multipath linear i2c_smbus ast i2c_algo_bit crct10dif_ce drm_vram_helper uas ttm ghash_ce drm_kms_helper sha2_ce syscopyarea sha256_arm64 qede sysfillrect mpt3sas sha1_ce sysimgblt fb_sys_fops raid_class qed drm scsi_transport_sas usb_storage ahci crc8 gpio_xlp i2c_xlp9xx hid_generic usbhid hid aes_neon_bs aes_neon_blk aes_ce_blk crypto_simd cryptd aes_ce_cipher > [ 439.908474] CPU: 2 PID: 3926 Comm: a.out Not tainted 5.4.0-14-generic #17-Ubuntu > [ 439.915855] Hardware name: To be filled by O.E.M. 
Saber/Saber, BIOS 0ACKL027 07/01/2019 > [ 439.923844] pstate: 80400009 (Nzcv daif +PAN -UAO) > [ 439.928625] pc : __memcpy+0x90/0x180 > [ 439.932192] lr : memory_read_from_buffer+0x64/0x88 > [ 439.936968] sp : ffff8000350dbc70 > [ 439.940270] x29: ffff8000350dbc70 x28: ffff009e9c444b00 > [ 439.945568] x27: 0000000000000000 x26: 0000000000000000 > [ 439.950866] x25: 0000000056000000 x24: ffff800041ac0000 > [ 439.956164] x23: ffff009ea163f980 x22: 0000000000000007 > [ 439.961462] x21: ffff8000350dbce8 x20: 000000000000000e > [ 439.966760] x19: 0000000000000007 x18: ffff8000112f64a8 > [ 439.972058] x17: 0000000000000000 x16: 0000000000000000 > [ 439.977355] x15: 0000000080280000 x14: ffff800041aed000 > [ 439.982653] x13: ffff009ee9fa2840 x12: ffff800041ad1000 > [ 439.987951] x11: ffff8000115e1360 x10: ffff8000115e1360 > [ 439.993248] x9 : 0000000000010000 x8 : ffff800011ad2658 > [ 439.998546] x7 : ffff800041ac0000 x6 : ffff009ea163f980 > [ 440.003844] x5 : 0140000000000000 x4 : 0000000000010000 > [ 440.009141] x3 : ffff800041ac0000 x2 : 0000000000000007 > [ 440.014439] x1 : ffff800041ac0007 x0 : ffff009ea163f980 > [ 440.019737] Call trace: > [ 440.022173] __memcpy+0x90/0x180 > [ 440.025392] acpi_data_show+0x54/0x80 > [ 440.029044] sysfs_kf_bin_read+0x6c/0xa8 > [ 440.032954] kernfs_file_direct_read+0x90/0x2d0 > [ 440.037470] kernfs_fop_read+0x68/0x78 > [ 440.041210] __vfs_read+0x48/0x90 > [ 440.044511] vfs_read+0xd0/0x1a0 > [ 440.047726] ksys_read+0x78/0x100 > [ 440.051028] __arm64_sys_read+0x24/0x30 > [ 440.054852] el0_svc_common.constprop.0+0xdc/0x1d8 > [ 440.059629] el0_svc_handler+0x34/0xa0 > [ 440.063366] el0_svc+0x10/0x14 > [ 440.066411] Code: 36180062 f8408423 f80084c3 36100062 (b8404423) > [ 440.072492] ---[ end trace 45fb374e8d2d800e ]--- > > A simple reproducer is as follows: > > int main(void) > { > int fd; > char buffer[7]; > ssize_t n; > > fd = open("/sys/firmware/acpi/tables/data/BERT", O_RDONLY); > if (fd < 0) { > perror("open failed"); > return -1; > } 
> do { > n = read(fd, buffer, sizeof(buffer)); > } while (n > 0); > > return 0; > } > > Signed-off-by: Colin Ian King <colin.king@canonical.com> > --- > V2: Add missing #include <linux/io.h> without which we get > a build failure when building with allnoconfig > --- > drivers/acpi/sysfs.c | 2 +- > fs/libfs.c | 34 ++++++++++++++++++++++++++++++++++ > include/linux/string.h | 2 ++ > 3 files changed, 37 insertions(+), 1 deletion(-) > > diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c > index c60d2c6..fb9e216 100644 > --- a/drivers/acpi/sysfs.c > +++ b/drivers/acpi/sysfs.c > @@ -446,7 +446,7 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, > base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); > if (!base) > return -ENOMEM; > - rc = memory_read_from_buffer(buf, count, &offset, base, > + rc = memory_read_from_io_buffer(buf, count, &offset, base, > data_attr->attr.size); > acpi_os_unmap_memory(base, data_attr->attr.size); > > diff --git a/fs/libfs.c b/fs/libfs.c > index c686bd9..1a49da1 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -20,6 +20,7 @@ > #include <linux/fs_context.h> > #include <linux/pseudo_fs.h> > #include <linux/fsnotify.h> > +#include <linux/io.h> > > #include <linux/uaccess.h> > > @@ -800,6 +801,39 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, > } > EXPORT_SYMBOL(memory_read_from_buffer); > > +/** > + * memory_read_from_io_buffer - copy data from a io memory mapped buffer > + * @to: the kernel space buffer to read to > + * @count: the maximum number of bytes to read > + * @ppos: the current position in the buffer > + * @from: the buffer to read from > + * @available: the size of the buffer > + * > + * The memory_read_from_buffer() function reads up to @count bytes from the > + * io memory mappy buffer @from at offset @ppos into the kernel space address > + * starting at @to. 
> + * > + * On success, the number of bytes read is returned and the offset @ppos is > + * advanced by this number, or negative value is returned on error. > + **/ > +ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, > + const void *from, size_t available) > +{ > + loff_t pos = *ppos; > + > + if (pos < 0) > + return -EINVAL; > + if (pos >= available) > + return 0; > + if (count > available - pos) > + count = available - pos; > + memcpy_fromio(to, from + pos, count); > + *ppos = pos + count; > + > + return count; > +} > +EXPORT_SYMBOL(memory_read_from_io_buffer); > + > /* > * Transaction based IO. > * The file expects a single write which triggers the transaction, and then > diff --git a/include/linux/string.h b/include/linux/string.h > index 6dfbb2e..0c6ec2a 100644 > --- a/include/linux/string.h > +++ b/include/linux/string.h > @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); > > extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, > const void *from, size_t available); > +extern ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, > + const void *from, size_t available); > > int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); > > -- I've replaced the v1 with this one, thanks!
[adding James and Lorenzo] On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: > From: Colin Ian King <colin.king@canonical.com> > > Reading ACPI data on ARM64 at a non-aligned offset from > /sys/firmware/acpi/tables/data/BERT will cause a splat because > the data is I/O memory mapped and being read with just a memcpy. > Fix this by introducing an I/O variant of memory_read_from_buffer > and using I/O memory mapped copies instead. Just to check, is that correct is it correct to map those tables with Device attributes in the first place, or should we be mapping the tables with Normal Cacheable attributes with memremap()? If the FW placed those into memory using cacheavble attributes, reading them using Device attributes could result in stale values, which could be garbage. Thanks, Mark. > > Fixes the following splat: > > [ 439.789355] Unable to handle kernel paging request at virtual address ffff800041ac0007 > [ 439.797275] Mem abort info: > [ 439.800078] ESR = 0x96000021 > [ 439.803131] EC = 0x25: DABT (current EL), IL = 32 bits > [ 439.808437] SET = 0, FnV = 0 > [ 439.811486] EA = 0, S1PTW = 0 > [ 439.814621] Data abort info: > [ 439.817489] ISV = 0, ISS = 0x00000021 > [ 439.821319] CM = 0, WnR = 0 > [ 439.824282] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000817fc000 > [ 439.830979] [ffff800041ac0007] pgd=000000bffcfff003, pud=0000009f27cee003, pmd=000000bf4b993003, pte=0068000080280703 > [ 439.841584] Internal error: Oops: 96000021 [#1] SMP > [ 439.846449] Modules linked in: nls_iso8859_1 dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua ipmi_ssif input_leds joydev ipmi_devintf ipmi_msghandler thunderx2_pmu sch_fq_codel ip_tables x_tables autofs4 btrfs zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor xor_neon raid6_pq libcrc32c raid1 raid0 multipath linear i2c_smbus ast i2c_algo_bit crct10dif_ce drm_vram_helper uas ttm ghash_ce drm_kms_helper sha2_ce syscopyarea sha256_arm64 qede sysfillrect mpt3sas 
sha1_ce sysimgblt fb_sys_fops raid_class qed drm scsi_transport_sas usb_storage ahci crc8 gpio_xlp i2c_xlp9xx hid_generic usbhid hid aes_neon_bs aes_neon_blk aes_ce_blk crypto_simd cryptd aes_ce_cipher > [ 439.908474] CPU: 2 PID: 3926 Comm: a.out Not tainted 5.4.0-14-generic #17-Ubuntu > [ 439.915855] Hardware name: To be filled by O.E.M. Saber/Saber, BIOS 0ACKL027 07/01/2019 > [ 439.923844] pstate: 80400009 (Nzcv daif +PAN -UAO) > [ 439.928625] pc : __memcpy+0x90/0x180 > [ 439.932192] lr : memory_read_from_buffer+0x64/0x88 > [ 439.936968] sp : ffff8000350dbc70 > [ 439.940270] x29: ffff8000350dbc70 x28: ffff009e9c444b00 > [ 439.945568] x27: 0000000000000000 x26: 0000000000000000 > [ 439.950866] x25: 0000000056000000 x24: ffff800041ac0000 > [ 439.956164] x23: ffff009ea163f980 x22: 0000000000000007 > [ 439.961462] x21: ffff8000350dbce8 x20: 000000000000000e > [ 439.966760] x19: 0000000000000007 x18: ffff8000112f64a8 > [ 439.972058] x17: 0000000000000000 x16: 0000000000000000 > [ 439.977355] x15: 0000000080280000 x14: ffff800041aed000 > [ 439.982653] x13: ffff009ee9fa2840 x12: ffff800041ad1000 > [ 439.987951] x11: ffff8000115e1360 x10: ffff8000115e1360 > [ 439.993248] x9 : 0000000000010000 x8 : ffff800011ad2658 > [ 439.998546] x7 : ffff800041ac0000 x6 : ffff009ea163f980 > [ 440.003844] x5 : 0140000000000000 x4 : 0000000000010000 > [ 440.009141] x3 : ffff800041ac0000 x2 : 0000000000000007 > [ 440.014439] x1 : ffff800041ac0007 x0 : ffff009ea163f980 > [ 440.019737] Call trace: > [ 440.022173] __memcpy+0x90/0x180 > [ 440.025392] acpi_data_show+0x54/0x80 > [ 440.029044] sysfs_kf_bin_read+0x6c/0xa8 > [ 440.032954] kernfs_file_direct_read+0x90/0x2d0 > [ 440.037470] kernfs_fop_read+0x68/0x78 > [ 440.041210] __vfs_read+0x48/0x90 > [ 440.044511] vfs_read+0xd0/0x1a0 > [ 440.047726] ksys_read+0x78/0x100 > [ 440.051028] __arm64_sys_read+0x24/0x30 > [ 440.054852] el0_svc_common.constprop.0+0xdc/0x1d8 > [ 440.059629] el0_svc_handler+0x34/0xa0 > [ 440.063366] el0_svc+0x10/0x14 > [ 
440.066411] Code: 36180062 f8408423 f80084c3 36100062 (b8404423) > [ 440.072492] ---[ end trace 45fb374e8d2d800e ]--- > > A simple reproducer is as follows: > > int main(void) > { > int fd; > char buffer[7]; > ssize_t n; > > fd = open("/sys/firmware/acpi/tables/data/BERT", O_RDONLY); > if (fd < 0) { > perror("open failed"); > return -1; > } > do { > n = read(fd, buffer, sizeof(buffer)); > } while (n > 0); > > return 0; > } > > Signed-off-by: Colin Ian King <colin.king@canonical.com> > --- > V2: Add missing #include <linux/io.h> without which we get > a build failure when building with allnoconfig > --- > drivers/acpi/sysfs.c | 2 +- > fs/libfs.c | 34 ++++++++++++++++++++++++++++++++++ > include/linux/string.h | 2 ++ > 3 files changed, 37 insertions(+), 1 deletion(-) > > diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c > index c60d2c6..fb9e216 100644 > --- a/drivers/acpi/sysfs.c > +++ b/drivers/acpi/sysfs.c > @@ -446,7 +446,7 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, > base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); > if (!base) > return -ENOMEM; > - rc = memory_read_from_buffer(buf, count, &offset, base, > + rc = memory_read_from_io_buffer(buf, count, &offset, base, > data_attr->attr.size); > acpi_os_unmap_memory(base, data_attr->attr.size); > > diff --git a/fs/libfs.c b/fs/libfs.c > index c686bd9..1a49da1 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -20,6 +20,7 @@ > #include <linux/fs_context.h> > #include <linux/pseudo_fs.h> > #include <linux/fsnotify.h> > +#include <linux/io.h> > > #include <linux/uaccess.h> > > @@ -800,6 +801,39 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, > } > EXPORT_SYMBOL(memory_read_from_buffer); > > +/** > + * memory_read_from_io_buffer - copy data from a io memory mapped buffer > + * @to: the kernel space buffer to read to > + * @count: the maximum number of bytes to read > + * @ppos: the current position in the buffer > + * @from: the buffer to 
read from > + * @available: the size of the buffer > + * > + * The memory_read_from_buffer() function reads up to @count bytes from the > + * io memory mappy buffer @from at offset @ppos into the kernel space address > + * starting at @to. > + * > + * On success, the number of bytes read is returned and the offset @ppos is > + * advanced by this number, or negative value is returned on error. > + **/ > +ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, > + const void *from, size_t available) > +{ > + loff_t pos = *ppos; > + > + if (pos < 0) > + return -EINVAL; > + if (pos >= available) > + return 0; > + if (count > available - pos) > + count = available - pos; > + memcpy_fromio(to, from + pos, count); > + *ppos = pos + count; > + > + return count; > +} > +EXPORT_SYMBOL(memory_read_from_io_buffer); > + > /* > * Transaction based IO. > * The file expects a single write which triggers the transaction, and then > diff --git a/include/linux/string.h b/include/linux/string.h > index 6dfbb2e..0c6ec2a 100644 > --- a/include/linux/string.h > +++ b/include/linux/string.h > @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); > > extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, > const void *from, size_t available); > +extern ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, > + const void *from, size_t available); > > int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); > > -- > 2.7.4 >
Hello! On 3/20/20 1:19 PM, Mark Rutland wrote: > [adding James and Lorenzo] (but not actually...) > On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: >> From: Colin Ian King <colin.king@canonical.com> >> >> Reading ACPI data on ARM64 at a non-aligned offset from >> /sys/firmware/acpi/tables/data/BERT will cause a splat because >> the data is I/O memory mapped On your platform; on someone else's it may be in memory. Which platform is this on? (I've never seen one generate a BERT!) >> and being read with just a memcpy. >> Fix this by introducing an I/O variant of memory_read_from_buffer >> and using I/O memory mapped copies instead. > Just to check, is it correct to map those tables with > Device attributes in the first place, or should we be mapping the tables > with Normal Cacheable attributes with memremap()? > > If the FW placed those into memory using cacheable attributes, reading > them using Device attributes could result in stale values, which could > be garbage. Yes. The BERT code should be using arch_apei_get_mem_attribute() to use the correct attributes. See ghes_map() for an example. bert_init() will need to use a version of ioremap() that takes the pgprot_t. Always using ioremap_cache() means you get a cacheable mapping, regardless of how firmware described this region in the UEFI memory map. This doesn't explain why you got an alignment fault. Otherwise, looks fine to me. (N.B. I ignored this patch as it wasn't copied to linux-arm-kernel and the subject says it's about sysfs<->ACPI, nothing to do with APEI!) Thanks, James
> On Apr 1, 2020, at 5:44 AM, James Morse <james.morse@arm.com> wrote: > > Hello! > > On 3/20/20 1:19 PM, Mark Rutland wrote: >> [adding James and Lorenzo] > > (but not actually...) > > >> On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: >>> From: Colin Ian King <colin.king@canonical.com> >>> >>> Reading ACPI data on ARM64 at a non-aligned offset from >>> /sys/firmware/acpi/tables/data/BERT will cause a splat because >>> the data is I/O memory mapped > > On your platform; on someone else's it may be in memory. > > Which platform is this on? > (I've never seen one generate a BERT!) I have seen this on several platforms. The latest is an Altra-based machine. It shows up in the Linux Test Project: ltp test "read_all -d /sys -q -r 10". > > >>> and being read with just a memcpy. >>> Fix this by introducing an I/O variant of memory_read_from_buffer >>> and using I/O memory mapped copies instead. > >> Just to check, is it correct to map those tables with >> Device attributes in the first place, or should we be mapping the tables >> with Normal Cacheable attributes with memremap()? >> >> If the FW placed those into memory using cacheable attributes, reading >> them using Device attributes could result in stale values, which could >> be garbage. > > Yes. The BERT code should be using arch_apei_get_mem_attribute() to use the > correct attributes. See ghes_map() for an example. bert_init() will need to use > a version of ioremap() that takes the pgprot_t. > > Always using ioremap_cache() means you get a cacheable mapping, regardless of > how firmware described this region in the UEFI memory map. This doesn't explain > why you got an alignment fault. The BERT error region doesn’t appear in the UEFI memory map on any of the systems I have looked at. This means that acpi_os_map_memory() will always map the area as PROT_DEVICE_nGnRnE, which results in an alignment fault on an unaligned access. For some reason this does not fail on some implementations.
It isn’t clear to me from the ACPI spec whether this can be in anything other than normal memory as bert_init() seems to assume it is. We have used this patch to resolve this problem on the assumption it will eventually make it into the mainline kernel. Is there any chance this will happen? Thanks, Henry > > Otherwise, looks fine to me. > > > (N.B. I ignored this patch as it wasn't copied to linux-arm-kernel and the > subject says its about sysfs<->ACPI, nothing to do with APEI!) > > > Thanks, > > James >
On Mon, Feb 28, 2022 at 11:51:23PM +0000, Henry Willard wrote: > > > > On Apr 1, 2020, at 5:44 AM, James Morse <james.morse@arm.com> wrote: > > > > Hello! > > > > On 3/20/20 1:19 PM, Mark Rutland wrote: > >> [adding James and Lorenzo] > > > > (but not actually...) > > > > > >> On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: > >>> From: Colin Ian King <colin.king@canonical.com> > >>> > >>> Reading ACPI data on ARM64 at a non-aligned offset from > >>> /sys/firmware/acpi/tables/data/BERT will cause a splat because > >>> the data is I/O memory mapped > > > > On your platform, on someone else's it may be in memory. > > > > Which platform is this on? > > (I've never seen one generate a BERT!) > > I have seen this on several platforms. The latest is an Altra based machine. It shows up in the Linux Test Project: ltp test "read_all -d /sys -q -r 10”. > > > > > >>> and being read with just a memcpy. > >>> Fix this by introducing an I/O variant of memory_read_from_buffer > >>> and using I/O memory mapped copies instead. > > > >> Just to check, is that correct is it correct to map those tables with > >> Device attributes in the first place, or should we be mapping the tables > >> with Normal Cacheable attributes with memremap()? > >> > >> If the FW placed those into memory using cacheavble attributes, reading > >> them using Device attributes could result in stale values, which could > >> be garbage. > > > > Yes. The BERT code should be using arch_apei_get_mem_attribute() to use the > > correct attributes. See ghes_map() for an example. bert_init() will need to use > > a version of ioremap() that takes the pgprot_t. > > > > Always using ioremap_cache() means you get a cacheable mapping, regardless of > > how firmware described this region in the UEFI memory map. This doesn't explain > > why you got an alignment fault. > > The BERT error region doesn’t appear in the UEFI memory map on any of the systems I have looked at. 
This means that acpi_os_map_memory() will always map the area as PROT_DEVICE_nGnRnE, which results in an alignment fault on an unaligned access. For some reason this does not fail on some implementations. > > It isn’t clear to me from the ACPI spec whether this can be in anything other than normal memory as bert_init() seems to assume it is. > > We have used this patch to resolve this problem on the assumption it > will eventually make it into the mainline kernel. Is there any chance > this will happen? Yes this should be fixed. I put together a patch that I needed to post, below. -- >8 -- Subject: [PATCH] ACPI: osl: Fix BERT error region memory mapping Currently the sysfs interface maps the BERT error region as "memory" (through acpi_os_map_memory()) in order to copy the error records into memory buffers through memory operations (eg memory_read_from_buffer()). The OS system cannot detect whether the BERT error region is part of system RAM or it is "device memory" (eg BMC memory) and therefore it cannot detect which memory attributes the bus to memory support (and corresponding kernel mapping, unless firmware provides the required information). The acpi_os_map_memory() arch backend implementation determines the mapping attributes. On arm64, if the BERT error region is not present in the EFI memory map, the error region is mapped as device-nGnRnE; this triggers alignment faults since memcpy unaligned accesses are not allowed in device-nGnRnE regions. The ACPI sysfs code cannot therefore map by default the BERT error region with memory semantics but should use a safer default. Change the sysfs code to map the BERT error region as MMIO (through acpi_os_map_iomem()) and use the memcpy_fromio() interface to read the error region into the kernel buffer. 
Link: https://lore.kernel.org/linux-arm-kernel/31ffe8fc-f5ee-2858-26c5-0fd8bdd68702@arm.com Link: https://lore.kernel.org/linux-acpi/CAJZ5v0g+OVbhuUUDrLUCfX_mVqY_e8ubgLTU98=jfjTeb4t+Pw@mail.gmail.com Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> Cc: Ard Biesheuvel <ardb@kernel.org> Cc: Will Deacon <will@kernel.org> Cc: Hanjun Guo <guohanjun@huawei.com> Cc: Sudeep Holla <sudeep.holla@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> --- drivers/acpi/sysfs.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index a4b638bea6f1..cc2fe0618178 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -415,19 +415,30 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, loff_t offset, size_t count) { struct acpi_data_attr *data_attr; - void *base; - ssize_t rc; + void __iomem *base; + ssize_t size; data_attr = container_of(bin_attr, struct acpi_data_attr, attr); + size = data_attr->attr.size; + + if (offset < 0) + return -EINVAL; + + if (offset >= size) + return 0; - base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); + if (count > size - offset) + count = size - offset; + + base = acpi_os_map_iomem(data_attr->addr, size); if (!base) return -ENOMEM; - rc = memory_read_from_buffer(buf, count, &offset, base, - data_attr->attr.size); - acpi_os_unmap_memory(base, data_attr->attr.size); - return rc; + memcpy_fromio(buf, base + offset, count); + + acpi_os_unmap_iomem(base, size); + + return count; } static int acpi_bert_data_init(void *th, struct acpi_data_attr *data_attr)
On 22-03-01 08:00, Lorenzo Pieralisi wrote: > On Mon, Feb 28, 2022 at 11:51:23PM +0000, Henry Willard wrote: >> >> >>> On Apr 1, 2020, at 5:44 AM, James Morse <james.morse@arm.com> wrote: >>> >>> Hello! >>> >>> On 3/20/20 1:19 PM, Mark Rutland wrote: >>>> [adding James and Lorenzo] >>> >>> (but not actually...) >>> >>> >>>> On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: >>>>> From: Colin Ian King <colin.king@canonical.com> >>>>> >>>>> Reading ACPI data on ARM64 at a non-aligned offset from >>>>> /sys/firmware/acpi/tables/data/BERT will cause a splat because >>>>> the data is I/O memory mapped >>> >>> On your platform, on someone else's it may be in memory. >>> >>> Which platform is this on? >>> (I've never seen one generate a BERT!) >> >> I have seen this on several platforms. The latest is an Altra based machine. It shows up in the Linux Test Project: ltp test "read_all -d /sys -q -r 10”. >>> >>> >>>>> and being read with just a memcpy. >>>>> Fix this by introducing an I/O variant of memory_read_from_buffer >>>>> and using I/O memory mapped copies instead. >>> >>>> Just to check, is that correct is it correct to map those tables with >>>> Device attributes in the first place, or should we be mapping the tables >>>> with Normal Cacheable attributes with memremap()? >>>> >>>> If the FW placed those into memory using cacheavble attributes, reading >>>> them using Device attributes could result in stale values, which could >>>> be garbage. >>> >>> Yes. The BERT code should be using arch_apei_get_mem_attribute() to use the >>> correct attributes. See ghes_map() for an example. bert_init() will need to use >>> a version of ioremap() that takes the pgprot_t. >>> >>> Always using ioremap_cache() means you get a cacheable mapping, regardless of >>> how firmware described this region in the UEFI memory map. This doesn't explain >>> why you got an alignment fault. 
>> >> The BERT error region doesn’t appear in the UEFI memory map on any of the systems I have looked at. This means that acpi_os_map_memory() will always map the area as PROT_DEVICE_nGnRnE, which results in an alignment fault on an unaligned access. For some reason this does not fail on some implementations. >> >> It isn’t clear to me from the ACPI spec whether this can be in anything other than normal memory as bert_init() seems to assume it is. >> >> We have used this patch to resolve this problem on the assumption it >> will eventually make it into the mainline kernel. Is there any chance >> this will happen? > > Yes this should be fixed. I put together a patch that I needed to post, > below. > > -- >8 -- > Subject: [PATCH] ACPI: osl: Fix BERT error region memory mapping I hit this read issue while debugging something else and this addressed the problem for me on Ampere Altra and AltraMax systems. Previously I could only read the BERT file using dd with BS of 1-4. With this patch, now I can 'cp' the BERT file. Tested-by: Doug Rady <dcrady@os.amperecomputing.com> > > Currently the sysfs interface maps the BERT error region as "memory" > (through acpi_os_map_memory()) in order to copy the error records into > memory buffers through memory operations (eg memory_read_from_buffer()). > > The OS system cannot detect whether the BERT error region is part of > system RAM or it is "device memory" (eg BMC memory) and therefore it > cannot detect which memory attributes the bus to memory support (and > corresponding kernel mapping, unless firmware provides the required > information). > > The acpi_os_map_memory() arch backend implementation determines the > mapping attributes. On arm64, if the BERT error region is not present in > the EFI memory map, the error region is mapped as device-nGnRnE; this > triggers alignment faults since memcpy unaligned accesses are not > allowed in device-nGnRnE regions. 
> > The ACPI sysfs code cannot therefore map by default the BERT error > region with memory semantics but should use a safer default. > > Change the sysfs code to map the BERT error region as MMIO (through > acpi_os_map_iomem()) and use the memcpy_fromio() interface to read the > error region into the kernel buffer. > > Link: https://lore.kernel.org/linux-arm-kernel/31ffe8fc-f5ee-2858-26c5-0fd8bdd68702@arm.com > Link: https://lore.kernel.org/linux-acpi/CAJZ5v0g+OVbhuUUDrLUCfX_mVqY_e8ubgLTU98=jfjTeb4t+Pw@mail.gmail.com > Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> > Cc: Ard Biesheuvel <ardb@kernel.org> > Cc: Will Deacon <will@kernel.org> > Cc: Hanjun Guo <guohanjun@huawei.com> > Cc: Sudeep Holla <sudeep.holla@arm.com> > Cc: Catalin Marinas <catalin.marinas@arm.com> > Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> > --- > drivers/acpi/sysfs.c | 25 ++++++++++++++++++------- > 1 file changed, 18 insertions(+), 7 deletions(-) > > diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c > index a4b638bea6f1..cc2fe0618178 100644 > --- a/drivers/acpi/sysfs.c > +++ b/drivers/acpi/sysfs.c > @@ -415,19 +415,30 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, > loff_t offset, size_t count) > { > struct acpi_data_attr *data_attr; > - void *base; > - ssize_t rc; > + void __iomem *base; > + ssize_t size; > > data_attr = container_of(bin_attr, struct acpi_data_attr, attr); > + size = data_attr->attr.size; > + > + if (offset < 0) > + return -EINVAL; > + > + if (offset >= size) > + return 0; > > - base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); > + if (count > size - offset) > + count = size - offset; > + > + base = acpi_os_map_iomem(data_attr->addr, size); > if (!base) > return -ENOMEM; > - rc = memory_read_from_buffer(buf, count, &offset, base, > - data_attr->attr.size); > - acpi_os_unmap_memory(base, data_attr->attr.size); > > - return rc; > + memcpy_fromio(buf, base + offset, count); > + > + acpi_os_unmap_iomem(base, 
size); > + > + return count; > } > > static int acpi_bert_data_init(void *th, struct acpi_data_attr *data_attr)
Hi Lorenzo, Just checking the status on this patch? Are you planning to submit for Rafael to pull? It is more important now that the below has gone into 5.18: '[PATCH] ACPI/APEI: Limit printable size of BERT table data' https://lore.kernel.org/lkml/CAJZ5v0gMh2ed+ZWOnd-t_uTrZtm=AUfxOAkAKWT7WQK3=gf+7w@mail.gmail.com/ Thank you! doug On 22-03-07 13:22, doug rady OS wrote: > On 22-03-01 08:00, Lorenzo Pieralisi wrote: >> On Mon, Feb 28, 2022 at 11:51:23PM +0000, Henry Willard wrote: >>> >>> >>>> On Apr 1, 2020, at 5:44 AM, James Morse <james.morse@arm.com> wrote: >>>> >>>> Hello! >>>> >>>> On 3/20/20 1:19 PM, Mark Rutland wrote: >>>>> [adding James and Lorenzo] >>>> >>>> (but not actually...) >>>> >>>> >>>>> On Tue, Mar 17, 2020 at 04:54:09PM +0000, Colin King wrote: >>>>>> From: Colin Ian King <colin.king@canonical.com> >>>>>> >>>>>> Reading ACPI data on ARM64 at a non-aligned offset from >>>>>> /sys/firmware/acpi/tables/data/BERT will cause a splat because >>>>>> the data is I/O memory mapped >>>> >>>> On your platform, on someone else's it may be in memory. >>>> >>>> Which platform is this on? >>>> (I've never seen one generate a BERT!) >>> >>> I have seen this on several platforms. The latest is an Altra based machine. It shows up in the Linux Test Project: ltp test "read_all -d /sys -q -r 10”. >>>> >>>> >>>>>> and being read with just a memcpy. >>>>>> Fix this by introducing an I/O variant of memory_read_from_buffer >>>>>> and using I/O memory mapped copies instead. >>>> >>>>> Just to check, is that correct is it correct to map those tables with >>>>> Device attributes in the first place, or should we be mapping the tables >>>>> with Normal Cacheable attributes with memremap()? >>>>> >>>>> If the FW placed those into memory using cacheavble attributes, reading >>>>> them using Device attributes could result in stale values, which could >>>>> be garbage. >>>> >>>> Yes. The BERT code should be using arch_apei_get_mem_attribute() to use the >>>> correct attributes. 
See ghes_map() for an example. bert_init() will need to use >>>> a version of ioremap() that takes the pgprot_t. >>>> >>>> Always using ioremap_cache() means you get a cacheable mapping, regardless of >>>> how firmware described this region in the UEFI memory map. This doesn't explain >>>> why you got an alignment fault. >>> >>> The BERT error region doesn’t appear in the UEFI memory map on any of the systems I have looked at. This means that acpi_os_map_memory() will always map the area as PROT_DEVICE_nGnRnE, which results in an alignment fault on an unaligned access. For some reason this does not fail on some implementations. >>> >>> It isn’t clear to me from the ACPI spec whether this can be in anything other than normal memory as bert_init() seems to assume it is. >>> >>> We have used this patch to resolve this problem on the assumption it >>> will eventually make it into the mainline kernel. Is there any chance >>> this will happen? >> >> Yes this should be fixed. I put together a patch that I needed to post, >> below. >> >> -- >8 -- >> Subject: [PATCH] ACPI: osl: Fix BERT error region memory mapping > > I hit this read issue while debugging something else and this addressed > the problem for me on Ampere Altra and AltraMax systems. Previously I > could only read the BERT file using dd with BS of 1-4. > With this patch, now I can 'cp' the BERT file. > > Tested-by: Doug Rady <dcrady@os.amperecomputing.com> > > >> >> Currently the sysfs interface maps the BERT error region as "memory" >> (through acpi_os_map_memory()) in order to copy the error records into >> memory buffers through memory operations (eg memory_read_from_buffer()). >> >> The OS system cannot detect whether the BERT error region is part of >> system RAM or it is "device memory" (eg BMC memory) and therefore it >> cannot detect which memory attributes the bus to memory support (and >> corresponding kernel mapping, unless firmware provides the required >> information). 
>> >> The acpi_os_map_memory() arch backend implementation determines the >> mapping attributes. On arm64, if the BERT error region is not present in >> the EFI memory map, the error region is mapped as device-nGnRnE; this >> triggers alignment faults since memcpy unaligned accesses are not >> allowed in device-nGnRnE regions. >> >> The ACPI sysfs code cannot therefore map by default the BERT error >> region with memory semantics but should use a safer default. >> >> Change the sysfs code to map the BERT error region as MMIO (through >> acpi_os_map_iomem()) and use the memcpy_fromio() interface to read the >> error region into the kernel buffer. >> >> Link: https://lore.kernel.org/linux-arm-kernel/31ffe8fc-f5ee-2858-26c5-0fd8bdd68702@arm.com >> Link: https://lore.kernel.org/linux-acpi/CAJZ5v0g+OVbhuUUDrLUCfX_mVqY_e8ubgLTU98=jfjTeb4t+Pw@mail.gmail.com >> Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> >> Cc: Ard Biesheuvel <ardb@kernel.org> >> Cc: Will Deacon <will@kernel.org> >> Cc: Hanjun Guo <guohanjun@huawei.com> >> Cc: Sudeep Holla <sudeep.holla@arm.com> >> Cc: Catalin Marinas <catalin.marinas@arm.com> >> Cc: "Rafael J. 
Wysocki" <rjw@rjwysocki.net> >> --- >> drivers/acpi/sysfs.c | 25 ++++++++++++++++++------- >> 1 file changed, 18 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c >> index a4b638bea6f1..cc2fe0618178 100644 >> --- a/drivers/acpi/sysfs.c >> +++ b/drivers/acpi/sysfs.c >> @@ -415,19 +415,30 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, >> loff_t offset, size_t count) >> { >> struct acpi_data_attr *data_attr; >> - void *base; >> - ssize_t rc; >> + void __iomem *base; >> + ssize_t size; >> >> data_attr = container_of(bin_attr, struct acpi_data_attr, attr); >> + size = data_attr->attr.size; >> + >> + if (offset < 0) >> + return -EINVAL; >> + >> + if (offset >= size) >> + return 0; >> >> - base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); >> + if (count > size - offset) >> + count = size - offset; >> + >> + base = acpi_os_map_iomem(data_attr->addr, size); >> if (!base) >> return -ENOMEM; >> - rc = memory_read_from_buffer(buf, count, &offset, base, >> - data_attr->attr.size); >> - acpi_os_unmap_memory(base, data_attr->attr.size); >> >> - return rc; >> + memcpy_fromio(buf, base + offset, count); >> + >> + acpi_os_unmap_iomem(base, size); >> + >> + return count; >> } >> >> static int acpi_bert_data_init(void *th, struct acpi_data_attr *data_attr) >
diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index c60d2c6..fb9e216 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -446,7 +446,7 @@ static ssize_t acpi_data_show(struct file *filp, struct kobject *kobj, base = acpi_os_map_memory(data_attr->addr, data_attr->attr.size); if (!base) return -ENOMEM; - rc = memory_read_from_buffer(buf, count, &offset, base, + rc = memory_read_from_io_buffer(buf, count, &offset, base, data_attr->attr.size); acpi_os_unmap_memory(base, data_attr->attr.size); diff --git a/fs/libfs.c b/fs/libfs.c index c686bd9..1a49da1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,6 +20,7 @@ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> +#include <linux/io.h> #include <linux/uaccess.h> @@ -800,6 +801,39 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, } EXPORT_SYMBOL(memory_read_from_buffer); +/** + * memory_read_from_io_buffer - copy data from a io memory mapped buffer + * @to: the kernel space buffer to read to + * @count: the maximum number of bytes to read + * @ppos: the current position in the buffer + * @from: the buffer to read from + * @available: the size of the buffer + * + * The memory_read_from_buffer() function reads up to @count bytes from the + * io memory mappy buffer @from at offset @ppos into the kernel space address + * starting at @to. + * + * On success, the number of bytes read is returned and the offset @ppos is + * advanced by this number, or negative value is returned on error. + **/ +ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available) +{ + loff_t pos = *ppos; + + if (pos < 0) + return -EINVAL; + if (pos >= available) + return 0; + if (count > available - pos) + count = available - pos; + memcpy_fromio(to, from + pos, count); + *ppos = pos + count; + + return count; +} +EXPORT_SYMBOL(memory_read_from_io_buffer); + /* * Transaction based IO. 
* The file expects a single write which triggers the transaction, and then diff --git a/include/linux/string.h b/include/linux/string.h index 6dfbb2e..0c6ec2a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +extern ssize_t memory_read_from_io_buffer(void *to, size_t count, loff_t *ppos, + const void *from, size_t available); int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);