diff mbox series

remoteproc: imx_dsp_rproc: add custom memory copy implementation for i.MX DSP Cores

Message ID 20230125110100.14647-1-iuliana.prodan@oss.nxp.com (mailing list archive)
State Superseded
Headers show
Series remoteproc: imx_dsp_rproc: add custom memory copy implementation for i.MX DSP Cores | expand

Commit Message

Iuliana Prodan (OSS) Jan. 25, 2023, 11:01 a.m. UTC
From: Iuliana Prodan <iuliana.prodan@nxp.com>

The IRAM is part of the HiFi DSP.
According to hardware specification only 32-bits write are allowed
otherwise we get a Kernel panic.

Therefore add a custom memory copy function to deal with the
above restriction.

Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
---
 drivers/remoteproc/imx_dsp_rproc.c | 122 ++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 1 deletion(-)

Comments

Mathieu Poirier Jan. 26, 2023, 10:49 p.m. UTC | #1
On Wed, Jan 25, 2023 at 01:01:00PM +0200, Iuliana Prodan (OSS) wrote:
> From: Iuliana Prodan <iuliana.prodan@nxp.com>
> 
> The IRAM is part of the HiFi DSP.
> According to hardware specification only 32-bits write are allowed
> otherwise we get a Kernel panic.
> 
> Therefore add a custom memory copy function to deal with the
> above restriction.
> 
> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
> ---
>  drivers/remoteproc/imx_dsp_rproc.c | 122 ++++++++++++++++++++++++++++-
>  1 file changed, 121 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c
> index 95da1cbefacf..a9991d085494 100644
> --- a/drivers/remoteproc/imx_dsp_rproc.c
> +++ b/drivers/remoteproc/imx_dsp_rproc.c
> @@ -715,6 +715,126 @@ static void imx_dsp_rproc_kick(struct rproc *rproc, int vqid)
>  		dev_err(dev, "%s: failed (%d, err:%d)\n", __func__, vqid, err);
>  }
>  
> +/*
> + * Custom memory copy implementation for i.MX DSP Cores
> + *
> + * The IRAM is part of the HiFi DSP.
> + * According to hw specs only 32-bits writes are allowed.
> + */
> +static int imx_dsp_rproc_memcpy(void *dest, const void *src, size_t size)
> +{
> +	const u8 *src_byte = src;
> +	u32 affected_mask;
> +	u32 tmp;
> +	int q, r;
> +
> +	q = size / 4;
> +	r = size % 4;
> +
> +	/* __iowrite32_copy use 32bit size values so divide by 4 */
> +	__iowrite32_copy(dest, src, q);

The current driver for imx_dsp_rproc does not provide a rproc_da_to_va()
operation, meaning that @is_iomem in rproc_elf_load_segments() can't be true,
forcing a memcpy() operation to be used.  And yet above an _iowrite32_copy() is
used...  

In the conversation that came out of[1], Daniel Baluta mentions that
read/writes should be done in multiples of 32/64 bit but here a blanket 32 bit
enforcement is done.

> +
> +	if (r) {
> +		affected_mask = (1 << (8 * r)) - 1;
> +
> +		/* first read the 32bit data of dest, then change affected
> +		 * bytes, and write back to dest.
> +		 * For unaffected bytes, it should not be changed
> +		 */
> +		tmp = ioread32(dest + q * 4);
> +		tmp &= ~affected_mask;
> +
> +		tmp |= *(u32 *)(src_byte + q * 4) & affected_mask;
> +		iowrite32(tmp, dest + q * 4);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * imx_dsp_rproc_elf_load_segments() - load firmware segments to memory
> + * @rproc: remote processor which will be booted using these fw segments
> + * @fw: the ELF firmware image
> + *
> + * This function loads the firmware segments to memory, where the remote
> + * processor expects them.
> + *
> + * Return: 0 on success and an appropriate error code otherwise
> + */
> +static int imx_dsp_rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
> +{
> +	struct device *dev = &rproc->dev;
> +	const void *ehdr, *phdr;
> +	int i, ret = 0;
> +	u16 phnum;
> +	const u8 *elf_data = fw->data;
> +	u8 class = fw_elf_get_class(fw);
> +	u32 elf_phdr_get_size = elf_size_of_phdr(class);
> +
> +	ehdr = elf_data;
> +	phnum = elf_hdr_get_e_phnum(class, ehdr);
> +	phdr = elf_data + elf_hdr_get_e_phoff(class, ehdr);
> +
> +	/* go through the available ELF segments */
> +	for (i = 0; i < phnum; i++, phdr += elf_phdr_get_size) {
> +		u64 da = elf_phdr_get_p_paddr(class, phdr);
> +		u64 memsz = elf_phdr_get_p_memsz(class, phdr);
> +		u64 filesz = elf_phdr_get_p_filesz(class, phdr);
> +		u64 offset = elf_phdr_get_p_offset(class, phdr);
> +		u32 type = elf_phdr_get_p_type(class, phdr);
> +		bool is_iomem = false;
> +		void *ptr;
> +
> +		if (type != PT_LOAD || !memsz)
> +			continue;
> +
> +		dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n",
> +			type, da, memsz, filesz);
> +
> +		if (filesz > memsz) {
> +			dev_err(dev, "bad phdr filesz 0x%llx memsz 0x%llx\n",
> +				filesz, memsz);
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		if (offset + filesz > fw->size) {
> +			dev_err(dev, "truncated fw: need 0x%llx avail 0x%zx\n",
> +				offset + filesz, fw->size);
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		if (!rproc_u64_fit_in_size_t(memsz)) {
> +			dev_err(dev, "size (%llx) does not fit in size_t type\n",
> +				memsz);
> +			ret = -EOVERFLOW;
> +			break;
> +		}
> +
> +		/* grab the kernel address for this device address */
> +		ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem);
> +		if (!ptr) {
> +			dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da,
> +				memsz);
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		/* put the segment where the remote processor expects it */
> +		if (filesz) {
> +			ret = imx_dsp_rproc_memcpy(ptr, elf_data + offset, filesz);
> +			if (ret) {
> +				dev_err(dev, "memory copy failed for da 0x%llx memsz 0x%llx\n",
> +					da, memsz);
> +				break;
> +			}
> +		}

This patchset from last year[1] goes to great length to avoid using a driver
specific function and now you are trying to bring that back...  So how was it
working before and why are things broken now?  Moreover, function
rproc_elf_load_segments() deals with situations where the memory slot is bigger
than the file size[2], which is omitted here.

Thanks,
Mathieu

[1]. https://lore.kernel.org/linux-arm-kernel/20220323064944.1351923-1-peng.fan@oss.nxp.com/
[2]. https://elixir.bootlin.com/linux/v6.2-rc5/source/drivers/remoteproc/remoteproc_elf_loader.c#L221 

> +	}
> +
> +	return ret;
> +}
> +
>  static int imx_dsp_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
>  {
>  	if (rproc_elf_load_rsc_table(rproc, fw))
> @@ -729,7 +849,7 @@ static const struct rproc_ops imx_dsp_rproc_ops = {
>  	.start		= imx_dsp_rproc_start,
>  	.stop		= imx_dsp_rproc_stop,
>  	.kick		= imx_dsp_rproc_kick,
> -	.load		= rproc_elf_load_segments,
> +	.load		= imx_dsp_rproc_elf_load_segments,
>  	.parse_fw	= imx_dsp_rproc_parse_fw,
>  	.sanity_check	= rproc_elf_sanity_check,
>  	.get_boot_addr	= rproc_elf_get_boot_addr,
> -- 
> 2.17.1
>
Iuliana Prodan Jan. 27, 2023, 12:16 a.m. UTC | #2
Hi Mathieu,

On 1/27/2023 12:49 AM, Mathieu Poirier wrote:
> On Wed, Jan 25, 2023 at 01:01:00PM +0200, Iuliana Prodan (OSS) wrote:
>> From: Iuliana Prodan <iuliana.prodan@nxp.com>
>>
>> The IRAM is part of the HiFi DSP.
>> According to hardware specification only 32-bits write are allowed
>> otherwise we get a Kernel panic.
>>
>> Therefore add a custom memory copy function to deal with the
>> above restriction.
>>
>> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
>> ---
>>   drivers/remoteproc/imx_dsp_rproc.c | 122 ++++++++++++++++++++++++++++-
>>   1 file changed, 121 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c
>> index 95da1cbefacf..a9991d085494 100644
>> --- a/drivers/remoteproc/imx_dsp_rproc.c
>> +++ b/drivers/remoteproc/imx_dsp_rproc.c
>> @@ -715,6 +715,126 @@ static void imx_dsp_rproc_kick(struct rproc *rproc, int vqid)
>>   		dev_err(dev, "%s: failed (%d, err:%d)\n", __func__, vqid, err);
>>   }
>>   
>> +/*
>> + * Custom memory copy implementation for i.MX DSP Cores
>> + *
>> + * The IRAM is part of the HiFi DSP.
>> + * According to hw specs only 32-bits writes are allowed.
>> + */
>> +static int imx_dsp_rproc_memcpy(void *dest, const void *src, size_t size)
>> +{
>> +	const u8 *src_byte = src;
>> +	u32 affected_mask;
>> +	u32 tmp;
>> +	int q, r;
>> +
>> +	q = size / 4;
>> +	r = size % 4;
>> +
>> +	/* __iowrite32_copy use 32bit size values so divide by 4 */
>> +	__iowrite32_copy(dest, src, q);
> The current driver for imx_dsp_rproc does not provide a rproc_da_to_va()
> operation, meaning that @is_iomem in rproc_elf_load_segments() can't be true,
> forcing a memcpy() operation to be used.  And yet above an _iowrite32_copy() is
> used...

Yes, with rproc_elf_load_segments() we go through memcpy() because 
io_mem is always false (I already have a patch to get rid of this flag 
from imx_dsp_rproc since is not used), but memcpy() vs 
__iowrite32_copy() is crashing on sizes that are not multiple of 32bit.

That's why I added the __iowrite32_copy() and above this, I'm dividing 
the size by 4.

Also,  in imx_dsp_rproc, we shouldn't use memcpy, it should be 
memcpy_toio because all addresses are ioremap - see 
https://elixir.bootlin.com/linux/v6.2-rc5/source/drivers/remoteproc/imx_dsp_rproc.c#L601

> In the conversation that came out of[1], Daniel Baluta mentions that
> read/writes should be done in multiples of 32/64 bit but here a blanket 32 bit
> enforcement is done.

I've checked deeper the documentation and talked to our hardware team, 
and for NXP's DSPs we have a write restriction of 32bit.

>> +
>> +	if (r) {
>> +		affected_mask = (1 << (8 * r)) - 1;
>> +
>> +		/* first read the 32bit data of dest, then change affected
>> +		 * bytes, and write back to dest.
>> +		 * For unaffected bytes, it should not be changed
>> +		 */
>> +		tmp = ioread32(dest + q * 4);
>> +		tmp &= ~affected_mask;
>> +
>> +		tmp |= *(u32 *)(src_byte + q * 4) & affected_mask;
>> +		iowrite32(tmp, dest + q * 4);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * imx_dsp_rproc_elf_load_segments() - load firmware segments to memory
>> + * @rproc: remote processor which will be booted using these fw segments
>> + * @fw: the ELF firmware image
>> + *
>> + * This function loads the firmware segments to memory, where the remote
>> + * processor expects them.
>> + *
>> + * Return: 0 on success and an appropriate error code otherwise
>> + */
>> +static int imx_dsp_rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
>> +{
>> +	struct device *dev = &rproc->dev;
>> +	const void *ehdr, *phdr;
>> +	int i, ret = 0;
>> +	u16 phnum;
>> +	const u8 *elf_data = fw->data;
>> +	u8 class = fw_elf_get_class(fw);
>> +	u32 elf_phdr_get_size = elf_size_of_phdr(class);
>> +
>> +	ehdr = elf_data;
>> +	phnum = elf_hdr_get_e_phnum(class, ehdr);
>> +	phdr = elf_data + elf_hdr_get_e_phoff(class, ehdr);
>> +
>> +	/* go through the available ELF segments */
>> +	for (i = 0; i < phnum; i++, phdr += elf_phdr_get_size) {
>> +		u64 da = elf_phdr_get_p_paddr(class, phdr);
>> +		u64 memsz = elf_phdr_get_p_memsz(class, phdr);
>> +		u64 filesz = elf_phdr_get_p_filesz(class, phdr);
>> +		u64 offset = elf_phdr_get_p_offset(class, phdr);
>> +		u32 type = elf_phdr_get_p_type(class, phdr);
>> +		bool is_iomem = false;
>> +		void *ptr;
>> +
>> +		if (type != PT_LOAD || !memsz)
>> +			continue;
>> +
>> +		dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n",
>> +			type, da, memsz, filesz);
>> +
>> +		if (filesz > memsz) {
>> +			dev_err(dev, "bad phdr filesz 0x%llx memsz 0x%llx\n",
>> +				filesz, memsz);
>> +			ret = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		if (offset + filesz > fw->size) {
>> +			dev_err(dev, "truncated fw: need 0x%llx avail 0x%zx\n",
>> +				offset + filesz, fw->size);
>> +			ret = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		if (!rproc_u64_fit_in_size_t(memsz)) {
>> +			dev_err(dev, "size (%llx) does not fit in size_t type\n",
>> +				memsz);
>> +			ret = -EOVERFLOW;
>> +			break;
>> +		}
>> +
>> +		/* grab the kernel address for this device address */
>> +		ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem);
>> +		if (!ptr) {
>> +			dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da,
>> +				memsz);
>> +			ret = -EINVAL;
>> +			break;
>> +		}
>> +
>> +		/* put the segment where the remote processor expects it */
>> +		if (filesz) {
>> +			ret = imx_dsp_rproc_memcpy(ptr, elf_data + offset, filesz);
>> +			if (ret) {
>> +				dev_err(dev, "memory copy failed for da 0x%llx memsz 0x%llx\n",
>> +					da, memsz);
>> +				break;
>> +			}
>> +		}
> This patchset from last year[1] goes to great length to avoid using a driver
> specific function and now you are trying to bring that back...  So how was it
> working before and why are things broken now?

Until now, it was used in a limited scenario and the firmware was 
correctly built to respect the write restriction - having the IRAM 
sections size a multiple of 4bytes.

Now, I was trying a simple hello_world sample from Zephyr, complied with 
gcc and I crashed the Kernel trying to load it on the hifi4 DSP:

[ 2707.135094] SError Interrupt on CPU0, code 0x00000000bf000002 -- SError
[ 2707.135104] CPU: 0 PID: 665 Comm: sh Tainted: G C         
6.1.0-rc6-04789-gc80e5155d190 #135
[ 2707.135112] Hardware name: Freescale i.MX8QM MEK (DT)
[ 2707.135115] pstate: 400000c5 (nZcv daIF -PAN -UAO -TCO -DIT -SSBS 
BTYPE=--)
[ 2707.135123] pc : vprintk_store+0x3c0/0x460
[ 2707.135141] lr : vprintk_store+0x48/0x460
[ 2707.135149] sp : ffff80000b31b8c0
[ 2707.135152] x29: ffff80000b31b8c0 x28: 0000000000000018 x27: 
0000000000000000
[ 2707.135164] x26: 00000000596f8000 x25: ffff000810c15038 x24: 
0000000000000000
[ 2707.135173] x23: ffff8000098dfaf8 x22: 0000000000000000 x21: 
0000000000000000
[ 2707.135181] x20: ffff80000b31ba30 x19: ffff800009abca31 x18: 
ffffffffffffffff
[ 2707.135190] x17: 6620313231783020 x16: 7a736d656d203030 x15: 
ffff80008b31b867
[ 2707.135199] x14: 0000000000000000 x13: ffff800009d22428 x12: 
000000000000083a
[ 2707.135208] x11: 00000000000002be x10: 0000000000000a40 x9 : 
ffff80000b31bb70
[ 2707.135216] x8 : ffff80000b31bb70 x7 : 00000000ffffffc8 x6 : 
ffff80000b31bb30
[ 2707.135224] x5 : ffff00081a098000 x4 : ffff80000b31ba30 x3 : 
ffff8000098dfaf8
[ 2707.135233] x2 : ffff80000990c000 x1 : 0000000000000000 x0 : 
ffff8008eface000
[ 2707.135243] Kernel panic - not syncing: Asynchronous SError Interrupt
[ 2707.135248] CPU: 0 PID: 665 Comm: sh Tainted: G C         
6.1.0-rc6-04789-gc80e5155d190 #135
[ 2707.135254] Hardware name: Freescale i.MX8QM MEK (DT)
[ 2707.135258] Call trace:
[ 2707.135261]  dump_backtrace.part.0+0xdc/0xf0
[ 2707.135275]  show_stack+0x18/0x40
[ 2707.135284]  dump_stack_lvl+0x68/0x84
[ 2707.135295]  dump_stack+0x18/0x34
[ 2707.135302]  panic+0x184/0x344
[ 2707.135311]  nmi_panic+0xac/0xb0
[ 2707.135319]  arm64_serror_panic+0x6c/0x7c
[ 2707.135324]  do_serror+0x58/0x5c
[ 2707.135329]  el1h_64_error_handler+0x30/0x4c
[ 2707.135339]  el1h_64_error+0x64/0x68
[ 2707.135344]  vprintk_store+0x3c0/0x460
[ 2707.135353]  vprintk_emit+0x104/0x294
[ 2707.135360]  vprintk_default+0x38/0x4c
[ 2707.135369]  vprintk+0xc0/0xe4
[ 2707.135376]  _printk+0x5c/0x84
[ 2707.135383]  rproc_elf_load_segments+0x228/0x308
[ 2707.135391]  rproc_start+0x50/0x1c8
[ 2707.135398]  rproc_boot+0x494/0x574
[ 2707.135404]  state_store+0x44/0x110
[ 2707.135413]  dev_attr_store+0x18/0x30
[ 2707.135422]  sysfs_kf_write+0x44/0x54
[ 2707.135433]  kernfs_fop_write_iter+0x118/0x1b0
[ 2707.135441]  vfs_write+0x220/0x2b0
[ 2707.135449]  ksys_write+0x68/0xf4
[ 2707.135455]  __arm64_sys_write+0x1c/0x2c
[ 2707.135461]  invoke_syscall+0x48/0x114
[ 2707.135470]  el0_svc_common.constprop.0+0xd4/0xfc
[ 2707.135477]  do_el0_svc+0x30/0xd0
[ 2707.135484]  el0_svc+0x2c/0x84
[ 2707.135492]  el0t_64_sync_handler+0xbc/0x140
[ 2707.135501]  el0t_64_sync+0x18c/0x190
[ 2707.135510] SMP: stopping secondary CPUs
[ 2707.135520] Kernel Offset: disabled
[ 2707.135522] CPU features: 0x20000,2082c084,0000421b
[ 2707.135527] Memory Limit: none
[ 2707.397688] ---[ end Kernel panic - not syncing: Asynchronous SError 
Interrupt ]---

>   Moreover, function
> rproc_elf_load_segments() deals with situations where the memory slot is bigger
> than the file size[2], which is omitted here.

I'll add this part in v2.

Thanks,

Iulia

> Thanks,
> Mathieu
>
> [1]. https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flinux-arm-kernel%2F20220323064944.1351923-1-peng.fan%40oss.nxp.com%2F&data=05%7C01%7Ciuliana.prodan%40nxp.com%7Cd946b7c87aaf4016c69908daffef8a64%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C638103701532520716%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=GNJhJnoiRWfAzvseUsAdwHRtY2w2mA816CKkTL%2F2czw%3D&reserved=0
> [2]. https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Felixir.bootlin.com%2Flinux%2Fv6.2-rc5%2Fsource%2Fdrivers%2Fremoteproc%2Fremoteproc_elf_loader.c%23L221&data=05%7C01%7Ciuliana.prodan%40nxp.com%7Cd946b7c87aaf4016c69908daffef8a64%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C638103701532520716%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=5P5FcuMb6H%2B5S4L2Il4BBAxOOIrr6Er5thNvhADhKdc%3D&reserved=0
>
Daniel Baluta Jan. 27, 2023, 2:12 p.m. UTC | #3
<snip>

>
> This patchset from last year[1] goes to great length to avoid using a driver
> specific function and now you are trying to bring that back...  So how was it
> working before and why are things broken now?  Moreover, function
> rproc_elf_load_segments() deals with situations where the memory slot is bigger
> than the file size[2], which is omitted here.

As I mentioned in [1] the problem was and it is still there if an elf
section has a size which
is not a multiple of 4.

And we do notice this when compiling Sound Open Firmware and loading
it via rproc interface.

This is a hardware limitation! And we need to address it. The
limitation happens for IRAM address range
of HIFI4 DSP integration with I.MX.

Some call it a limitation, others call it an optimization. Either way
we need to avoid crashing the kernel.

We have two options:

1) Fully address it in rproc driver (like the present patch).
2) Partially address it in rproc driver (by returning an error for
section with wrong sizes) and then
making all linker scripts to force elf sections to be a multiple of 4.

I would go for option 1) even if we go back to use our custom `load` function.
diff mbox series

Patch

diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c
index 95da1cbefacf..a9991d085494 100644
--- a/drivers/remoteproc/imx_dsp_rproc.c
+++ b/drivers/remoteproc/imx_dsp_rproc.c
@@ -715,6 +715,126 @@  static void imx_dsp_rproc_kick(struct rproc *rproc, int vqid)
 		dev_err(dev, "%s: failed (%d, err:%d)\n", __func__, vqid, err);
 }
 
+/*
+ * Custom memory copy implementation for i.MX DSP Cores
+ *
+ * The IRAM is part of the HiFi DSP.
+ * According to hw specs only 32-bits writes are allowed.
+ */
+static int imx_dsp_rproc_memcpy(void *dest, const void *src, size_t size)
+{
+	const u8 *src_byte = src;
+	u32 affected_mask;
+	u32 tmp;
+	int q, r;
+
+	q = size / 4;
+	r = size % 4;
+
+	/* __iowrite32_copy use 32bit size values so divide by 4 */
+	__iowrite32_copy(dest, src, q);
+
+	if (r) {
+		affected_mask = (1 << (8 * r)) - 1;
+
+		/* first read the 32bit data of dest, then change affected
+		 * bytes, and write back to dest.
+		 * For unaffected bytes, it should not be changed
+		 */
+		tmp = ioread32(dest + q * 4);
+		tmp &= ~affected_mask;
+
+		tmp |= *(u32 *)(src_byte + q * 4) & affected_mask;
+		iowrite32(tmp, dest + q * 4);
+	}
+
+	return 0;
+}
+
+/**
+ * imx_dsp_rproc_elf_load_segments() - load firmware segments to memory
+ * @rproc: remote processor which will be booted using these fw segments
+ * @fw: the ELF firmware image
+ *
+ * This function loads the firmware segments to memory, where the remote
+ * processor expects them.
+ *
+ * Return: 0 on success and an appropriate error code otherwise
+ */
+static int imx_dsp_rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw)
+{
+	struct device *dev = &rproc->dev;
+	const void *ehdr, *phdr;
+	int i, ret = 0;
+	u16 phnum;
+	const u8 *elf_data = fw->data;
+	u8 class = fw_elf_get_class(fw);
+	u32 elf_phdr_get_size = elf_size_of_phdr(class);
+
+	ehdr = elf_data;
+	phnum = elf_hdr_get_e_phnum(class, ehdr);
+	phdr = elf_data + elf_hdr_get_e_phoff(class, ehdr);
+
+	/* go through the available ELF segments */
+	for (i = 0; i < phnum; i++, phdr += elf_phdr_get_size) {
+		u64 da = elf_phdr_get_p_paddr(class, phdr);
+		u64 memsz = elf_phdr_get_p_memsz(class, phdr);
+		u64 filesz = elf_phdr_get_p_filesz(class, phdr);
+		u64 offset = elf_phdr_get_p_offset(class, phdr);
+		u32 type = elf_phdr_get_p_type(class, phdr);
+		bool is_iomem = false;
+		void *ptr;
+
+		if (type != PT_LOAD || !memsz)
+			continue;
+
+		dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n",
+			type, da, memsz, filesz);
+
+		if (filesz > memsz) {
+			dev_err(dev, "bad phdr filesz 0x%llx memsz 0x%llx\n",
+				filesz, memsz);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (offset + filesz > fw->size) {
+			dev_err(dev, "truncated fw: need 0x%llx avail 0x%zx\n",
+				offset + filesz, fw->size);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (!rproc_u64_fit_in_size_t(memsz)) {
+			dev_err(dev, "size (%llx) does not fit in size_t type\n",
+				memsz);
+			ret = -EOVERFLOW;
+			break;
+		}
+
+		/* grab the kernel address for this device address */
+		ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem);
+		if (!ptr) {
+			dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da,
+				memsz);
+			ret = -EINVAL;
+			break;
+		}
+
+		/* put the segment where the remote processor expects it */
+		if (filesz) {
+			ret = imx_dsp_rproc_memcpy(ptr, elf_data + offset, filesz);
+			if (ret) {
+				dev_err(dev, "memory copy failed for da 0x%llx memsz 0x%llx\n",
+					da, memsz);
+				break;
+			}
+		}
+	}
+
+	return ret;
+}
+
 static int imx_dsp_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw)
 {
 	if (rproc_elf_load_rsc_table(rproc, fw))
@@ -729,7 +849,7 @@  static const struct rproc_ops imx_dsp_rproc_ops = {
 	.start		= imx_dsp_rproc_start,
 	.stop		= imx_dsp_rproc_stop,
 	.kick		= imx_dsp_rproc_kick,
-	.load		= rproc_elf_load_segments,
+	.load		= imx_dsp_rproc_elf_load_segments,
 	.parse_fw	= imx_dsp_rproc_parse_fw,
 	.sanity_check	= rproc_elf_sanity_check,
 	.get_boot_addr	= rproc_elf_get_boot_addr,