Message ID | 8e61aed5f64e434abc1d7b6f81859c8a@realtek.com (mailing list archive)
---|---
State | New, archived
Series | [v3] mmc: rtsx: improve performance for multi block rw
On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote: > > Improving performance for the CMD is multi-block read/write > and the data is sequential. > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) > or normal RW (CMD 17/24) if the CMD is multi-block and the data is > sequential then call to sd_rw_multi_seq() > > This patch mainly to control the timing of reply at CMD 12/13. > Originally code driver reply CMD 12/13 at every RW (CMD 18/25). > The new code to distinguish multi-block RW(CMD 18/25) and the data is > sequential or not, if the data is sequential RW driver do not send CMD 12 > and bypass CMD 13 until wait the different direction RW CMD > or trigger the delay_work to sent CMD 12. > > run benchmark result as below: > SD Card : Samsumg Pro Plus 128GB > Number of Samples:100, Sample Size:10MB > <Before> Read : 86.9 MB/s, Write : 38.3 MB/s > <After> Read : 91.5 MB/s, Write : 55.5 MB/s A much nicer commit message, thanks a lot! Would you mind running some additional tests, like random I/O read/writes? Also, please specify the benchmark tool and command you are using. In the meantime, I will continue to look at the code. Kind regards Uffe > > Signed-off-by: Ricky Wu <ricky_wu@realtek.com> > --- > v2: > make commit message more clarity > change function name for more clarity > v3: > add more commit message and benchmark result > --- > drivers/mmc/host/rtsx_pci_sdmmc.c | 185 +++++++++++++++++++++++++++++- > 1 file changed, 180 insertions(+), 5 deletions(-) > > diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c > index 58cfaffa3c2d..ee2b0eec6422 100644 > --- a/drivers/mmc/host/rtsx_pci_sdmmc.c > +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c > @@ -22,6 +22,8 @@ > #include <asm/unaligned.h> > #include <linux/pm_runtime.h> > > +enum RW_MODE {NORMAL_RW, SEQ_RW}; > + > struct realtek_pci_sdmmc { > struct platform_device *pdev; > struct rtsx_pcr *pcr; > @@ -31,6 +33,7 @@ struct realtek_pci_sdmmc { > > struct work_struct work; > struct mutex host_mutex; > + struct delayed_work rw_idle_work; > > u8 ssc_depth; > unsigned int clock; > @@ -46,6 +49,12 @@ struct realtek_pci_sdmmc { > s32 cookie; > int cookie_sg_count; > bool using_cookie; > + > + enum RW_MODE rw_mode; > + u8 prev_dir; > + u8 cur_dir; > + u64 prev_sec_addr; > + u32 prev_sec_cnt; > }; > > static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios); > @@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host, > dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg = 0x%08x\n", > __func__, cmd_idx, arg); > > + if (cmd_idx == MMC_SEND_STATUS && host->rw_mode == SEQ_RW) { > + cmd->resp[0] = R1_READY_FOR_DATA | (R1_STATE_TRAN << 9); > + goto out; > + } > + > + if (!mmc_op_multi(cmd->opcode)) > + host->rw_mode = NORMAL_RW; > + > rsp_type = sd_response_type(cmd); > if (rsp_type < 0) > goto out; > @@ -542,6 +559,93 @@ static int sd_write_long_data(struct realtek_pci_sdmmc *host, > return 0; > } > > +static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq) > +{ > + struct rtsx_pcr *pcr = host->pcr; > + struct mmc_host *mmc = host->mmc; > + struct mmc_card *card = mmc->card; > + struct mmc_data *data = mrq->data; > + int uhs = mmc_card_uhs(card); > + u8 cfg2; > + int err; > + size_t data_len = data->blksz * data->blocks; > + > + cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 | > + SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 | SD_RSP_LEN_0; > + > + if (!uhs) > + cfg2 |= SD_NO_CHECK_WAIT_CRC_TO; > + > + rtsx_pci_init_cmd(pcr); > + 
sd_cmd_set_data_len(pcr, data->blocks, data->blksz); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0, > + DMA_DONE_INT, DMA_DONE_INT); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3, > + 0xFF, (u8)(data_len >> 24)); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2, > + 0xFF, (u8)(data_len >> 16)); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1, > + 0xFF, (u8)(data_len >> 8)); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF, (u8)data_len); > + > + if (host->cur_dir == DMA_DIR_FROM_CARD) > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL, > + 0x03 | DMA_PACK_SIZE_MASK, > + DMA_DIR_FROM_CARD | DMA_EN | DMA_512); > + else > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL, > + 0x03 | DMA_PACK_SIZE_MASK, > + DMA_DIR_TO_CARD | DMA_EN | DMA_512); > + > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE, > + 0x01, RING_BUFFER); > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2); > + > + if (host->cur_dir == DMA_DIR_FROM_CARD) > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF, > + SD_TRANSFER_START | SD_TM_AUTO_READ_3); > + else > + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF, > + SD_TRANSFER_START | SD_TM_AUTO_WRITE_3); > + > + rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER, > + SD_TRANSFER_END, SD_TRANSFER_END); > + rtsx_pci_send_cmd_no_wait(pcr); > + > + if (host->cur_dir == DMA_DIR_FROM_CARD) > + err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 1, 10000); > + else > + err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 0, 10000); > + > + if (err < 0) { > + sd_clear_error(host); > + return err; > + } > + > + return 0; > +} > + > +static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq) > +{ > + struct rtsx_pcr *pcr = host->pcr; > + struct mmc_command *cmd; > + > + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); > + > + cmd->opcode = MMC_STOP_TRANSMISSION; > + cmd->arg = 0; > + cmd->busy_timeout = 0; > + if (host->cur_dir == DMA_DIR_FROM_CARD) > + cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; > + else > + cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC; > + sd_send_cmd_get_rsp(host, cmd); > + udelay(50); > + rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH); > + kfree(cmd); > + return 0; > +} > + > static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host) > { > rtsx_pci_write_register(host->pcr, SD_CFG1, > @@ -796,6 +900,45 @@ static inline int sd_rw_cmd(struct mmc_command *cmd) > (cmd->opcode == MMC_WRITE_BLOCK); > } > > +static void sd_rw_idle_work(struct work_struct *work) > +{ > + struct delayed_work *dwork = to_delayed_work(work); > + struct realtek_pci_sdmmc *host = container_of(dwork, > + struct realtek_pci_sdmmc, rw_idle_work); > + struct mmc_command *cmd; > + > + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); > + > + cmd->opcode = MMC_STOP_TRANSMISSION; > + cmd->arg = 0; > + cmd->busy_timeout = 0; > + if (host->cur_dir == DMA_DIR_FROM_CARD) > + cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; > + else > + cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC; > + > + sd_send_cmd_get_rsp(host, cmd); > + host->rw_mode = NORMAL_RW; > + kfree(cmd); > +} > + > +static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq) > +{ > + struct mmc_command *cmd = mrq->cmd; > + struct mmc_data *data = mrq->data; > + > + if (!mmc_op_multi(cmd->opcode)) > + return 0; > + > + if (host->prev_dir != host->cur_dir) > + return 0; > + > + if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr) > + return 0; > + > + return 1; > +} > + > static void 
sd_request(struct work_struct *work) > { > struct realtek_pci_sdmmc *host = container_of(work, > @@ -841,12 +984,36 @@ static void sd_request(struct work_struct *work) > if (!data_size) { > sd_send_cmd_get_rsp(host, cmd); > } else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) { > - cmd->error = sd_rw_multi(host, mrq); > - if (!host->using_cookie) > - sdmmc_post_req(host->mmc, host->mrq, 0); > + /* Check multi-block and seq function*/ > + if (data->flags & MMC_DATA_READ) > + host->cur_dir = DMA_DIR_FROM_CARD; > + else > + host->cur_dir = DMA_DIR_TO_CARD; > + > + if (host->rw_mode == SEQ_RW) { > + cancel_delayed_work(&host->rw_idle_work); > + if (!sd_check_multi_seq(host, mrq)) { > + sd_stop_rw_multi_seq(host, mrq); > + host->rw_mode = NORMAL_RW; > + } > + } > + > + if (host->rw_mode == SEQ_RW) > + cmd->error = sd_rw_multi_seq(host, mrq); > + else { > + if (mmc_op_multi(cmd->opcode)) > + host->rw_mode = SEQ_RW; > + cmd->error = sd_rw_multi(host, mrq); > + if (!host->using_cookie) > + sdmmc_post_req(host->mmc, host->mrq, 0); > + } > + > + if (cmd->error) > + host->rw_mode = NORMAL_RW; > + > + if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW) > + mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150)); > > - if (mmc_op_multi(cmd->opcode) && mrq->stop) > - sd_send_cmd_get_rsp(host, mrq->stop); > } else { > sd_normal_rw(host, mrq); > } > @@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work) > } > > mutex_lock(&host->host_mutex); > + if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) { > + host->prev_dir = host->cur_dir; > + host->prev_sec_addr = data->blk_addr; > + host->prev_sec_cnt = data->blocks; > + } > host->mrq = NULL; > mutex_unlock(&host->host_mutex); > > @@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct platform_device *pdev) > struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev); > > host->cookie = -1; > + host->rw_mode = NORMAL_RW; > mmc_detect_change(host->mmc, 0); > } > > @@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev) > host->cookie = -1; > host->power_state = SDMMC_POWER_OFF; > INIT_WORK(&host->work, sd_request); > + INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work); > platform_set_drvdata(pdev, host); > pcr->slots[RTSX_SD_CARD].p_dev = pdev; > pcr->slots[RTSX_SD_CARD].card_event = rtsx_pci_sdmmc_card_event; > @@ -1526,6 +1700,7 @@ static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev) > pm_runtime_disable(&pdev->dev); > } > > + cancel_delayed_work_sync(&host->rw_idle_work); > cancel_work_sync(&host->work); > > mutex_lock(&host->host_mutex); > -- > 2.25.1
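For readers who want the gist of the change without walking the whole diff above, the new handling in sd_request() reduces to roughly the following condensed sketch of the hunks quoted above (error handling, request post-processing and locking omitted):

```c
/* Condensed sketch only -- see the full diff above for the real code. */
if (host->rw_mode == SEQ_RW) {
	/* A sequential transfer is still open; keep it open only if this
	 * request continues it (same direction, contiguous block address). */
	cancel_delayed_work(&host->rw_idle_work);
	if (!sd_check_multi_seq(host, mrq)) {
		sd_stop_rw_multi_seq(host, mrq);	/* send CMD12, flush ring buffer */
		host->rw_mode = NORMAL_RW;
	}
}

if (host->rw_mode == SEQ_RW) {
	/* Continue the open transfer: no CMD12, and CMD13 is answered
	 * locally in sd_send_cmd_get_rsp(). */
	cmd->error = sd_rw_multi_seq(host, mrq);
} else {
	if (mmc_op_multi(cmd->opcode))
		host->rw_mode = SEQ_RW;		/* start an open-ended transfer */
	cmd->error = sd_rw_multi(host, mrq);
}

if (cmd->error)
	host->rw_mode = NORMAL_RW;

/* If no further request arrives, close the transfer from delayed work. */
if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
	mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150));
```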
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 21, 2021 8:51 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
>
> On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > [...]
>
> A much nicer commit message, thanks a lot! Would you mind running some
> additional tests, like random I/O read/writes?
>
> Also, please specify the benchmark tool and command you are using. In the
> meantime, I will continue to look at the code.
>

The tool is just the built-in GUI benchmark in Ubuntu's "Disks" utility,
and it does not offer a random I/O option...

Do you have any suggestion for testing random I/O? We think random I/O
will not change much.

BR,
Ricky

> Kind regards
> Uffe
>
> [...]
On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
[...]
> Do you have any suggestion for testing random I/O? We think random I/O
> will not change much.

I would probably look into using fio, https://fio.readthedocs.io/en/latest/

Another option that I use frequently is iozone, https://www.iozone.org.
Here's a command line that I often use for iozone:

./iozone -az -i0 -i1 -s 20m -y 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e

[...]

Kind regards
Uffe
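For readers who have not used iozone, the switches in that command line break down as follows (annotation added here based on the iozone documentation; not part of the original mail):

```sh
# Uffe's iozone command, with the flags spelled out:
#   -a        automatic mode (sweeps record sizes)
#   -z        with -a, also test the small record sizes
#   -i0 -i1   run test 0 (write/rewrite) and test 1 (read/re-read)
#   -s 20m    use a 20 MB test file
#   -y 16k    minimum record (block) size for the sweep
#   -q 4m     maximum record size for the sweep
#   -I        use O_DIRECT so the page cache is bypassed
#   -f <file> path of the temporary test file (here on the mounted card)
#   -e        include flush (fsync/fflush) time in the reported figures
./iozone -az -i0 -i1 -s 20m -y 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e
```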
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Thursday, December 23, 2021 6:37 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
>
> On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> [...]
> > > > Do you have any suggestion for testing random I/O But we think random > > I/O will not change much > > I would probably look into using fio, https://fio.readthedocs.io/en/latest/ > Filled random I/O data Before the patch: CMD (Randread): sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1 fio-3.16 Starting 1 thread Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s] mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021 read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec) clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16 lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16 clat percentiles (usec): | 1.00th=[11338], 5.00th=[11469], 10.00th=[11600], 20.00th=[11600], | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731], 60.00th=[11731], | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863], 95.00th=[11863], | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664], 99.95th=[34341], | 99.99th=[34341] bw ( KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67, stdev=1467.81, samples=24 iops : min= 80, max= 86, avg=85.00, stdev= 1.41, samples=24 lat (msec) : 20=99.90%, 50=0.10% cpu : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=1 Run status group 0 (all jobs): READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec Disk stats (read/write): mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612, util=99.23% CMD (Randwrite): sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1 fio-3.16 Starting 1 thread Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s] mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021 write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone resets clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22 lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04 clat percentiles (usec): | 1.00th=[20579], 5.00th=[22414], 10.00th=[22676], 20.00th=[22938], | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462], 60.00th=[23725], | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773], 95.00th=[56361], | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508], 99.95th=[94897], | 99.99th=[94897] bw ( KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13, stdev=3860.74, samples=53 iops : min= 24, max= 42, avg=38.30, stdev= 3.77, samples=53 lat (msec) : 20=0.98%, 50=92.38%, 100=6.64% cpu : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=1 
Run status group 0 (all jobs): WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec Disk stats (read/write): mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956, util=99.90% After the patch: CMD (Randread): sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1 fio-3.16 Starting 1 thread Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s] mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021 read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec) clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86 lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87 clat percentiles (usec): | 1.00th=[11076], 5.00th=[11338], 10.00th=[11469], 20.00th=[11469], | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469], 60.00th=[11600], | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600], 95.00th=[11600], | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627], 99.95th=[32375], | 99.99th=[32375] bw ( KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26, stdev=1410.46, samples=23 iops : min= 82, max= 88, avg=86.52, stdev= 1.38, samples=23 lat (msec) : 20=99.80%, 50=0.20% cpu : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=1 Run status group 0 (all jobs): READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec Disk stats (read/write): mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397, util=99.21% CMD (Randwrite): sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1 fio-3.16 Starting 1 thread Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s] mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021 write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone resets clat (msec): min=16, max=118, avg=25.37, stdev=16.34 lat (msec): min=16, max=118, avg=25.44, stdev=16.34 clat percentiles (msec): | 1.00th=[ 17], 5.00th=[ 20], 10.00th=[ 20], 20.00th=[ 20], | 30.00th=[ 20], 40.00th=[ 20], 50.00th=[ 20], 60.00th=[ 20], | 70.00th=[ 21], 80.00th=[ 21], 90.00th=[ 52], 95.00th=[ 75], | 99.00th=[ 78], 99.50th=[ 104], 99.90th=[ 114], 99.95th=[ 120], | 99.99th=[ 120] bw ( KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69, stdev=10498.00, samples=52 iops : min= 20, max= 50, avg=39.27, stdev=10.25, samples=52 lat (msec) : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68% cpu : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0 IO depths : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=1 Run 
status group 0 (all jobs): WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec Disk stats (read/write): mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144, util=99.89%

BR,
Ricky

> [...]
On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
[...]
> Filled random I/O data
> Before the patch:
> CMD (Randread):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

Thanks for running the tests! Overall, I would not expect an impact on the
throughput when using a big blocksize like 1M. This is also pretty clear from
the result you have provided.

However, especially for random writes and reads, we want to try with smaller
blocksizes. Like 8k or 16k, would you mind running another round of tests to
see how that works out?

I haven't yet been able to provide you with comments on the code, but I am
looking into it.
Kind regards
Uffe

> [...]
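A minimal way to script the requested follow-up runs is to reuse the exact fio options from the earlier mails and just sweep the smaller block sizes; the loop below is only a sketch under that assumption (note that the randwrite runs write directly to /dev/mmcblk0 and destroy the card's contents):

```sh
#!/bin/sh
# Sketch: rerun the fio tests from this thread at 8k and 16k block sizes.
# WARNING: writes raw data to /dev/mmcblk0 and wipes whatever is on the card.
for bs in 8k 16k; do
	for rw in randread randwrite; do
		sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread \
			-group_reporting -ioengine=psync -iodepth=1 -size=100M \
			-name="mytest-$bs-$rw" -bs="$bs" -rw="$rw"
	done
done
```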
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 28, 2021 10:05 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
>
> On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
> [...]
>
> Thanks for running the tests! Overall, I would not expect an impact on the
> throughput when using a big blocksize like 1M. This is also pretty clear from
> the result you have provided.
>
> However, especially for random writes and reads, we want to try with smaller
> blocksizes.
Like 8k or 16k, would you mind running another round of tests to > see how that works out? > Filled random I/O data(8k/16k) Before(randread) 8k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk stats (read/write): mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89% 16k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk stats (read/write): mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84% Before(randrwrite) 8k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats (read/write): mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90% 16k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats (read/write): mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81% After(randread) 8k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk stats (read/write): mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94% 16k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk stats (read/write): mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87% After(randwrite) 8k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 
8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats (read/write): mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92% 16k: Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 result: Run status group 0 (all jobs): WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats (read/write): mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80%

> I haven't yet been able to provide you with comments on the code, but I am
> looking into it.
>
> Kind regards
> Uffe
>
> [...]
[...] > > > > > > > > > > Do you have any suggestion for testing random I/O But we think > > > > > random I/O will not change much > > > > > > > > I would probably look into using fio, > > > > https://fio.readthedocs.io/en/latest/ > > > > > > > > > > Filled random I/O data > > > Before the patch: > > > CMD (Randread): > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > -bs=1M -rw=randread > > > > Thanks for running the tests! Overall, I would not expect an impact on the > > throughput when using a big blocksize like 1M. This is also pretty clear from > > the result you have provided. > > > > However, especially for random writes and reads, we want to try with smaller > > blocksizes. Like 8k or 16k, would you mind running another round of tests to > > see how that works out? > > > > Filled random I/O data(8k/16k) Hi Ricky, Apologize for the delay! Thanks for running the tests. Let me comment on them below. > > Before(randread) > 8k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec > Disk stats (read/write): > mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89% > > 16k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec > Disk stats (read/write): > mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84% > > Before(randrwrite) > 8k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec > Disk stats (read/write): > mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90% > > 16k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec > Disk stats (read/write): > mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81% > > > After(randread) > 8k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 > result: > Run status group 0 
(all jobs): > READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec > Disk stats (read/write): > mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94% > > 16k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec > Disk stats (read/write): > mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87% > > After(randwrite) > 8k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec > Disk stats (read/write): > mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92% > > 16k: > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > result: > Run status group 0 (all jobs): > WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec > Disk stats (read/write): > mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80% It looks like the rand-read tests above are degrading with the new changes, while rand-writes are both improving and degrading. To summarize my view from all the tests you have done at this point (thanks a lot); it looks like the block I/O merging isn't really happening at common blocklayer, at least to that extent that would benefit us. Clearly you have shown that by the suggested change in the mmc host driver, by detecting whether the "next" request is sequential to the previous one, which allows us to skip a CMD12 and minimize some command overhead. However, according to the latest tests above, you have also proved that the changes in the mmc host driver doesn't come without a cost. In particular, small random-reads would degrade in performance from these changes. That said, it looks to me that rather than trying to improve things for one specific mmc host driver, it would be better to look at this from the generic block layer point of view - and investigate why sequential reads/writes aren't getting merged often enough for the MMC/SD case. If we can fix the problem there, all mmc host drivers would benefit I assume. BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I suggest you do as it's a good fit for MMC/SD. [...] Kind regards Uffe
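
To make the merging question above concrete, one minimal way to watch whether the block layer is merging requests for this reader is sketched below (assuming sysstat is installed and the card enumerates as mmcblk0; note that the fio "Disk stats" lines above already report merge=0/0):

  # rrqm/s and wrqm/s are the read/write requests merged per second
  iostat -x mmcblk0 1

  # raw counters: after the device name, field 2 is reads merged and
  # field 6 is writes merged (see Documentation/admin-guide/iostats.rst)
  grep mmcblk0 /proc/diskstats

Running either of these alongside one of the fio jobs above shows directly whether sequential 8k/16k requests are being merged before they reach the host driver.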
> -----Original Message----- > From: Ulf Hansson <ulf.hansson@linaro.org> > Sent: Monday, February 7, 2022 7:11 PM > To: Ricky WU <ricky_wu@realtek.com> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org; > linux-kernel@vger.kernel.org > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw > > [...] > > > > > > > > > > > > > Do you have any suggestion for testing random I/O But we think > > > > > > random I/O will not change much > > > > > > > > > > I would probably look into using fio, > > > > > https://fio.readthedocs.io/en/latest/ > > > > > > > > > > > > > Filled random I/O data > > > > Before the patch: > > > > CMD (Randread): > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > > -bs=1M -rw=randread > > > > > > Thanks for running the tests! Overall, I would not expect an impact > > > on the throughput when using a big blocksize like 1M. This is also > > > pretty clear from the result you have provided. > > > > > > However, especially for random writes and reads, we want to try with > > > smaller blocksizes. Like 8k or 16k, would you mind running another > > > round of tests to see how that works out? > > > > > > > Filled random I/O data(8k/16k) > > Hi Ricky, > > Apologize for the delay! Thanks for running the tests. Let me comment on > them below. > > > > > Before(randread) > > 8k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > -bs=8k -rw=randread > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > 8192B-8192B, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk > stats (read/write): > > mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, > > util=99.89% > > > > 16k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > -bs=16k -rw=randread > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk > stats (read/write): > > mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, > > util=99.84% > > > > Before(randrwrite) > > 8k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > -bs=8k -rw=randwrite > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > 8192B-8192B, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats > (read/write): > > mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, > > util=99.90% > > > > 16k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > -bs=16k -rw=randwrite > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > WRITE: bw=7201KiB/s 

(7373kB/s), 7201KiB/s-7201KiB/s > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats > (read/write): > > mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, > > util=99.81% > > > > > > After(randread) > > 8k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > -bs=8k -rw=randread > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > 8192B-8192B, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk > stats (read/write): > > mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, > > util=99.94% > > > > 16k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > -bs=16k -rw=randread > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk > stats (read/write): > > mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, > > util=99.87% > > > > After(randwrite) > > 8k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > -bs=8k -rw=randwrite > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > 8192B-8192B, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats > (read/write): > > mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, > > util=99.92% > > > > 16k: > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > -bs=16k -rw=randwrite > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > result: > > Run status group 0 (all jobs): > > WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats > (read/write): > > mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, > > util=99.80% > > It looks like the rand-read tests above are degrading with the new changes, > while rand-writes are both improving and degrading. > > To summarize my view from all the tests you have done at this point (thanks a > lot); it looks like the block I/O merging isn't really happening at common > blocklayer, at least to that extent that would benefit us. Clearly you have shown > that by the suggested change in the mmc host driver, by detecting whether the > "next" request is sequential to the previous one, which allows us to skip a > CMD12 and minimize some command overhead. > > However, according to the latest tests above, you have also proved that the > changes in the mmc host driver doesn't come without a cost. > In particular, small random-reads would degrade in performance from these > changes. 
>
> That said, it looks to me that rather than trying to improve things for one
> specific mmc host driver, it would be better to look at this from the generic
> block layer point of view - and investigate why sequential reads/writes aren't
> getting merged often enough for the MMC/SD case. If we can fix the problem
> there, all mmc host drivers would benefit I assume.
>

So you are thinking about patching this at the MMC/SD core level?
I am not sure whether this method is compatible with other MMC hosts, or whether they would need to patch additional code in their own host drivers.

> BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I
> suggest you do as it's a good fit for MMC/SD.
>

I am not sure what you mean by different I/O schedulers.

> [...]
>
> Kind regards
> Uffe
> ------Please consider the environment before printing this e-mail.
On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote: > > > > > -----Original Message----- > > From: Ulf Hansson <ulf.hansson@linaro.org> > > Sent: Monday, February 7, 2022 7:11 PM > > To: Ricky WU <ricky_wu@realtek.com> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org; > > linux-kernel@vger.kernel.org > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw > > > > [...] > > > > > > > > > > > > > > > > Do you have any suggestion for testing random I/O But we think > > > > > > > random I/O will not change much > > > > > > > > > > > > I would probably look into using fio, > > > > > > https://fio.readthedocs.io/en/latest/ > > > > > > > > > > > > > > > > Filled random I/O data > > > > > Before the patch: > > > > > CMD (Randread): > > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > > > -bs=1M -rw=randread > > > > > > > > Thanks for running the tests! Overall, I would not expect an impact > > > > on the throughput when using a big blocksize like 1M. This is also > > > > pretty clear from the result you have provided. > > > > > > > > However, especially for random writes and reads, we want to try with > > > > smaller blocksizes. Like 8k or 16k, would you mind running another > > > > round of tests to see how that works out? > > > > > > > > > > Filled random I/O data(8k/16k) > > > > Hi Ricky, > > > > Apologize for the delay! Thanks for running the tests. Let me comment on > > them below. > > > > > > > > Before(randread) > > > 8k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > -bs=8k -rw=randread > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > > 8192B-8192B, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s > > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk > > stats (read/write): > > > mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, > > > util=99.89% > > > > > > 16k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > -bs=16k -rw=randread > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s > > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk > > stats (read/write): > > > mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, > > > util=99.84% > > > > > > Before(randrwrite) > > > 8k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > > -bs=8k -rw=randwrite > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > > 8192B-8192B, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s > > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats > > (read/write): > > > mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, > > > util=99.90% > > > > > > 16k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 
-size=100M -name=mytest > > > -bs=16k -rw=randwrite > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s > > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats > > (read/write): > > > mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, > > > util=99.81% > > > > > > > > > After(randread) > > > 8k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > -bs=8k -rw=randread > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > > 8192B-8192B, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s > > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk > > stats (read/write): > > > mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, > > > util=99.94% > > > > > > 16k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest > > > -bs=16k -rw=randread > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s > > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk > > stats (read/write): > > > mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, > > > util=99.87% > > > > > > After(randwrite) > > > 8k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > > -bs=8k -rw=randwrite > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) > > > 8192B-8192B, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s > > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats > > (read/write): > > > mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, > > > util=99.92% > > > > > > 16k: > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest > > > -bs=16k -rw=randwrite > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1 > > > result: > > > Run status group 0 (all jobs): > > > WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s > > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats > > (read/write): > > > mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, > > > util=99.80% > > > > It looks like the rand-read tests above are degrading with the new changes, > > while rand-writes are both improving and degrading. > > > > To summarize my view from all the tests you have done at this point (thanks a > > lot); it looks like the block I/O merging isn't really happening at common > > blocklayer, at least to that extent that would benefit us. 
Clearly you have shown > > that by the suggested change in the mmc host driver, by detecting whether the > > "next" request is sequential to the previous one, which allows us to skip a > > CMD12 and minimize some command overhead. > > > > However, according to the latest tests above, you have also proved that the > > changes in the mmc host driver doesn't come without a cost. > > In particular, small random-reads would degrade in performance from these > > changes. > > > > That said, it looks to me that rather than trying to improve things for one > > specific mmc host driver, it would be better to look at this from the generic > > block layer point of view - and investigate why sequential reads/writes aren't > > getting merged often enough for the MMC/SD case. If we can fix the problem > > there, all mmc host drivers would benefit I assume. > > > > So you are thinking about how to patch this in MMC/SD? > I don't know if this method is compatible with other MMC Hosts? Or they need to patch other code on their host driver I would not limit this to the core layer of MMC/SD. The point I was trying to make was that it doesn't look like the generic block layer is merging the sequential I/O requests in the most efficient way, at least for the eMMC/SD devices. Why this is the case, I can't tell. It looks like we need to do some more in-depth analysis to understand why merging isn't efficient for us. > > > BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I > > suggest you do as it's a good fit for MMC/SD. > > > > I don’t know what is different I/O schedulers means? What I/O scheduler did you use when running the test? For MMC/SD the only one that makes sense to use is BFQ, however that needs to be configured via sysfs after boot. There is no way, currently, to make it the default, I think. You may look at Documentation/block/bfq-iosched.rst, if you are more interested. Kind regards Uffe
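
For reference, switching the queue to BFQ as suggested above is a small sysfs operation. A minimal sketch, assuming the card enumerates as mmcblk0 and that BFQ is built in or available as the bfq module:

  # the scheduler currently in use is shown in brackets,
  # e.g. "[mq-deadline] kyber bfq none"
  cat /sys/block/mmcblk0/queue/scheduler

  # load the module if needed, then select BFQ for this queue
  sudo modprobe bfq
  echo bfq | sudo tee /sys/block/mmcblk0/queue/scheduler

The setting is per-queue and does not persist across reboots, which matches the remark above that BFQ currently has to be configured after boot.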
Hi Ulf Hansson,

Can I know the status of this patch, or whether there are any remaining concerns with it?

Ricky

> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Thursday, February 10, 2022 10:57 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
>
> [...]
>
> Kind regards
> Uffe
> ------Please consider the environment before printing this e-mail.
On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
>
> Hi Ulf Hansson,
>
> Can I know the status of this patch, or whether there are any remaining concerns with it?

Didn't you read my earlier replies?

Kind regards
Uffe

>
> Ricky
> > [...]
> On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > Hi Ulf Hansson,
> >
> > Can I know the status of this patch, or whether there are any remaining concerns with it?
>
> Didn't you read my earlier replies?
>

Are you referring to testing the speed with BFQ?
We tested the read/write speed and it is better than before, and our customer, who uses our reader in their product, also tested the read/write speed; they want us to push this patch upstream.

> Kind regards
> Uffe
>
> [...]
On Fri, 13 Oct 2023 at 04:27, Ricky WU <ricky_wu@realtek.com> wrote: > > > On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote: > > > > > > Hi Ulf Hansson, > > > > > > Can I know what is this patch status or has some concern on this patch? > > > > Didn't you read my earlier replies? > > > > Are you talking about BFQ for testing speed? > Because we tested the Read/Write speed are better than before and our customer that uses our reader on their product also tested the Read/Write speed, they want us to push this patch on It's certainly a very positive thing that your target is to upstream solutions that improve performance. We all appreciate this! In this regard, I believe I have tried to guide you on how to move forward with this. This particular optimization doesn't belong in an mmc host driver, but rather at the common upper block device driver layer, such that it can benefit more than one particular mmc host driver. I fully understand that making that kind of improvement is way more difficult and requires in-depth analysis to understand what is happening on those layers too. On the other hand it could be something that may benefit a lot of devices/platforms. Unfortunately, I am currently not in a position where I have the bandwidth to dive deeper into this. If you decide to pursue your investigations, I think we need to involve the experts from the common block community (linux-block mailing list) to get their advice. So to be clear, I am not going to apply $subject patch - or anything similar to an mmc host driver. [...] Kind regards Uffe
>
> > > On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > Hi Ulf Hansson,
> > > >
> > > > Can I know what is this patch status or has some concern on this patch?
> > >
> > > Didn't you read my earlier replies?
> > >
> >
> > Are you talking about BFQ for testing speed?
> > Because we tested the Read/Write speed are better than before and our
> > customer that uses our reader on their product also tested the Read/Write
> > speed, they want us to push this patch on
>
> It's certainly a very positive thing that your target is to upstream
> solutions that improve performance. We all appreciate this!
>
> In this regard, I believe I have tried to guide you on how to move
> forward with this. This particular optimization doesn't belong in an
> mmc host driver, but rather at the common upper block device driver
> layer, such that it can benefit more than one particular mmc host
> driver.
>
> I fully understand that making that kind of improvement is way more
> difficult and requires in-depth analysis to understand what is
> happening on those layers too. On the other hand it could be something
> that may benefit a lot of devices/platforms. Unfortunately, I am
> currently not in a position where I have the bandwidth to dive deeper
> into this.
>
> If you decide to pursue your investigations, I think we need to
> involve the experts from the common block community (linux-block
> mailing list) to get their advice.
>
> So to be clear, I am not going to apply $subject patch - or anything
> similar to an mmc host driver.
>

This performance improvement was developed around our HW design. We
discussed it internally: the CMD 12 response timing depends on the HW
design, so this solution may not fit all devices. The core of the
mechanism is that, once we detect sequential data, we program our DMA
registers for the read/write transfer directly, and that operation is
designed differently on different devices, so it is not easy to push
the same approach into the mmc core.

> Kind regards
> Uffe
diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c
index 58cfaffa3c2d..ee2b0eec6422 100644
--- a/drivers/mmc/host/rtsx_pci_sdmmc.c
+++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
@@ -22,6 +22,8 @@
 #include <asm/unaligned.h>
 #include <linux/pm_runtime.h>
 
+enum RW_MODE {NORMAL_RW, SEQ_RW};
+
 struct realtek_pci_sdmmc {
 	struct platform_device	*pdev;
 	struct rtsx_pcr		*pcr;
@@ -31,6 +33,7 @@ struct realtek_pci_sdmmc {
 
 	struct work_struct	work;
 	struct mutex		host_mutex;
+	struct delayed_work	rw_idle_work;
 
 	u8			ssc_depth;
 	unsigned int		clock;
@@ -46,6 +49,12 @@ struct realtek_pci_sdmmc {
 	s32			cookie;
 	int			cookie_sg_count;
 	bool			using_cookie;
+
+	enum RW_MODE		rw_mode;
+	u8			prev_dir;
+	u8			cur_dir;
+	u64			prev_sec_addr;
+	u32			prev_sec_cnt;
 };
 
 static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios);
@@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host,
 	dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg = 0x%08x\n",
 			__func__, cmd_idx, arg);
 
+	if (cmd_idx == MMC_SEND_STATUS && host->rw_mode == SEQ_RW) {
+		cmd->resp[0] = R1_READY_FOR_DATA | (R1_STATE_TRAN << 9);
+		goto out;
+	}
+
+	if (!mmc_op_multi(cmd->opcode))
+		host->rw_mode = NORMAL_RW;
+
 	rsp_type = sd_response_type(cmd);
 	if (rsp_type < 0)
 		goto out;
@@ -542,6 +559,93 @@ static int sd_write_long_data(struct realtek_pci_sdmmc *host,
 	return 0;
 }
 
+static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_host *mmc = host->mmc;
+	struct mmc_card *card = mmc->card;
+	struct mmc_data *data = mrq->data;
+	int uhs = mmc_card_uhs(card);
+	u8 cfg2;
+	int err;
+	size_t data_len = data->blksz * data->blocks;
+
+	cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
+		SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 | SD_RSP_LEN_0;
+
+	if (!uhs)
+		cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
+
+	rtsx_pci_init_cmd(pcr);
+	sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
+			DMA_DONE_INT, DMA_DONE_INT);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
+			0xFF, (u8)(data_len >> 24));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
+			0xFF, (u8)(data_len >> 16));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
+			0xFF, (u8)(data_len >> 8));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF, (u8)data_len);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+				0x03 | DMA_PACK_SIZE_MASK,
+				DMA_DIR_FROM_CARD | DMA_EN | DMA_512);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+				0x03 | DMA_PACK_SIZE_MASK,
+				DMA_DIR_TO_CARD | DMA_EN | DMA_512);
+
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
+			0x01, RING_BUFFER);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_READ_3);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_WRITE_3);
+
+	rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
+			SD_TRANSFER_END, SD_TRANSFER_END);
+	rtsx_pci_send_cmd_no_wait(pcr);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 1, 10000);
+	else
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 0, 10000);
+
+	if (err < 0) {
+		sd_clear_error(host);
+		return err;
+	}
+
+	return 0;
+}
+
+static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+	sd_send_cmd_get_rsp(host, cmd);
+	udelay(50);
+	rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
+	kfree(cmd);
+	return 0;
+}
+
 static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host)
 {
 	rtsx_pci_write_register(host->pcr, SD_CFG1,
@@ -796,6 +900,45 @@ static inline int sd_rw_cmd(struct mmc_command *cmd)
 		(cmd->opcode == MMC_WRITE_BLOCK);
 }
 
+static void sd_rw_idle_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct realtek_pci_sdmmc *host = container_of(dwork,
+			struct realtek_pci_sdmmc, rw_idle_work);
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+
+	sd_send_cmd_get_rsp(host, cmd);
+	host->rw_mode = NORMAL_RW;
+	kfree(cmd);
+}
+
+static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct mmc_command *cmd = mrq->cmd;
+	struct mmc_data *data = mrq->data;
+
+	if (!mmc_op_multi(cmd->opcode))
+		return 0;
+
+	if (host->prev_dir != host->cur_dir)
+		return 0;
+
+	if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
+		return 0;
+
+	return 1;
+}
+
 static void sd_request(struct work_struct *work)
 {
 	struct realtek_pci_sdmmc *host = container_of(work,
@@ -841,12 +984,36 @@ static void sd_request(struct work_struct *work)
 	if (!data_size) {
 		sd_send_cmd_get_rsp(host, cmd);
 	} else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
-		cmd->error = sd_rw_multi(host, mrq);
-		if (!host->using_cookie)
-			sdmmc_post_req(host->mmc, host->mrq, 0);
+		/* Check multi-block and seq function*/
+		if (data->flags & MMC_DATA_READ)
+			host->cur_dir = DMA_DIR_FROM_CARD;
+		else
+			host->cur_dir = DMA_DIR_TO_CARD;
+
+		if (host->rw_mode == SEQ_RW) {
+			cancel_delayed_work(&host->rw_idle_work);
+			if (!sd_check_multi_seq(host, mrq)) {
+				sd_stop_rw_multi_seq(host, mrq);
+				host->rw_mode = NORMAL_RW;
+			}
+		}
+
+		if (host->rw_mode == SEQ_RW)
+			cmd->error = sd_rw_multi_seq(host, mrq);
+		else {
+			if (mmc_op_multi(cmd->opcode))
+				host->rw_mode = SEQ_RW;
+			cmd->error = sd_rw_multi(host, mrq);
+			if (!host->using_cookie)
+				sdmmc_post_req(host->mmc, host->mrq, 0);
+		}
+
+		if (cmd->error)
+			host->rw_mode = NORMAL_RW;
+
+		if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
+			mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150));
 
-		if (mmc_op_multi(cmd->opcode) && mrq->stop)
-			sd_send_cmd_get_rsp(host, mrq->stop);
 	} else {
 		sd_normal_rw(host, mrq);
 	}
@@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work)
 	}
 
 	mutex_lock(&host->host_mutex);
+	if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
+		host->prev_dir = host->cur_dir;
+		host->prev_sec_addr = data->blk_addr;
+		host->prev_sec_cnt = data->blocks;
+	}
 	host->mrq = NULL;
 	mutex_unlock(&host->host_mutex);
 
@@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct platform_device *pdev)
 	struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
 
 	host->cookie = -1;
+	host->rw_mode = NORMAL_RW;
 	mmc_detect_change(host->mmc, 0);
 }
 
@@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev)
 	host->cookie = -1;
 	host->power_state = SDMMC_POWER_OFF;
 	INIT_WORK(&host->work, sd_request);
+	INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
 	platform_set_drvdata(pdev, host);
 	pcr->slots[RTSX_SD_CARD].p_dev = pdev;
 	pcr->slots[RTSX_SD_CARD].card_event = rtsx_pci_sdmmc_card_event;
@@ -1526,6 +1700,7 @@ static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
 		pm_runtime_disable(&pdev->dev);
 	}
 
+	cancel_delayed_work_sync(&host->rw_idle_work);
 	cancel_work_sync(&host->work);
 
 	mutex_lock(&host->host_mutex);