diff mbox series

blk-ioprio: Introduce promote-to-rt policy

Message ID 20230201045227.2203123-1-houtao@huaweicloud.com (mailing list archive)
State New, archived
Headers show
Series blk-ioprio: Introduce promote-to-rt policy | expand

Commit Message

Hou Tao Feb. 1, 2023, 4:52 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

Since commit a78418e6a04c ("block: Always initialize bio IO priority on
submit"), bio->bi_ioprio will never be IOPRIO_CLASS_NONE when calling
blkcg_set_ioprio(), so there will be no way to promote the io-priority
of one cgroup to IOPRIO_CLASS_RT, because bi_ioprio will always be
greater than or equals to IOPRIO_CLASS_RT.

It seems possible to call blkcg_set_ioprio() first then try to
initialize bi_ioprio later in bio_set_ioprio(), but this doesn't work
for bio in which bi_ioprio is already initialized (e.g., direct-io), so
introduce a new ioprio policy to promote the iopriority of bio to
IOPRIO_CLASS_RT if the ioprio is not already RT.

To distinguish between the demotion policy and the promotion policy,
use a bit in upper 16-bits of the policy to accomplish that and handle
the bit accordingly in blkcg_set_ioprio().

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 38 ++++++----
 block/blk-ioprio.c                      | 94 +++++++++++++++++--------
 2 files changed, 92 insertions(+), 40 deletions(-)

Comments

Bagas Sanjaya Feb. 1, 2023, 9:07 a.m. UTC | #1
On Wed, Feb 01, 2023 at 12:52:27PM +0800, Hou Tao wrote:
>  The following numerical values are associated with the I/O priority policies:
>  
> -+-------------+---+
> -| no-change   | 0 |
> -+-------------+---+
> -| none-to-rt  | 1 |
> -+-------------+---+
> -| rt-to-be    | 2 |
> -+-------------+---+
> -| all-to-idle | 3 |
> -+-------------+---+
> +
> ++---------------+---------+-----+
> +| policy        | inst    | num |
> ++---------------+---------+-----+
> +| no-change     | demote  | 0   |
> ++---------------+---------+-----+
> +| none-to-rt    | demote  | 1   |
> ++---------------+---------+-----+
> +| rt-to-be      | demote  | 2   |
> ++---------------+---------+-----+
> +| idle          | demote  | 3   |
> ++---------------+---------+-----+
> +| promote-to-rt | promote | 1   |
> ++---------------+---------+-----+
>  

The first row should have been header row:

---- >8 ----
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index e0b9f73ef62a9e..55f9b579716564 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2048,7 +2048,7 @@ The following numerical values are associated with the I/O priority policies:
 
 +---------------+---------+-----+
 | policy        | inst    | num |
-+---------------+---------+-----+
++===============+=========+=====+
 | no-change     | demote  | 0   |
 +---------------+---------+-----+
 | none-to-rt    | demote  | 1   |

> @@ -2064,9 +2074,13 @@ The numerical value that corresponds to each I/O priority class is as follows:
>  
>  The algorithm to set the I/O priority class for a request is as follows:
>  
> -- Translate the I/O priority class policy into a number.
> -- Change the request I/O priority class into the maximum of the I/O priority
> -  class policy number and the numerical I/O priority class.
> +-- Translate the I/O priority class policy into an instruction and a number
> +-- If the instruction is demotion, change the request I/O priority class
> +-  into the maximum of the I/O priority class policy number and the numerical
> +-  I/O priority class.
> +-- If the instruction is promotion, change the request I/O priority class
> +-  into the minimum of the I/O priority class policy number and the numerical
> +-  I/O priority class.
>  

Remove the excessive bullet list marker or the list above become paragraph
instead:

---- >8 ----
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 55f9b579716564..c3f16386c47bdf 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2074,12 +2074,12 @@ The numerical value that corresponds to each I/O priority class is as follows:
 
 The algorithm to set the I/O priority class for a request is as follows:
 
--- Translate the I/O priority class policy into an instruction and a number
--- If the instruction is demotion, change the request I/O priority class
--  into the maximum of the I/O priority class policy number and the numerical
--  I/O priority class.
--- If the instruction is promotion, change the request I/O priority class
--  into the minimum of the I/O priority class policy number and the numerical
+- Translate the I/O priority class policy into an instruction-number pair.
+- If the instruction is demotion, change the request I/O priority class
+  into the maximum of the I/O priority class policy number and the numerical
+  I/O priority class.
+- If the instruction is promotion, change the request I/O priority class
+  into the minimum of the I/O priority class policy number and the numerical
 -  I/O priority class.
 
 PID

Thanks.
Bart Van Assche Feb. 1, 2023, 5:33 p.m. UTC | #2
On 1/31/23 20:52, Hou Tao wrote:
>   /**
>    * enum prio_policy - I/O priority class policy.
>    * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
> @@ -27,21 +34,30 @@
>    * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
>    *		IOPRIO_CLASS_BE.
>    * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
> - *
> + * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
> + * 		IOPRIO_CLASS_RT.
>    * See also <linux/ioprio.h>.
>    */
>   enum prio_policy {
> -	POLICY_NO_CHANGE	= 0,
> -	POLICY_NONE_TO_RT	= 1,
> -	POLICY_RESTRICT_TO_BE	= 2,
> -	POLICY_ALL_TO_IDLE	= 3,
> +	POLICY_NO_CHANGE	= IOPRIO_CLASS_NONE,
> +	POLICY_NONE_TO_RT	= IOPRIO_CLASS_RT,
> +	POLICY_RESTRICT_TO_BE	= IOPRIO_CLASS_BE,
> +	POLICY_ALL_TO_IDLE	= IOPRIO_CLASS_IDLE,
> +	POLICY_PROMOTE_TO_RT	= IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
> +};

The above change complicates the ioprio code. Additionally, I'm 
concerned that it makes the ioprio code slower. Has it been considered 
to keep the numerical values for the existing policies, to assign the 
number 4 to POLICY_PROMOTE_TO_RT and to use a lookup-array in 
blkcg_set_ioprio() to convert the policy number into an IOPRIO_CLASS value?

Thanks,

Bart.
Hou Tao Feb. 2, 2023, 10:50 a.m. UTC | #3
Hi,

On 2/1/2023 5:07 PM, Bagas Sanjaya wrote:
> On Wed, Feb 01, 2023 at 12:52:27PM +0800, Hou Tao wrote:
>>  The following numerical values are associated with the I/O priority policies:
>>  
>> -+-------------+---+
>> -| no-change   | 0 |
>> -+-------------+---+
>> -| none-to-rt  | 1 |
>> -+-------------+---+
>> -| rt-to-be    | 2 |
>> -+-------------+---+
>> -| all-to-idle | 3 |
>> -+-------------+---+
>> +
>> ++---------------+---------+-----+
>> +| policy        | inst    | num |
>> ++---------------+---------+-----+
>> +| no-change     | demote  | 0   |
>> ++---------------+---------+-----+
>> +| none-to-rt    | demote  | 1   |
>> ++---------------+---------+-----+
>> +| rt-to-be      | demote  | 2   |
>> ++---------------+---------+-----+
>> +| idle          | demote  | 3   |
>> ++---------------+---------+-----+
>> +| promote-to-rt | promote | 1   |
>> ++---------------+---------+-----+
>>  
> The first row should have been header row:
>
> ---- >8 ----
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index e0b9f73ef62a9e..55f9b579716564 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2048,7 +2048,7 @@ The following numerical values are associated with the I/O priority policies:
>  
>  +---------------+---------+-----+
>  | policy        | inst    | num |
> -+---------------+---------+-----+
> ++===============+=========+=====+
>  | no-change     | demote  | 0   |
>  +---------------+---------+-----+
>  | none-to-rt    | demote  | 1   |
>
>> @@ -2064,9 +2074,13 @@ The numerical value that corresponds to each I/O priority class is as follows:
>>  
>>  The algorithm to set the I/O priority class for a request is as follows:
>>  
>> -- Translate the I/O priority class policy into a number.
>> -- Change the request I/O priority class into the maximum of the I/O priority
>> -  class policy number and the numerical I/O priority class.
>> +-- Translate the I/O priority class policy into an instruction and a number
>> +-- If the instruction is demotion, change the request I/O priority class
>> +-  into the maximum of the I/O priority class policy number and the numerical
>> +-  I/O priority class.
>> +-- If the instruction is promotion, change the request I/O priority class
>> +-  into the minimum of the I/O priority class policy number and the numerical
>> +-  I/O priority class.
>>  
> Remove the excessive bullet list marker or the list above become paragraph
> instead:
>
> ---- >8 ----
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 55f9b579716564..c3f16386c47bdf 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2074,12 +2074,12 @@ The numerical value that corresponds to each I/O priority class is as follows:
>  
>  The algorithm to set the I/O priority class for a request is as follows:
>  
> --- Translate the I/O priority class policy into an instruction and a number
> --- If the instruction is demotion, change the request I/O priority class
> --  into the maximum of the I/O priority class policy number and the numerical
> --  I/O priority class.
> --- If the instruction is promotion, change the request I/O priority class
> --  into the minimum of the I/O priority class policy number and the numerical
> +- Translate the I/O priority class policy into an instruction-number pair.
> +- If the instruction is demotion, change the request I/O priority class
> +  into the maximum of the I/O priority class policy number and the numerical
> +  I/O priority class.
> +- If the instruction is promotion, change the request I/O priority class
> +  into the minimum of the I/O priority class policy number and the numerical
>  -  I/O priority class.
>  
>  PID
>
> Thanks.
Thanks for your comments. Will fix in v2.
>
Hou Tao Feb. 2, 2023, 11:09 a.m. UTC | #4
Hi,

On 2/2/2023 1:33 AM, Bart Van Assche wrote:
> On 1/31/23 20:52, Hou Tao wrote:
>>   /**
>>    * enum prio_policy - I/O priority class policy.
>>    * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
>> @@ -27,21 +34,30 @@
>>    * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
>>    *        IOPRIO_CLASS_BE.
>>    * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
>> - *
>> + * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
>> + *         IOPRIO_CLASS_RT.
>>    * See also <linux/ioprio.h>.
>>    */
>>   enum prio_policy {
>> -    POLICY_NO_CHANGE    = 0,
>> -    POLICY_NONE_TO_RT    = 1,
>> -    POLICY_RESTRICT_TO_BE    = 2,
>> -    POLICY_ALL_TO_IDLE    = 3,
>> +    POLICY_NO_CHANGE    = IOPRIO_CLASS_NONE,
>> +    POLICY_NONE_TO_RT    = IOPRIO_CLASS_RT,
>> +    POLICY_RESTRICT_TO_BE    = IOPRIO_CLASS_BE,
>> +    POLICY_ALL_TO_IDLE    = IOPRIO_CLASS_IDLE,
>> +    POLICY_PROMOTE_TO_RT    = IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
>> +};
>
> The above change complicates the ioprio code. Additionally, I'm concerned that
> it makes the ioprio code slower. Has it been considered to keep the numerical
> values for the existing policies, to assign the number 4 to
> POLICY_PROMOTE_TO_RT and to use a lookup-array in blkcg_set_ioprio() to
> convert the policy number into an IOPRIO_CLASS value?
For the slowness, do you meaning the extra dereference of blkcg->ioprio->policy
when policy is no-change or the handle of IOPRIO_POL_PROMOTION in
blkcg_set_ioprio()? It seems other functions (e.g., ioprio_show_prio_policy()
and ioprio_set_prio_policy()) are not on the hot path. Using a lookup array in
blkcg_set_ioprio() to do the conversion will also be OK, although it will
introduce an extra lookup each time when policy is not no-change. I don't have
strong preference. If you are OK with lookup array in blkcg_set_ioprio(), will
do it in v2.
>
> Thanks,
>
> Bart.
>
>
> .
Bart Van Assche Feb. 2, 2023, 6:05 p.m. UTC | #5
On 2/2/23 03:09, Hou Tao wrote:
> Hi,
> 
> On 2/2/2023 1:33 AM, Bart Van Assche wrote:
>> On 1/31/23 20:52, Hou Tao wrote:
>>>    /**
>>>     * enum prio_policy - I/O priority class policy.
>>>     * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
>>> @@ -27,21 +34,30 @@
>>>     * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
>>>     *        IOPRIO_CLASS_BE.
>>>     * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
>>> - *
>>> + * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
>>> + *         IOPRIO_CLASS_RT.
>>>     * See also <linux/ioprio.h>.
>>>     */
>>>    enum prio_policy {
>>> -    POLICY_NO_CHANGE    = 0,
>>> -    POLICY_NONE_TO_RT    = 1,
>>> -    POLICY_RESTRICT_TO_BE    = 2,
>>> -    POLICY_ALL_TO_IDLE    = 3,
>>> +    POLICY_NO_CHANGE    = IOPRIO_CLASS_NONE,
>>> +    POLICY_NONE_TO_RT    = IOPRIO_CLASS_RT,
>>> +    POLICY_RESTRICT_TO_BE    = IOPRIO_CLASS_BE,
>>> +    POLICY_ALL_TO_IDLE    = IOPRIO_CLASS_IDLE,
>>> +    POLICY_PROMOTE_TO_RT    = IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
>>> +};
>>
>> The above change complicates the ioprio code. Additionally, I'm concerned that
>> it makes the ioprio code slower. Has it been considered to keep the numerical
>> values for the existing policies, to assign the number 4 to
>> POLICY_PROMOTE_TO_RT and to use a lookup-array in blkcg_set_ioprio() to
>> convert the policy number into an IOPRIO_CLASS value?
> For the slowness, do you meaning the extra dereference of blkcg->ioprio->policy
> when policy is no-change or the handle of IOPRIO_POL_PROMOTION in
> blkcg_set_ioprio()? It seems other functions (e.g., ioprio_show_prio_policy()
> and ioprio_set_prio_policy()) are not on the hot path. Using a lookup array in
> blkcg_set_ioprio() to do the conversion will also be OK, although it will
> introduce an extra lookup each time when policy is not no-change. I don't have
> strong preference. If you are OK with lookup array in blkcg_set_ioprio(), will
> do it in v2.

Hi Hou,

I prefer the lookup array because with the lookup array approach the 
IOPRIO_POL_PROMOTION constant is no longer needed and because I expect 
that this will result in code that is easier to read. Additionally, the 
lookup array will be small so the compiler may be clever enough to 
optimize it away.

Thanks,

Bart.
Hou Tao Feb. 3, 2023, 1:48 a.m. UTC | #6
Hi Bart,

On 2/3/2023 2:05 AM, Bart Van Assche wrote:
> On 2/2/23 03:09, Hou Tao wrote:
>> Hi,
>>
>> On 2/2/2023 1:33 AM, Bart Van Assche wrote:
>>> On 1/31/23 20:52, Hou Tao wrote:
>>>>    /**
>>>>     * enum prio_policy - I/O priority class policy.
>>>>     * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
>>>> @@ -27,21 +34,30 @@
>>>>     * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT
>>>> into
>>>>     *        IOPRIO_CLASS_BE.
>>>>     * @POLICY_ALL_TO_IDLE: change the I/O priority class into
>>>> IOPRIO_CLASS_IDLE.
>>>> - *
>>>> + * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
>>>> + *         IOPRIO_CLASS_RT.
>>>>     * See also <linux/ioprio.h>.
>>>>     */
>>>>    enum prio_policy {
>>>> -    POLICY_NO_CHANGE    = 0,
>>>> -    POLICY_NONE_TO_RT    = 1,
>>>> -    POLICY_RESTRICT_TO_BE    = 2,
>>>> -    POLICY_ALL_TO_IDLE    = 3,
>>>> +    POLICY_NO_CHANGE    = IOPRIO_CLASS_NONE,
>>>> +    POLICY_NONE_TO_RT    = IOPRIO_CLASS_RT,
>>>> +    POLICY_RESTRICT_TO_BE    = IOPRIO_CLASS_BE,
>>>> +    POLICY_ALL_TO_IDLE    = IOPRIO_CLASS_IDLE,
>>>> +    POLICY_PROMOTE_TO_RT    = IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
>>>> +};
>>>
>>> The above change complicates the ioprio code. Additionally, I'm concerned that
>>> it makes the ioprio code slower. Has it been considered to keep the numerical
>>> values for the existing policies, to assign the number 4 to
>>> POLICY_PROMOTE_TO_RT and to use a lookup-array in blkcg_set_ioprio() to
>>> convert the policy number into an IOPRIO_CLASS value?
>> For the slowness, do you meaning the extra dereference of blkcg->ioprio->policy
>> when policy is no-change or the handle of IOPRIO_POL_PROMOTION in
>> blkcg_set_ioprio()? It seems other functions (e.g., ioprio_show_prio_policy()
>> and ioprio_set_prio_policy()) are not on the hot path. Using a lookup array in
>> blkcg_set_ioprio() to do the conversion will also be OK, although it will
>> introduce an extra lookup each time when policy is not no-change. I don't have
>> strong preference. If you are OK with lookup array in blkcg_set_ioprio(), will
>> do it in v2.
>
> Hi Hou,
>
> I prefer the lookup array because with the lookup array approach the
> IOPRIO_POL_PROMOTION constant is no longer needed and because I expect that
> this will result in code that is easier to read. Additionally, the lookup
> array will be small so the compiler may be clever enough to optimize it away.
I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
used to determinate on how to calculate the final ioprio for bio: choosing the
maximum or minimum between blkcg ioprio and original bio bi_ioprio.
>
> Thanks,
>
> Bart.
>
>
> .
Bart Van Assche Feb. 3, 2023, 7:45 p.m. UTC | #7
On 2/2/23 17:48, Hou Tao wrote:
> I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
> ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
> used to determinate on how to calculate the final ioprio for bio: choosing the
> maximum or minimum between blkcg ioprio and original bio bi_ioprio.

Do the block layer code changes shown below implement the functionality that you
need?

Thanks,

Bart.



diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 8bb6b8eba4ce..4a56da95168e 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -27,6 +27,8 @@
   * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
   *		IOPRIO_CLASS_BE.
   * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
+ * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
+ * 		IOPRIO_CLASS_RT.
   *
   * See also <linux/ioprio.h>.
   */
@@ -35,6 +37,7 @@ enum prio_policy {
  	POLICY_NONE_TO_RT	= 1,
  	POLICY_RESTRICT_TO_BE	= 2,
  	POLICY_ALL_TO_IDLE	= 3,
+	POLICY_PROMOTE_TO_RT,
  };

  static const char *policy_name[] = {
@@ -42,6 +45,7 @@ static const char *policy_name[] = {
  	[POLICY_NONE_TO_RT]	= "none-to-rt",
  	[POLICY_RESTRICT_TO_BE]	= "restrict-to-be",
  	[POLICY_ALL_TO_IDLE]	= "idle",
+	[POLICY_PROMOTE_TO_RT]	= "promote-to-rt",
  };

  static struct blkcg_policy ioprio_policy;
@@ -189,17 +193,23 @@ void blkcg_set_ioprio(struct bio *bio)
  	if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
  		return;

-	/*
-	 * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
-	 * correspond to a lower priority. Hence, the max_t() below selects
-	 * the lower priority of bi_ioprio and the cgroup I/O priority class.
-	 * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
-	 * priority is assigned to the bio.
-	 */
-	prio = max_t(u16, bio->bi_ioprio,
-			IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
-	if (prio > bio->bi_ioprio)
-		bio->bi_ioprio = prio;
+	if (blkcg->prio_policy == PROMOTE_TO_RT) {
+		if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT)
+			bio->bi_ioprio = IOPRIO_CLASS_RT;
+	} else {
+		/*
+		 * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
+		 * correspond to a lower priority. Hence, the max_t() below
+		 * selects the lower priority of bi_ioprio and the cgroup I/O
+		 * priority class.  If the bio I/O priority equals
+		 * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the
+		 * bio.
+		 */
+		prio = max_t(u16, bio->bi_ioprio,
+			     IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
+		if (prio > bio->bi_ioprio)
+			bio->bi_ioprio = prio;
+	}
  }

  void blk_ioprio_exit(struct gendisk *disk)
Bart Van Assche Feb. 3, 2023, 7:51 p.m. UTC | #8
On 1/31/23 20:52, Hou Tao wrote:
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index c8ae7c897f14..e0b9f73ef62a 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2038,17 +2038,27 @@ that attribute:
>   	Change the I/O priority class of all requests into IDLE, the lowest
>   	I/O priority class.
>   
> +  promote-to-rt
> +	For requests that have I/O priority class BE or that have I/O priority
> +        class IDLE, change it into RT. Do not modify the I/O priority class
> +        of requests that have priority class RT.

Please document whether or not this policy modifies the I/O priority
(IOPRIO_PRIO_DATA()). Do you agree that the I/O priority should be preserved
when promoting from BE to RT and that only the I/O priority class should be
modified for such promotions?

>   The following numerical values are associated with the I/O priority policies:
>   
> -+-------------+---+
> -| no-change   | 0 |
> -+-------------+---+
> -| none-to-rt  | 1 |
> -+-------------+---+
> -| rt-to-be    | 2 |
> -+-------------+---+
> -| all-to-idle | 3 |
> -+-------------+---+
> +
> ++---------------+---------+-----+
> +| policy        | inst    | num |
> ++---------------+---------+-----+
> +| no-change     | demote  | 0   |
> ++---------------+---------+-----+
> +| none-to-rt    | demote  | 1   |
> ++---------------+---------+-----+
> +| rt-to-be      | demote  | 2   |
> ++---------------+---------+-----+
> +| idle          | demote  | 3   |
> ++---------------+---------+-----+
> +| promote-to-rt | promote | 1   |
> ++---------------+---------+-----+

I prefer that this table is not modified. The numerical values associated with
policies only matters for none-to-rt, rt-to-be and all-to-idle but not for
promote-to-rt. So I don't think that it is necessary to mention a numerical
value for the promote-to-rt policy. Additionally, "none-to-rt" is not a policy
that demotes the I/O priority but a policy that may promote the I/O priority.

> +-- If the instruction is promotion, change the request I/O priority class
> +-  into the minimum of the I/O priority class policy number and the numerical
> +-  I/O priority class.

Using the minimum value seems wrong to me because that will change
IOPRIO_VALUE(IOPRIO_CLASS_RT, 1) into IOPRIO_VALUE(IOPRIO_CLASS_RT, 0).

Thanks,

Bart.
Hou Tao Feb. 5, 2023, 7:04 a.m. UTC | #9
Hi,

On 2/4/2023 3:45 AM, Bart Van Assche wrote:
> On 2/2/23 17:48, Hou Tao wrote:
>> I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
>> ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
>> used to determinate on how to calculate the final ioprio for bio: choosing the
>> maximum or minimum between blkcg ioprio and original bio bi_ioprio.
>
> Do the block layer code changes shown below implement the functionality that you
> need?
Yes, something like that. The reason for introducing IOPRIO_POL_PROMOTION is to
support other promotion policy (e.g., promote-to-be), but now I think the
possibility of adding other promotion policies is low, so the code below is fine
to me.
>
> Thanks,
>
> Bart.
>
>
>
> diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
> index 8bb6b8eba4ce..4a56da95168e 100644
> --- a/block/blk-ioprio.c
> +++ b/block/blk-ioprio.c
> @@ -27,6 +27,8 @@
>   * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
>   *        IOPRIO_CLASS_BE.
>   * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
> + * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
> + *         IOPRIO_CLASS_RT.
>   *
>   * See also <linux/ioprio.h>.
>   */
> @@ -35,6 +37,7 @@ enum prio_policy {
>      POLICY_NONE_TO_RT    = 1,
>      POLICY_RESTRICT_TO_BE    = 2,
>      POLICY_ALL_TO_IDLE    = 3,
> +    POLICY_PROMOTE_TO_RT,
>  };
>
>  static const char *policy_name[] = {
> @@ -42,6 +45,7 @@ static const char *policy_name[] = {
>      [POLICY_NONE_TO_RT]    = "none-to-rt",
>      [POLICY_RESTRICT_TO_BE]    = "restrict-to-be",
>      [POLICY_ALL_TO_IDLE]    = "idle",
> +    [POLICY_PROMOTE_TO_RT]    = "promote-to-rt",
>  };
>
>  static struct blkcg_policy ioprio_policy;
> @@ -189,17 +193,23 @@ void blkcg_set_ioprio(struct bio *bio)
>      if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
>          return;
>
> -    /*
> -     * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
> -     * correspond to a lower priority. Hence, the max_t() below selects
> -     * the lower priority of bi_ioprio and the cgroup I/O priority class.
> -     * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
> -     * priority is assigned to the bio.
> -     */
> -    prio = max_t(u16, bio->bi_ioprio,
> -            IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
> -    if (prio > bio->bi_ioprio)
> -        bio->bi_ioprio = prio;
> +    if (blkcg->prio_policy == PROMOTE_TO_RT) {
> +        if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT)
> +            bio->bi_ioprio = IOPRIO_CLASS_RT;
> +    } else {
> +        /*
> +         * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
> +         * correspond to a lower priority. Hence, the max_t() below
> +         * selects the lower priority of bi_ioprio and the cgroup I/O
> +         * priority class.  If the bio I/O priority equals
> +         * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the
> +         * bio.
> +         */
> +        prio = max_t(u16, bio->bi_ioprio,
> +                 IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
> +        if (prio > bio->bi_ioprio)
> +            bio->bi_ioprio = prio;
> +    }
>  }
>
>  void blk_ioprio_exit(struct gendisk *disk)
>
Hou Tao Feb. 5, 2023, 7:17 a.m. UTC | #10
Hi,

On 2/4/2023 3:51 AM, Bart Van Assche wrote:
> On 1/31/23 20:52, Hou Tao wrote:
>> diff --git a/Documentation/admin-guide/cgroup-v2.rst
>> b/Documentation/admin-guide/cgroup-v2.rst
>> index c8ae7c897f14..e0b9f73ef62a 100644
>> --- a/Documentation/admin-guide/cgroup-v2.rst
>> +++ b/Documentation/admin-guide/cgroup-v2.rst
>> @@ -2038,17 +2038,27 @@ that attribute:
>>       Change the I/O priority class of all requests into IDLE, the lowest
>>       I/O priority class.
>>   +  promote-to-rt
>> +    For requests that have I/O priority class BE or that have I/O priority
>> +        class IDLE, change it into RT. Do not modify the I/O priority class
>> +        of requests that have priority class RT.
>
> Please document whether or not this policy modifies the I/O priority
> (IOPRIO_PRIO_DATA()). Do you agree that the I/O priority should be preserved
> when promoting from BE to RT and that only the I/O priority class should be
> modified for such promotions?
I don't think it is a good idea to keep priority data for BE and IDLE class,
else after the override of bi_ioprio, a priority with IDLE class and high
priority data (e.g., 0) will have higher priority than BE class with low
priority data (e.g., 7). So maybe we should assign the lowest priority data to
the promoted io priority.
>
>>   The following numerical values are associated with the I/O priority policies:
>>   -+-------------+---+
>> -| no-change   | 0 |
>> -+-------------+---+
>> -| none-to-rt  | 1 |
>> -+-------------+---+
>> -| rt-to-be    | 2 |
>> -+-------------+---+
>> -| all-to-idle | 3 |
>> -+-------------+---+
>> +
>> ++---------------+---------+-----+
>> +| policy        | inst    | num |
>> ++---------------+---------+-----+
>> +| no-change     | demote  | 0   |
>> ++---------------+---------+-----+
>> +| none-to-rt    | demote  | 1   |
>> ++---------------+---------+-----+
>> +| rt-to-be      | demote  | 2   |
>> ++---------------+---------+-----+
>> +| idle          | demote  | 3   |
>> ++---------------+---------+-----+
>> +| promote-to-rt | promote | 1   |
>> ++---------------+---------+-----+
>
> I prefer that this table is not modified. The numerical values associated with
> policies only matters for none-to-rt, rt-to-be and all-to-idle but not for
> promote-to-rt. So I don't think that it is necessary to mention a numerical
> value for the promote-to-rt policy. Additionally, "none-to-rt" is not a policy
> that demotes the I/O priority but a policy that may promote the I/O priority.
Yes, this is no need to associate a number with promote-rt policy. Will fix in
v2. "none-to-rt" may promote io priority when the priority if NONE, although for
now bi_ioprio will never be NONE when blkcg_set_ioprio() is called.
>
>> +-- If the instruction is promotion, change the request I/O priority class
>> +-  into the minimum of the I/O priority class policy number and the numerical
>> +-  I/O priority class.
>
> Using the minimum value seems wrong to me because that will change
> IOPRIO_VALUE(IOPRIO_CLASS_RT, 1) into IOPRIO_VALUE(IOPRIO_CLASS_RT, 0).
Yes, you are right. Will fix in v2.
>
> Thanks,
>
> Bart.
Jan Kara Feb. 8, 2023, 1:43 p.m. UTC | #11
On Fri 03-02-23 11:45:32, Bart Van Assche wrote:
> On 2/2/23 17:48, Hou Tao wrote:
> > I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
> > ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
> > used to determinate on how to calculate the final ioprio for bio: choosing the
> > maximum or minimum between blkcg ioprio and original bio bi_ioprio.
> 
> Do the block layer code changes shown below implement the functionality
> that you need?

Just one question guys: So with my a78418e6a04c ("block: Always initialize
bio IO priority on submit") none-to-rt policy became effectively a noop as
Hou properly noticed. Are we aware of any users that were broken by this?
Shouldn't we rather fix the code so that none-to-rt starts to operate
correctly again? Or maybe change the none-to-rt meaning to be actually
promote-to-rt?

I have to admit I'm wondering a bit what was the intended usecase behind
the introduction of none-to-rt policy. Can someone elaborate? promote-to-rt 
makes some sense to me - we have a priviledged cgroup we want to provide
low latency access to IO but none-to-rt just does not make much sense to
me...

								Honza
Bart Van Assche Feb. 8, 2023, 5:53 p.m. UTC | #12
On 2/8/23 05:43, Jan Kara wrote:
> On Fri 03-02-23 11:45:32, Bart Van Assche wrote:
>> On 2/2/23 17:48, Hou Tao wrote:
>>> I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
>>> ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
>>> used to determinate on how to calculate the final ioprio for bio: choosing the
>>> maximum or minimum between blkcg ioprio and original bio bi_ioprio.
>>
>> Do the block layer code changes shown below implement the functionality
>> that you need?
> 
> Just one question guys: So with my a78418e6a04c ("block: Always initialize
> bio IO priority on submit") none-to-rt policy became effectively a noop as
> Hou properly noticed. Are we aware of any users that were broken by this?
> Shouldn't we rather fix the code so that none-to-rt starts to operate
> correctly again? Or maybe change the none-to-rt meaning to be actually
> promote-to-rt?
> 
> I have to admit I'm wondering a bit what was the intended usecase behind
> the introduction of none-to-rt policy. Can someone elaborate? promote-to-rt
> makes some sense to me - we have a priviledged cgroup we want to provide
> low latency access to IO but none-to-rt just does not make much sense to
> me...

Hi Jan,

The test results I shared some time ago show that IOPRIO_CLASS_NONE was 
the default I/O priority two years ago (see also 
https://lore.kernel.org/linux-block/20210927220328.1410161-5-bvanassche@acm.org/). 
The none-to-rt policy increases the priority of bio's that have not been 
assigned an I/O priority to RT. Does this answer your question?

Thanks,

Bart.
Jan Kara Feb. 9, 2023, 8:56 a.m. UTC | #13
On Wed 08-02-23 09:53:41, Bart Van Assche wrote:
> On 2/8/23 05:43, Jan Kara wrote:
> > On Fri 03-02-23 11:45:32, Bart Van Assche wrote:
> > > On 2/2/23 17:48, Hou Tao wrote:
> > > > I don't get it on how to remove IOPRIO_POL_PROMOTION when calculating the final
> > > > ioprio for bio. IOPRIO_POL_PROMOTION is not used for IOPRIO_CLASS values but
> > > > used to determinate on how to calculate the final ioprio for bio: choosing the
> > > > maximum or minimum between blkcg ioprio and original bio bi_ioprio.
> > > 
> > > Do the block layer code changes shown below implement the functionality
> > > that you need?
> > 
> > Just one question guys: So with my a78418e6a04c ("block: Always initialize
> > bio IO priority on submit") none-to-rt policy became effectively a noop as
> > Hou properly noticed. Are we aware of any users that were broken by this?
> > Shouldn't we rather fix the code so that none-to-rt starts to operate
> > correctly again? Or maybe change the none-to-rt meaning to be actually
> > promote-to-rt?
> > 
> > I have to admit I'm wondering a bit what was the intended usecase behind
> > the introduction of none-to-rt policy. Can someone elaborate? promote-to-rt
> > makes some sense to me - we have a priviledged cgroup we want to provide
> > low latency access to IO but none-to-rt just does not make much sense to
> > me...
> 
> Hi Jan,
> 
> The test results I shared some time ago show that IOPRIO_CLASS_NONE was the
> default I/O priority two years ago (see also https://lore.kernel.org/linux-block/20210927220328.1410161-5-bvanassche@acm.org/).
> The none-to-rt policy increases the priority of bio's that have not been
> assigned an I/O priority to RT. Does this answer your question?

Not quite. I know that historically we didn't set bio I/O priority in some
paths (but we did set it in other paths such as some (but not all) direct
IO implementations). But that was exactly a mess because how none-to-rt
actually behaved depended on the exact details of the kernel internal IO
path.  So my question is: Was none-to-rt actually just a misnomer and the
intended behavior was "always override to RT"? Or what was exactly the
expectation around when IO priority is not set and should be overridden?

How should it interact with AIO submissions with IOCB_FLAG_IOPRIO? How
should it interact with task having its IO priority modified with
ioprio_set(2)? And what if task has its normal scheduling priority modified
but that translates into different IO priority (which happens in
__get_task_ioprio())?

So I think that none-to-rt is just poorly defined and if we can just get
rid of it (or redefine to promote-to-rt), that would be good. But maybe I'm
missing some intended usecase...

								Honza
Bart Van Assche Feb. 9, 2023, 7:09 p.m. UTC | #14
On 2/9/23 00:56, Jan Kara wrote:
> On Wed 08-02-23 09:53:41, Bart Van Assche wrote:
>> The test results I shared some time ago show that IOPRIO_CLASS_NONE was the
>> default I/O priority two years ago (see also https://lore.kernel.org/linux-block/20210927220328.1410161-5-bvanassche@acm.org/).
>> The none-to-rt policy increases the priority of bio's that have not been
>> assigned an I/O priority to RT. Does this answer your question?
> 
> Not quite. I know that historically we didn't set bio I/O priority in some
> paths (but we did set it in other paths such as some (but not all) direct
> IO implementations). But that was exactly a mess because how none-to-rt
> actually behaved depended on the exact details of the kernel internal IO
> path.  So my question is: Was none-to-rt actually just a misnomer and the
> intended behavior was "always override to RT"? Or what was exactly the
> expectation around when IO priority is not set and should be overridden?
> 
> How should it interact with AIO submissions with IOCB_FLAG_IOPRIO? How
> should it interact with task having its IO priority modified with
> ioprio_set(2)? And what if task has its normal scheduling priority modified
> but that translates into different IO priority (which happens in
> __get_task_ioprio())?
> 
> So I think that none-to-rt is just poorly defined and if we can just get
> rid of it (or redefine to promote-to-rt), that would be good. But maybe I'm
> missing some intended usecase...

Hi Jan,

We have no plans to use the ioprio_set() system call since it only 
affects foreground I/O and not page cache writeback.

While Android supports io_uring, there are no plans to support libaio in 
the Android C library (Bionic).

Regarding __get_task_ioprio(), I haven't found any code in that function 
that derives an I/O priority from the scheduling priority. Did I perhaps 
overlook something?

Until recently "none-to-rt" meant "if no I/O priority has been assigned 
to a task, use IOPRIO_CLASS_RT". Promoting the I/O priority to 
IOPRIO_CLASS_RT works for us. I'm fine with changing the meaning of 
"none-to-rt" into promoting the I/O priority class to RT. Introducing 
"promote-to-rt" as a synonym of "none-to-rt" is also fine with me.

Thanks,

Bart.
Jan Kara Feb. 10, 2023, 10:12 a.m. UTC | #15
On Thu 09-02-23 11:09:33, Bart Van Assche wrote:
> On 2/9/23 00:56, Jan Kara wrote:
> > On Wed 08-02-23 09:53:41, Bart Van Assche wrote:
> > > The test results I shared some time ago show that IOPRIO_CLASS_NONE was the
> > > default I/O priority two years ago (see also https://lore.kernel.org/linux-block/20210927220328.1410161-5-bvanassche@acm.org/).
> > > The none-to-rt policy increases the priority of bio's that have not been
> > > assigned an I/O priority to RT. Does this answer your question?
> > 
> > Not quite. I know that historically we didn't set bio I/O priority in some
> > paths (but we did set it in other paths such as some (but not all) direct
> > IO implementations). But that was exactly a mess because how none-to-rt
> > actually behaved depended on the exact details of the kernel internal IO
> > path.  So my question is: Was none-to-rt actually just a misnomer and the
> > intended behavior was "always override to RT"? Or what was exactly the
> > expectation around when IO priority is not set and should be overridden?
> > 
> > How should it interact with AIO submissions with IOCB_FLAG_IOPRIO? How
> > should it interact with task having its IO priority modified with
> > ioprio_set(2)? And what if task has its normal scheduling priority modified
> > but that translates into different IO priority (which happens in
> > __get_task_ioprio())?
> > 
> > So I think that none-to-rt is just poorly defined and if we can just get
> > rid of it (or redefine to promote-to-rt), that would be good. But maybe I'm
> > missing some intended usecase...
> 
> Hi Jan,
> 
> We have no plans to use the ioprio_set() system call since it only affects
> foreground I/O and not page cache writeback.
> 
> While Android supports io_uring, there are no plans to support libaio in the
> Android C library (Bionic).
> 
> Regarding __get_task_ioprio(), I haven't found any code in that function
> that derives an I/O priority from the scheduling priority. Did I perhaps
> overlook something?

This condition in __get_task_ioprio():

        if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
                prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
                                         task_nice_ioprio(p));

sets task's IO priority based on scheduling priority.

> Until recently "none-to-rt" meant "if no I/O priority has been assigned to a
> task, use IOPRIO_CLASS_RT". Promoting the I/O priority to IOPRIO_CLASS_RT
> works for us. I'm fine with changing the meaning of "none-to-rt" into
> promoting the I/O priority class to RT. Introducing "promote-to-rt" as a
> synonym of "none-to-rt" is also fine with me.

OK, so it seems we are all in agreement here that "none-to-rt" behavior is
not really needed. Hou, can you perhaps update your patches and the
documentation to make "none-to-rt" just an alias for "promote-to-rt"?
Thanks!

								Honza
Hou Tao Feb. 13, 2023, 12:51 p.m. UTC | #16
Hi Jan,

On 2/10/2023 6:12 PM, Jan Kara wrote:
> On Thu 09-02-23 11:09:33, Bart Van Assche wrote:
>> On 2/9/23 00:56, Jan Kara wrote:
>>> On Wed 08-02-23 09:53:41, Bart Van Assche wrote:
>>>> The test results I shared some time ago show that IOPRIO_CLASS_NONE was the
>>>> default I/O priority two years ago (see also https://lore.kernel.org/linux-block/20210927220328.1410161-5-bvanassche@acm.org/).
>>>> The none-to-rt policy increases the priority of bio's that have not been
>>>> assigned an I/O priority to RT. Does this answer your question?
>>> Not quite. I know that historically we didn't set bio I/O priority in some
>>> paths (but we did set it in other paths such as some (but not all) direct
>>> IO implementations). But that was exactly a mess because how none-to-rt
>>> actually behaved depended on the exact details of the kernel internal IO
>>> path.  So my question is: Was none-to-rt actually just a misnomer and the
>>> intended behavior was "always override to RT"? Or what was exactly the
>>> expectation around when IO priority is not set and should be overridden?
>>>
>>> How should it interact with AIO submissions with IOCB_FLAG_IOPRIO? How
>>> should it interact with task having its IO priority modified with
>>> ioprio_set(2)? And what if task has its normal scheduling priority modified
>>> but that translates into different IO priority (which happens in
>>> __get_task_ioprio())?
>>>
>>> So I think that none-to-rt is just poorly defined and if we can just get
>>> rid of it (or redefine to promote-to-rt), that would be good. But maybe I'm
>>> missing some intended usecase...
>> Hi Jan,
>>
>> We have no plans to use the ioprio_set() system call since it only affects
>> foreground I/O and not page cache writeback.
>>
>> While Android supports io_uring, there are no plans to support libaio in the
>> Android C library (Bionic).
>>
>> Regarding __get_task_ioprio(), I haven't found any code in that function
>> that derives an I/O priority from the scheduling priority. Did I perhaps
>> overlook something?
> This condition in __get_task_ioprio():
>
>         if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
>                 prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
>                                          task_nice_ioprio(p));
>
> sets task's IO priority based on scheduling priority.
>
>> Until recently "none-to-rt" meant "if no I/O priority has been assigned to a
>> task, use IOPRIO_CLASS_RT". Promoting the I/O priority to IOPRIO_CLASS_RT
>> works for us. I'm fine with changing the meaning of "none-to-rt" into
>> promoting the I/O priority class to RT. Introducing "promote-to-rt" as a
>> synonym of "none-to-rt" is also fine with me.
> OK, so it seems we are all in agreement here that "none-to-rt" behavior is
> not really needed. Hou, can you perhaps update your patches and the
> documentation to make "none-to-rt" just an alias for "promote-to-rt"?
> Thanks!
Should I keep "none-to-rt" and make it work just like "promote-to-rt" or should
I just remove "none-to-rt" and add "promote-to-rt" ? I think the latter will be
more appropriate.
>
> 								Honza
Bart Van Assche Feb. 13, 2023, 5:10 p.m. UTC | #17
On 2/13/23 04:51, Hou Tao wrote:
> Should I keep "none-to-rt" and make it work just like "promote-to-rt" or should
> I just remove "none-to-rt" and add "promote-to-rt" ? I think the latter will be
> more appropriate.

Removing none-to-rt would break existing systems that use this policy. I 
prefer the former solution.

Thanks,

Bart.
Jan Kara Feb. 14, 2023, 8:52 a.m. UTC | #18
On Mon 13-02-23 09:10:26, Bart Van Assche wrote:
> On 2/13/23 04:51, Hou Tao wrote:
> > Should I keep "none-to-rt" and make it work just like "promote-to-rt" or should
> > I just remove "none-to-rt" and add "promote-to-rt" ? I think the latter will be
> > more appropriate.
> 
> Removing none-to-rt would break existing systems that use this policy. I
> prefer the former solution.

Agreed. I also think that keeping none-to-rt as an alias for promote-to-rt
allows for a smoother transition. However I'm all for documenting that
none-to-rt is deprecated and works as promote-to-rt.

								Honza
diff mbox series

Patch

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index c8ae7c897f14..e0b9f73ef62a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2038,17 +2038,27 @@  that attribute:
 	Change the I/O priority class of all requests into IDLE, the lowest
 	I/O priority class.
 
+  promote-to-rt
+	For requests that have I/O priority class BE or that have I/O priority
+        class IDLE, change it into RT. Do not modify the I/O priority class
+        of requests that have priority class RT.
+
 The following numerical values are associated with the I/O priority policies:
 
-+-------------+---+
-| no-change   | 0 |
-+-------------+---+
-| none-to-rt  | 1 |
-+-------------+---+
-| rt-to-be    | 2 |
-+-------------+---+
-| all-to-idle | 3 |
-+-------------+---+
+
++---------------+---------+-----+
+| policy        | inst    | num |
++---------------+---------+-----+
+| no-change     | demote  | 0   |
++---------------+---------+-----+
+| none-to-rt    | demote  | 1   |
++---------------+---------+-----+
+| rt-to-be      | demote  | 2   |
++---------------+---------+-----+
+| idle          | demote  | 3   |
++---------------+---------+-----+
+| promote-to-rt | promote | 1   |
++---------------+---------+-----+
 
 The numerical value that corresponds to each I/O priority class is as follows:
 
@@ -2064,9 +2074,13 @@  The numerical value that corresponds to each I/O priority class is as follows:
 
 The algorithm to set the I/O priority class for a request is as follows:
 
-- Translate the I/O priority class policy into a number.
-- Change the request I/O priority class into the maximum of the I/O priority
-  class policy number and the numerical I/O priority class.
+-- Translate the I/O priority class policy into an instruction and a number
+-- If the instruction is demotion, change the request I/O priority class
+-  into the maximum of the I/O priority class policy number and the numerical
+-  I/O priority class.
+-- If the instruction is promotion, change the request I/O priority class
+-  into the minimum of the I/O priority class policy number and the numerical
+-  I/O priority class.
 
 PID
 ---
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 8bb6b8eba4ce..0d400bee9c72 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -20,6 +20,13 @@ 
 #include "blk-ioprio.h"
 #include "blk-rq-qos.h"
 
+/*
+ * Upper 16-bits are reserved for special flags.
+ *
+ * @IOPRIO_POL_PROMOTION: Promote bi_ioprio instead of demote it.
+ */
+#define IOPRIO_POL_PROMOTION (1U << 17)
+
 /**
  * enum prio_policy - I/O priority class policy.
  * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
@@ -27,21 +34,30 @@ 
  * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
  *		IOPRIO_CLASS_BE.
  * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
- *
+ * @POLICY_PROMOTE_TO_RT: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_BE into
+ * 		IOPRIO_CLASS_RT.
  * See also <linux/ioprio.h>.
  */
 enum prio_policy {
-	POLICY_NO_CHANGE	= 0,
-	POLICY_NONE_TO_RT	= 1,
-	POLICY_RESTRICT_TO_BE	= 2,
-	POLICY_ALL_TO_IDLE	= 3,
+	POLICY_NO_CHANGE	= IOPRIO_CLASS_NONE,
+	POLICY_NONE_TO_RT	= IOPRIO_CLASS_RT,
+	POLICY_RESTRICT_TO_BE	= IOPRIO_CLASS_BE,
+	POLICY_ALL_TO_IDLE	= IOPRIO_CLASS_IDLE,
+	POLICY_PROMOTE_TO_RT	= IOPRIO_CLASS_RT | IOPRIO_POL_PROMOTION,
+};
+
+struct ioprio_policy_tuple {
+	const char *name;
+	enum prio_policy policy;
 };
 
-static const char *policy_name[] = {
-	[POLICY_NO_CHANGE]	= "no-change",
-	[POLICY_NONE_TO_RT]	= "none-to-rt",
-	[POLICY_RESTRICT_TO_BE]	= "restrict-to-be",
-	[POLICY_ALL_TO_IDLE]	= "idle",
+/* ioprio_alloc_cpd() needs POLICY_NO_CHANGE to be the first policy */
+static const struct ioprio_policy_tuple ioprio_policies[] = {
+	{ "no-change",		POLICY_NO_CHANGE },
+	{ "none-to-rt",		POLICY_NONE_TO_RT },
+	{ "restrict-to-be",	POLICY_RESTRICT_TO_BE },
+	{ "idle",		POLICY_ALL_TO_IDLE },
+	{ "promote-to-rt",	POLICY_PROMOTE_TO_RT }
 };
 
 static struct blkcg_policy ioprio_policy;
@@ -57,11 +73,11 @@  struct ioprio_blkg {
 /**
  * struct ioprio_blkcg - Per cgroup data.
  * @cpd: blkcg_policy_data structure.
- * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
+ * @ioprio: Policy name and definition.
  */
 struct ioprio_blkcg {
 	struct blkcg_policy_data cpd;
-	enum prio_policy	 prio_policy;
+	const struct ioprio_policy_tuple *ioprio;
 };
 
 static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -95,23 +111,35 @@  static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
 {
 	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
 
-	seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
+	seq_printf(sf, "%s\n", blkcg->ioprio->name);
 	return 0;
 }
 
+static const struct ioprio_policy_tuple *ioprio_match_policy(const char *buf)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ioprio_policies); i++) {
+		if (sysfs_streq(ioprio_policies[i].name, buf))
+			return &ioprio_policies[i];
+	}
+
+	return NULL;
+}
+
 static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
 				      size_t nbytes, loff_t off)
 {
 	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
-	int ret;
+	const struct ioprio_policy_tuple *ioprio;
 
 	if (off != 0)
 		return -EIO;
 	/* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
-	ret = sysfs_match_string(policy_name, buf);
-	if (ret < 0)
-		return ret;
-	blkcg->prio_policy = ret;
+	ioprio = ioprio_match_policy(buf);
+	if (!ioprio)
+		return -EINVAL;
+	blkcg->ioprio = ioprio;
 	return nbytes;
 }
 
@@ -141,7 +169,7 @@  static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
 	blkcg = kzalloc(sizeof(*blkcg), gfp);
 	if (!blkcg)
 		return NULL;
-	blkcg->prio_policy = POLICY_NO_CHANGE;
+	blkcg->ioprio = &ioprio_policies[0];
 	return &blkcg->cpd;
 }
 
@@ -186,20 +214,30 @@  void blkcg_set_ioprio(struct bio *bio)
 	struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
 	u16 prio;
 
-	if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
+	if (!blkcg || blkcg->ioprio->policy == POLICY_NO_CHANGE)
 		return;
 
+	WARN_ON_ONCE(bio->bi_ioprio == IOPRIO_CLASS_NONE);
+
 	/*
 	 * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
-	 * correspond to a lower priority. Hence, the max_t() below selects
-	 * the lower priority of bi_ioprio and the cgroup I/O priority class.
-	 * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
-	 * priority is assigned to the bio.
+	 * correspond to a lower priority.
+	 *
+	 * When IOPRIO_POL_PROMOTION is enabled, the min_t() below selects
+	 * the higher priority of bi_ioprio and the cgroup I/O priority class,
+	 * otherwise the lower priority is selected.
 	 */
-	prio = max_t(u16, bio->bi_ioprio,
-			IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
-	if (prio > bio->bi_ioprio)
-		bio->bi_ioprio = prio;
+	if (blkcg->ioprio->policy & IOPRIO_POL_PROMOTION) {
+		prio = min_t(u16, bio->bi_ioprio,
+				IOPRIO_PRIO_VALUE(blkcg->ioprio->policy, 0));
+		if (prio < bio->bi_ioprio)
+			bio->bi_ioprio = prio;
+	} else {
+		prio = max_t(u16, bio->bi_ioprio,
+				IOPRIO_PRIO_VALUE(blkcg->ioprio->policy, 0));
+		if (prio > bio->bi_ioprio)
+			bio->bi_ioprio = prio;
+	}
 }
 
 void blk_ioprio_exit(struct gendisk *disk)