Message ID | 20240313125457.19475-1-ivecera@redhat.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net] i40e: Enforce software interrupt during busy-poll exit | expand |
On Wed, Mar 13, 2024 at 1:55 PM Ivan Vecera <ivecera@redhat.com> wrote: > diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h > index 9b701615c7c6..4d2b05de6c63 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e.h > +++ b/drivers/net/ethernet/intel/i40e/i40e.h > @@ -908,6 +908,7 @@ struct i40e_q_vector { > struct rcu_head rcu; /* to avoid race with update stats on free */ > char name[I40E_INT_NAME_STR_LEN]; > bool arm_wb_state; > + bool in_busy_poll; > int irq_num; /* IRQ assigned to this q_vector */ > } ____cacheline_internodealigned_in_smp; > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c > index 89a3401d20ab..1ea6d06b0acc 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -3915,6 +3915,12 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) > q_vector->tx.target_itr >> 1); > q_vector->tx.current_itr = q_vector->tx.target_itr; > > + /* Set ITR for software interrupts triggered after exiting > + * busy-loop polling. > + */ > + wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), > + I40E_ITR_20K); > + > wr32(hw, I40E_PFINT_RATEN(vector - 1), > i40e_intrl_usec_to_reg(vsi->int_rate_limit)); > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h > index 14ab642cafdb..baa6bb68bcf8 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_register.h > +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h > @@ -335,6 +335,8 @@ > #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 > #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 > #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) > +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 > +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) > #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ > #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 > #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > index 0d7177083708..356c3140adf3 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > @@ -2658,8 +2658,22 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) > return val; > } > > -/* a small macro to shorten up some long lines */ > -#define INTREG I40E_PFINT_DYN_CTLN > +static inline u32 i40e_buildreg_swint(int type) > +{ > + u32 val; > + > + /* 1. Enable the interrupt > + * 2. Do not modify any ITR interval > + * 3. Trigger a SW interrupt specified by type > + */ > + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | > + I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */ > + I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | > + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | > + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, type); > + > + return val; > +} This function is called only from one place and with a constant argument. Does it really need to be a function, as opposed to a constant? Or are you going to add more callers soon? > > /* The act of updating the ITR will cause it to immediately trigger. In order > * to prevent this from throwing off adaptive update statistics we defer the > @@ -2702,8 +2716,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, > */ > if (q_vector->rx.target_itr < q_vector->rx.current_itr) { > /* Rx ITR needs to be reduced, this is highest priority */ > - intval = i40e_buildreg_itr(I40E_RX_ITR, > - q_vector->rx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), > + q_vector->rx.target_itr >> 1); > q_vector->rx.current_itr = q_vector->rx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || > @@ -2712,25 +2726,33 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, > /* Tx ITR needs to be reduced, this is second priority > * Tx ITR needs to be increased more than Rx, fourth priority > */ > - intval = i40e_buildreg_itr(I40E_TX_ITR, > - q_vector->tx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, q_vector->reg_idx), > + q_vector->tx.target_itr >> 1); > q_vector->tx.current_itr = q_vector->tx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { > /* Rx ITR needs to be increased, third priority */ > - intval = i40e_buildreg_itr(I40E_RX_ITR, > - q_vector->rx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), > + q_vector->rx.target_itr >> 1); > q_vector->rx.current_itr = q_vector->rx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else { > /* No ITR update, lowest priority */ > - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); > if (q_vector->itr_countdown) > q_vector->itr_countdown--; > } > > - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) > - wr32(hw, INTREG(q_vector->reg_idx), intval); > + /* Do not enable interrupt if VSI is down */ > + if (test_bit(__I40E_VSI_DOWN, vsi->state)) > + return; > + > + if (!q_vector->in_busy_poll) { > + intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); > + } else { > + q_vector->in_busy_poll = false; > + intval = i40e_buildreg_swint(I40E_SW_ITR); > + } > + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), intval); > } > > /** > @@ -2845,6 +2867,8 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) > */ > if (likely(napi_complete_done(napi, work_done))) > i40e_update_enable_itr(vsi, q_vector); > + else > + q_vector->in_busy_poll = true; > > return min(work_done, budget - 1); > } > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h > index abf15067eb5d..2cdc7de6301c 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h > @@ -68,6 +68,7 @@ enum i40e_dyn_idx { > /* these are indexes into ITRN registers */ > #define I40E_RX_ITR I40E_IDX_ITR0 > #define I40E_TX_ITR I40E_IDX_ITR1 > +#define I40E_SW_ITR I40E_IDX_ITR2 > > /* Supported RSS offloads */ > #define I40E_DEFAULT_RSS_HENA ( \ > -- > 2.43.0 >
On 13. 03. 24 14:47, Michal Schmidt wrote: >> -/* a small macro to shorten up some long lines */ >> -#define INTREG I40E_PFINT_DYN_CTLN >> +static inline u32 i40e_buildreg_swint(int type) >> +{ >> + u32 val; >> + >> + /* 1. Enable the interrupt >> + * 2. Do not modify any ITR interval >> + * 3. Trigger a SW interrupt specified by type >> + */ >> + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | >> + I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */ >> + I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | >> + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | >> + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, type); >> + >> + return val; >> +} > This function is called only from one place and with a constant > argument. Does it really need to be a function, as opposed to a > constant? Or are you going to add more callers soon? This can be reused also from i40e_force_wb() but I didn't want to make such refactors in this fix. Lets do it later in -next. Ivan
On 3/13/2024 5:54 AM, Ivan Vecera wrote: > As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts > during poll exit") I'm seeing the similar issue also with i40e driver. > > In certain situation when busy-loop is enabled together with adaptive > coalescing, the driver occasionally miss that there are outstanding > descriptors to clean when exiting busy poll. > > Try to catch the remaining work by triggering a software interrupt > when exiting busy poll. No extra interrupts will be generated when > busy polling is not used. > > The issue was found when running sockperf ping-pong tcp test with > adaptive coalescing and busy poll enabled (50 as value busy_pool > and busy_read sysctl knobs) and results in huge latency spikes > with more than 100000us. I like the results of this fix! Thanks for working on it. > > The fix is inspired from the ice driver and do the following: > 1) During napi poll exit in case of busy-poll (napo_complete_done() > returns false) this is recorded to q_vector that we were in busy > loop. > 2) In i40e_update_enable_itr() > - updates refreshed ITR intervals directly using PFINT_ITRN register > - if we are exiting ordinary poll then just enables the interrupt > using PFINT_DYN_CTLN > - if we are exiting busy poll then enables the interrupt and > additionally triggers an immediate software interrupt to catch any > pending clean-ups > 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to > 20K interrupts per second to limit the number of these sw interrupts. This is a good idea. > > @@ -2702,8 +2716,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, > */ > if (q_vector->rx.target_itr < q_vector->rx.current_itr) { > /* Rx ITR needs to be reduced, this is highest priority */ > - intval = i40e_buildreg_itr(I40E_RX_ITR, > - q_vector->rx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), > + q_vector->rx.target_itr >> 1); so here you write (this is a new write) > q_vector->rx.current_itr = q_vector->rx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || > @@ -2712,25 +2726,33 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, > /* Tx ITR needs to be reduced, this is second priority > * Tx ITR needs to be increased more than Rx, fourth priority > */ > - intval = i40e_buildreg_itr(I40E_TX_ITR, > - q_vector->tx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, q_vector->reg_idx), > + q_vector->tx.target_itr >> 1); > q_vector->tx.current_itr = q_vector->tx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { > /* Rx ITR needs to be increased, third priority */ > - intval = i40e_buildreg_itr(I40E_RX_ITR, > - q_vector->rx.target_itr); > + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), > + q_vector->rx.target_itr >> 1); or here (new write) > q_vector->rx.current_itr = q_vector->rx.target_itr; > q_vector->itr_countdown = ITR_COUNTDOWN_START; > } else { > /* No ITR update, lowest priority */ > - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); > if (q_vector->itr_countdown) > q_vector->itr_countdown--; > } > > - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) > - wr32(hw, INTREG(q_vector->reg_idx), intval); The above used to be the *only* write. > + /* Do not enable interrupt if VSI is down */ > + if (test_bit(__I40E_VSI_DOWN, vsi->state)) > + return; > + > + if (!q_vector->in_busy_poll) { > + intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); > + } else { > + q_vector->in_busy_poll = false; > + intval = i40e_buildreg_swint(I40E_SW_ITR); > + } > + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), intval); and then you write again here. So this function will now regularly have two writes in hot-path. Before it was very carefully crafted to reduce the number of writes to 1. This is made possible because the PFINT_DYN_CTLN register can do multiple tasks at once with a single write. Can you just modify intval to *both* trigger a software interrupt, and update the ITR simultaneously? I'm really not sure that's even possible. It may make more sense to only do the second write when exiting busy poll, what do you think? If there is no way to get the functionality without the two writes everytime I'd put up with it, but we can have a lot of interrupts per second, per queue (especially with adaptive disabled and rate set to a small number like 2us or 0) and doubling the number of writes will have a performance effect.
On 15. 03. 24 1:53, Jesse Brandeburg wrote: > On 3/13/2024 5:54 AM, Ivan Vecera wrote: >> As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts >> during poll exit") I'm seeing the similar issue also with i40e driver. >> >> In certain situation when busy-loop is enabled together with adaptive >> coalescing, the driver occasionally miss that there are outstanding >> descriptors to clean when exiting busy poll. >> >> Try to catch the remaining work by triggering a software interrupt >> when exiting busy poll. No extra interrupts will be generated when >> busy polling is not used. >> >> The issue was found when running sockperf ping-pong tcp test with >> adaptive coalescing and busy poll enabled (50 as value busy_pool >> and busy_read sysctl knobs) and results in huge latency spikes >> with more than 100000us. > > I like the results of this fix! Thanks for working on it. > >> >> The fix is inspired from the ice driver and do the following: >> 1) During napi poll exit in case of busy-poll (napo_complete_done() >> returns false) this is recorded to q_vector that we were in busy >> loop. >> 2) In i40e_update_enable_itr() >> - updates refreshed ITR intervals directly using PFINT_ITRN register >> - if we are exiting ordinary poll then just enables the interrupt >> using PFINT_DYN_CTLN >> - if we are exiting busy poll then enables the interrupt and >> additionally triggers an immediate software interrupt to catch any >> pending clean-ups >> 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to >> 20K interrupts per second to limit the number of these sw interrupts. > > This is a good idea. > >> >> @@ -2702,8 +2716,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, >> */ >> if (q_vector->rx.target_itr < q_vector->rx.current_itr) { >> /* Rx ITR needs to be reduced, this is highest priority */ >> - intval = i40e_buildreg_itr(I40E_RX_ITR, >> - q_vector->rx.target_itr); >> + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), >> + q_vector->rx.target_itr >> 1); > > so here you write (this is a new write) > >> q_vector->rx.current_itr = q_vector->rx.target_itr; >> q_vector->itr_countdown = ITR_COUNTDOWN_START; >> } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || >> @@ -2712,25 +2726,33 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, >> /* Tx ITR needs to be reduced, this is second priority >> * Tx ITR needs to be increased more than Rx, fourth priority >> */ >> - intval = i40e_buildreg_itr(I40E_TX_ITR, >> - q_vector->tx.target_itr); >> + wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, q_vector->reg_idx), >> + q_vector->tx.target_itr >> 1); >> q_vector->tx.current_itr = q_vector->tx.target_itr; >> q_vector->itr_countdown = ITR_COUNTDOWN_START; >> } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { >> /* Rx ITR needs to be increased, third priority */ >> - intval = i40e_buildreg_itr(I40E_RX_ITR, >> - q_vector->rx.target_itr); >> + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), >> + q_vector->rx.target_itr >> 1); > > or here (new write) > >> q_vector->rx.current_itr = q_vector->rx.target_itr; >> q_vector->itr_countdown = ITR_COUNTDOWN_START; >> } else { >> /* No ITR update, lowest priority */ >> - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); >> if (q_vector->itr_countdown) >> q_vector->itr_countdown--; >> } >> >> - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) >> - wr32(hw, INTREG(q_vector->reg_idx), intval); > > The above used to be the *only* write. > >> + /* Do not enable interrupt if VSI is down */ >> + if (test_bit(__I40E_VSI_DOWN, vsi->state)) >> + return; >> + >> + if (!q_vector->in_busy_poll) { >> + intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); >> + } else { >> + q_vector->in_busy_poll = false; >> + intval = i40e_buildreg_swint(I40E_SW_ITR); >> + } >> + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), intval); > > and then you write again here. > > So this function will now regularly have two writes in hot-path. Before > it was very carefully crafted to reduce the number of writes to 1. > > This is made possible because the PFINT_DYN_CTLN register can do > multiple tasks at once with a single write. > > Can you just modify intval to *both* trigger a software interrupt, and > update the ITR simultaneously? I'm really not sure that's even possible. > > It may make more sense to only do the second write when exiting busy > poll, what do you think? Yeah, you are right, we can eliminate these two writes by one and also for busy-poll exit. I'm setting up ITR2_IDX rate during MSI-X initialization and as this is fixed we do not need to update it everytime in i40e_update_enable_itr(). Per datasheet the PFINT_DYN_CTLN value can be encoded to do the following at once: - enable interrupt - update interval for particular ITR index - software interrupt trigger limited by interval of different ITR index Will prepare, test and submit v3 with this change. Thanks, Ivan
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 9b701615c7c6..4d2b05de6c63 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -908,6 +908,7 @@ struct i40e_q_vector { struct rcu_head rcu; /* to avoid race with update stats on free */ char name[I40E_INT_NAME_STR_LEN]; bool arm_wb_state; + bool in_busy_poll; int irq_num; /* IRQ assigned to this q_vector */ } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 89a3401d20ab..1ea6d06b0acc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3915,6 +3915,12 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; + /* Set ITR for software interrupts triggered after exiting + * busy-loop polling. + */ + wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), + I40E_ITR_20K); + wr32(hw, I40E_PFINT_RATEN(vector - 1), i40e_intrl_usec_to_reg(vsi->int_rate_limit)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 14ab642cafdb..baa6bb68bcf8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -335,6 +335,8 @@ #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708..356c3140adf3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2658,8 +2658,22 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) return val; } -/* a small macro to shorten up some long lines */ -#define INTREG I40E_PFINT_DYN_CTLN +static inline u32 i40e_buildreg_swint(int type) +{ + u32 val; + + /* 1. Enable the interrupt + * 2. Do not modify any ITR interval + * 3. Trigger a SW interrupt specified by type + */ + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | + I40E_PFINT_DYN_CTLN_ITR_INDX_MASK | /* set noitr */ + I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, type); + + return val; +} /* The act of updating the ITR will cause it to immediately trigger. In order * to prevent this from throwing off adaptive update statistics we defer the @@ -2702,8 +2716,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, */ if (q_vector->rx.target_itr < q_vector->rx.current_itr) { /* Rx ITR needs to be reduced, this is highest priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), + q_vector->rx.target_itr >> 1); q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || @@ -2712,25 +2726,33 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, /* Tx ITR needs to be reduced, this is second priority * Tx ITR needs to be increased more than Rx, fourth priority */ - intval = i40e_buildreg_itr(I40E_TX_ITR, - q_vector->tx.target_itr); + wr32(hw, I40E_PFINT_ITRN(I40E_TX_ITR, q_vector->reg_idx), + q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { /* Rx ITR needs to be increased, third priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + wr32(hw, I40E_PFINT_ITRN(I40E_RX_ITR, q_vector->reg_idx), + q_vector->rx.target_itr >> 1); q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else { /* No ITR update, lowest priority */ - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); if (q_vector->itr_countdown) q_vector->itr_countdown--; } - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) - wr32(hw, INTREG(q_vector->reg_idx), intval); + /* Do not enable interrupt if VSI is down */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + if (!q_vector->in_busy_poll) { + intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); + } else { + q_vector->in_busy_poll = false; + intval = i40e_buildreg_swint(I40E_SW_ITR); + } + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), intval); } /** @@ -2845,6 +2867,8 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) */ if (likely(napi_complete_done(napi, work_done))) i40e_update_enable_itr(vsi, q_vector); + else + q_vector->in_busy_poll = true; return min(work_done, budget - 1); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index abf15067eb5d..2cdc7de6301c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -68,6 +68,7 @@ enum i40e_dyn_idx { /* these are indexes into ITRN registers */ #define I40E_RX_ITR I40E_IDX_ITR0 #define I40E_TX_ITR I40E_IDX_ITR1 +#define I40E_SW_ITR I40E_IDX_ITR2 /* Supported RSS offloads */ #define I40E_DEFAULT_RSS_HENA ( \
As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts during poll exit") I'm seeing the similar issue also with i40e driver. In certain situation when busy-loop is enabled together with adaptive coalescing, the driver occasionally miss that there are outstanding descriptors to clean when exiting busy poll. Try to catch the remaining work by triggering a software interrupt when exiting busy poll. No extra interrupts will be generated when busy polling is not used. The issue was found when running sockperf ping-pong tcp test with adaptive coalescing and busy poll enabled (50 as value busy_pool and busy_read sysctl knobs) and results in huge latency spikes with more than 100000us. The fix is inspired from the ice driver and do the following: 1) During napi poll exit in case of busy-poll (napo_complete_done() returns false) this is recorded to q_vector that we were in busy loop. 2) In i40e_update_enable_itr() - updates refreshed ITR intervals directly using PFINT_ITRN register - if we are exiting ordinary poll then just enables the interrupt using PFINT_DYN_CTLN - if we are exiting busy poll then enables the interrupt and additionally triggers an immediate software interrupt to catch any pending clean-ups 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to 20K interrupts per second to limit the number of these sw interrupts. Test results ============ Prior: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2438563; ReceivedMessages=2438562 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2429473; ReceivedMessages=2429473 sockperf: ====> avg-latency=24.571 (std-dev=93.297, mean-ad=4.904, median-ad=1.510, siqr=1.063, cv=3.797, std-error=0.060, 99.0% ci=[24.417, 24.725]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.571 usec sockperf: Total 2429473 observations; each percentile contains 24294.73 observations sockperf: ---> <MAX> observation = 103294.331 sockperf: ---> percentile 99.999 = 45.633 sockperf: ---> percentile 99.990 = 37.013 sockperf: ---> percentile 99.900 = 35.910 sockperf: ---> percentile 99.000 = 33.390 sockperf: ---> percentile 90.000 = 28.626 sockperf: ---> percentile 75.000 = 27.741 sockperf: ---> percentile 50.000 = 26.743 sockperf: ---> percentile 25.000 = 25.614 sockperf: ---> <MIN> observation = 12.220 After: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2400055; ReceivedMessages=2400054 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2391186; ReceivedMessages=2391186 sockperf: ====> avg-latency=24.965 (std-dev=5.934, mean-ad=4.642, median-ad=1.485, siqr=1.067, cv=0.238, std-error=0.004, 99.0% ci=[24.955, 24.975]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.965 usec sockperf: Total 2391186 observations; each percentile contains 23911.86 observations sockperf: ---> <MAX> observation = 195.841 sockperf: ---> percentile 99.999 = 45.026 sockperf: ---> percentile 99.990 = 39.009 sockperf: ---> percentile 99.900 = 35.922 sockperf: ---> percentile 99.000 = 33.482 sockperf: ---> percentile 90.000 = 28.902 sockperf: ---> percentile 75.000 = 27.821 sockperf: ---> percentile 50.000 = 26.860 sockperf: ---> percentile 25.000 = 25.685 sockperf: ---> <MIN> observation = 12.277 Reported-by: Hugo Ferreira <hferreir@redhat.com> Signed-off-by: Ivan Vecera <ivecera@redhat.com> --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 6 +++ .../net/ethernet/intel/i40e/i40e_register.h | 2 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 46 ++++++++++++++----- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + 5 files changed, 45 insertions(+), 11 deletions(-)