Message ID | 1358914330-3768-1-git-send-email-Frank.Li@freescale.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, 2013-01-23 at 12:12 +0800, Frank Li wrote: > Add napi support > > Before this patch > > iperf -s -i 1 > ------------------------------------------------------------ > Server listening on TCP port 5001 > TCP window size: 85.3 KByte (default) > ------------------------------------------------------------ > [ 4] local 10.192.242.153 port 5001 connected with 10.192.242.138 port 50004 > [ ID] Interval Transfer Bandwidth > [ 4] 0.0- 1.0 sec 41.2 MBytes 345 Mbits/sec > [ 4] 1.0- 2.0 sec 43.7 MBytes 367 Mbits/sec > [ 4] 2.0- 3.0 sec 42.8 MBytes 359 Mbits/sec > [ 4] 3.0- 4.0 sec 43.7 MBytes 367 Mbits/sec > [ 4] 4.0- 5.0 sec 42.7 MBytes 359 Mbits/sec > [ 4] 5.0- 6.0 sec 43.8 MBytes 367 Mbits/sec > [ 4] 6.0- 7.0 sec 43.0 MBytes 361 Mbits/sec > > After this patch > [ 4] 2.0- 3.0 sec 51.6 MBytes 433 Mbits/sec > [ 4] 3.0- 4.0 sec 51.8 MBytes 435 Mbits/sec > [ 4] 4.0- 5.0 sec 52.2 MBytes 438 Mbits/sec > [ 4] 5.0- 6.0 sec 52.1 MBytes 437 Mbits/sec > [ 4] 6.0- 7.0 sec 52.1 MBytes 437 Mbits/sec > [ 4] 7.0- 8.0 sec 52.3 MBytes 439 Mbits/sec Strange, as you still call netif_rx() NAPI should call netif_receive_skb() instead
On Wed, Jan 23, 2013 at 12:12:10PM +0800, Frank Li wrote: [...] > diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c > index f52ba33..8fa420c 100644 > --- a/drivers/net/ethernet/freescale/fec.c > +++ b/drivers/net/ethernet/freescale/fec.c > @@ -67,6 +67,8 @@ > #endif > > #define DRIVER_NAME "fec" > +#define FEC_NAPI_WEIGHT 64 > +#define INT32_MAX 0x7FFFFFFF This seems awfully big as an upper-limit to your ISR processing. See below for more comments. > > /* Pause frame feild and FIFO threshold */ > #define FEC_ENET_FCE (1 << 5) > @@ -565,6 +567,20 @@ fec_timeout(struct net_device *ndev) > } > > static void > +fec_enet_rx_int_is_enabled(struct net_device *ndev, bool enabled) > +{ > + struct fec_enet_private *fep = netdev_priv(ndev); > + uint int_events; > + > + int_events = readl(fep->hwp + FEC_IMASK); > + if (enabled) > + int_events |= FEC_ENET_RXF; > + else > + int_events &= ~FEC_ENET_RXF; > + writel(int_events, fep->hwp + FEC_IMASK); > +} > + > +static void > fec_enet_tx(struct net_device *ndev) > { > struct fec_enet_private *fep; > @@ -656,8 +672,8 @@ fec_enet_tx(struct net_device *ndev) > * not been given to the system, we just set the empty indicator, > * effectively tossing the packet. > */ > -static void > -fec_enet_rx(struct net_device *ndev) > +static int > +fec_enet_rx(struct net_device *ndev, int budget) > { > struct fec_enet_private *fep = netdev_priv(ndev); > const struct platform_device_id *id_entry = > @@ -667,13 +683,12 @@ fec_enet_rx(struct net_device *ndev) > struct sk_buff *skb; > ushort pkt_len; > __u8 *data; > + int pkt_received = 0; > > #ifdef CONFIG_M532x > flush_cache_all(); > #endif > > - spin_lock(&fep->hw_lock); > - > /* First, grab all of the stats for the incoming packet. > * These get messed up if we get called due to a busy condition. > */ > @@ -681,6 +696,10 @@ fec_enet_rx(struct net_device *ndev) > > while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { > > + if (pkt_received >= budget) > + break; > + pkt_received++; > + This seems wrong given how it's called from the ISR when NAPI isn't being used. If you have continuous packets coming in, you won't exit your ISR and release your spin_lock until you hit INT32_MAX. > /* Since we have allocated space to hold a complete frame, > * the last indicator should be set. > */ > @@ -796,7 +815,7 @@ rx_processing_done: > } > fep->cur_rx = bdp; > > - spin_unlock(&fep->hw_lock); > + return pkt_received; > } > > static irqreturn_t > @@ -805,6 +824,7 @@ fec_enet_interrupt(int irq, void *dev_id) > struct net_device *ndev = dev_id; > struct fec_enet_private *fep = netdev_priv(ndev); > uint int_events; > + ulong flags; > irqreturn_t ret = IRQ_NONE; > > do { > @@ -813,7 +833,18 @@ fec_enet_interrupt(int irq, void *dev_id) > > if (int_events & FEC_ENET_RXF) { > ret = IRQ_HANDLED; > - fec_enet_rx(ndev); > + spin_lock_irqsave(&fep->hw_lock, flags); > + > + if (fep->use_napi) { > + /* Disable the RX interrupt */ > + if (napi_schedule_prep(&fep->napi)) { > + fec_enet_rx_int_is_enabled(ndev, false); > + __napi_schedule(&fep->napi); > + } > + } else > + fec_enet_rx(ndev, INT32_MAX); As mentioned above, you may want to check this a bit closer since this can run for a very, very long time if you have constant packets flowing when not using NAPI. Also, the code formatting here should also have braces around the else clause since the if clause has them. > + > + spin_unlock_irqrestore(&fep->hw_lock, flags); > } > > /* Transmit OK, or non-fatal error. Update the buffer > @@ -834,7 +865,16 @@ fec_enet_interrupt(int irq, void *dev_id) > return ret; > } > > - > +static int fec_enet_rx_napi(struct napi_struct *napi, int budget) > +{ > + struct net_device *ndev = napi->dev; > + int pkgs = fec_enet_rx(ndev, budget); > + if (pkgs < budget) { > + napi_complete(napi); > + fec_enet_rx_int_is_enabled(ndev, true); > + } > + return pkgs; > +} > > /* ------------------------------------------------------------------------- */ > static void fec_get_mac(struct net_device *ndev) > @@ -1392,6 +1432,9 @@ fec_enet_open(struct net_device *ndev) > struct fec_enet_private *fep = netdev_priv(ndev); > int ret; > > + if (fep->use_napi) > + napi_enable(&fep->napi); > + > /* I should reset the ring buffers here, but I don't yet know > * a simple way to do that. > */ > @@ -1604,6 +1647,11 @@ static int fec_enet_init(struct net_device *ndev) > ndev->netdev_ops = &fec_netdev_ops; > ndev->ethtool_ops = &fec_enet_ethtool_ops; > > + if (fep->use_napi) { > + fec_enet_rx_int_is_enabled(ndev, false); > + netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, fep->napi_weight); Make sure not to exceed 80 characters here. > + } > + > /* Initialize the receive buffer descriptors. */ > bdp = fep->rx_bd_base; > for (i = 0; i < RX_RING_SIZE; i++) { > @@ -1698,6 +1746,7 @@ fec_probe(struct platform_device *pdev) > static int dev_id; > struct pinctrl *pinctrl; > struct regulator *reg_phy; > + struct device_node *np = pdev->dev.of_node; > > of_id = of_match_device(fec_dt_ids, &pdev->dev); > if (of_id) > @@ -1811,6 +1860,11 @@ fec_probe(struct platform_device *pdev) > } > } > > + fep->use_napi = !of_property_read_bool(np, "disable_napi"); > + > + if (of_property_read_u32(np, "napi_weight", &fep->napi_weight)) > + fep->napi_weight = FEC_NAPI_WEIGHT; /*using default value*/ > + > fec_reset_phy(pdev); > > ret = fec_enet_init(ndev); > diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h > index 2ebedaf..31fcdd0 100644 > --- a/drivers/net/ethernet/freescale/fec.h > +++ b/drivers/net/ethernet/freescale/fec.h > @@ -249,6 +249,10 @@ struct fec_enet_private { > int bufdesc_ex; > int pause_flag; > > + struct napi_struct napi; > + int napi_weight; > + bool use_napi; > + > struct ptp_clock *ptp_clock; > struct ptp_clock_info ptp_caps; > unsigned long last_overflow_check; > -- > 1.7.1 > > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 23, 2013 at 12:12:10PM +0800, Frank Li wrote: > Add napi support > > Before this patch > > iperf -s -i 1 > ------------------------------------------------------------ > Server listening on TCP port 5001 > TCP window size: 85.3 KByte (default) > ------------------------------------------------------------ > [ 4] local 10.192.242.153 port 5001 connected with 10.192.242.138 port 50004 > [ ID] Interval Transfer Bandwidth > [ 4] 0.0- 1.0 sec 41.2 MBytes 345 Mbits/sec > [ 4] 1.0- 2.0 sec 43.7 MBytes 367 Mbits/sec > [ 4] 2.0- 3.0 sec 42.8 MBytes 359 Mbits/sec > [ 4] 3.0- 4.0 sec 43.7 MBytes 367 Mbits/sec > [ 4] 4.0- 5.0 sec 42.7 MBytes 359 Mbits/sec > [ 4] 5.0- 6.0 sec 43.8 MBytes 367 Mbits/sec > [ 4] 6.0- 7.0 sec 43.0 MBytes 361 Mbits/sec > > After this patch > [ 4] 2.0- 3.0 sec 51.6 MBytes 433 Mbits/sec > [ 4] 3.0- 4.0 sec 51.8 MBytes 435 Mbits/sec > [ 4] 4.0- 5.0 sec 52.2 MBytes 438 Mbits/sec > [ 4] 5.0- 6.0 sec 52.1 MBytes 437 Mbits/sec > [ 4] 6.0- 7.0 sec 52.1 MBytes 437 Mbits/sec > [ 4] 7.0- 8.0 sec 52.3 MBytes 439 Mbits/sec > > Signed-off-by: Frank Li <Frank.Li@freescale.com> > Signed-off-by: Fugang Duan <B38611@freescale.com> > --- > drivers/net/ethernet/freescale/fec.c | 68 ++++++++++++++++++++++++++++++--- > drivers/net/ethernet/freescale/fec.h | 4 ++ > 2 files changed, 65 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c > index f52ba33..8fa420c 100644 > --- a/drivers/net/ethernet/freescale/fec.c > +++ b/drivers/net/ethernet/freescale/fec.c > @@ -67,6 +67,8 @@ > #endif > > #define DRIVER_NAME "fec" > +#define FEC_NAPI_WEIGHT 64 > +#define INT32_MAX 0x7FFFFFFF > > /* Pause frame feild and FIFO threshold */ > #define FEC_ENET_FCE (1 << 5) > @@ -565,6 +567,20 @@ fec_timeout(struct net_device *ndev) > } > > static void > +fec_enet_rx_int_is_enabled(struct net_device *ndev, bool enabled) > +{ > + struct fec_enet_private *fep = netdev_priv(ndev); > + uint int_events; > + > + int_events = readl(fep->hwp + FEC_IMASK); > + if (enabled) > + int_events |= FEC_ENET_RXF; > + else > + int_events &= ~FEC_ENET_RXF; > + writel(int_events, fep->hwp + FEC_IMASK); > +} > + > +static void > fec_enet_tx(struct net_device *ndev) > { > struct fec_enet_private *fep; > @@ -656,8 +672,8 @@ fec_enet_tx(struct net_device *ndev) > * not been given to the system, we just set the empty indicator, > * effectively tossing the packet. > */ > -static void > -fec_enet_rx(struct net_device *ndev) > +static int > +fec_enet_rx(struct net_device *ndev, int budget) > { > struct fec_enet_private *fep = netdev_priv(ndev); > const struct platform_device_id *id_entry = > @@ -667,13 +683,12 @@ fec_enet_rx(struct net_device *ndev) > struct sk_buff *skb; > ushort pkt_len; > __u8 *data; > + int pkt_received = 0; > > #ifdef CONFIG_M532x > flush_cache_all(); > #endif > > - spin_lock(&fep->hw_lock); > - > /* First, grab all of the stats for the incoming packet. > * These get messed up if we get called due to a busy condition. > */ > @@ -681,6 +696,10 @@ fec_enet_rx(struct net_device *ndev) > > while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { > > + if (pkt_received >= budget) > + break; > + pkt_received++; > + > /* Since we have allocated space to hold a complete frame, > * the last indicator should be set. > */ > @@ -796,7 +815,7 @@ rx_processing_done: > } > fep->cur_rx = bdp; > > - spin_unlock(&fep->hw_lock); > + return pkt_received; > } > > static irqreturn_t > @@ -805,6 +824,7 @@ fec_enet_interrupt(int irq, void *dev_id) > struct net_device *ndev = dev_id; > struct fec_enet_private *fep = netdev_priv(ndev); > uint int_events; > + ulong flags; > irqreturn_t ret = IRQ_NONE; > > do { > @@ -813,7 +833,18 @@ fec_enet_interrupt(int irq, void *dev_id) > > if (int_events & FEC_ENET_RXF) { > ret = IRQ_HANDLED; > - fec_enet_rx(ndev); > + spin_lock_irqsave(&fep->hw_lock, flags); > + > + if (fep->use_napi) { > + /* Disable the RX interrupt */ > + if (napi_schedule_prep(&fep->napi)) { > + fec_enet_rx_int_is_enabled(ndev, false); > + __napi_schedule(&fep->napi); > + } > + } else > + fec_enet_rx(ndev, INT32_MAX); > + > + spin_unlock_irqrestore(&fep->hw_lock, flags); > } > > /* Transmit OK, or non-fatal error. Update the buffer > @@ -834,7 +865,16 @@ fec_enet_interrupt(int irq, void *dev_id) > return ret; > } > > - > +static int fec_enet_rx_napi(struct napi_struct *napi, int budget) > +{ > + struct net_device *ndev = napi->dev; > + int pkgs = fec_enet_rx(ndev, budget); > + if (pkgs < budget) { > + napi_complete(napi); > + fec_enet_rx_int_is_enabled(ndev, true); > + } > + return pkgs; > +} > > /* ------------------------------------------------------------------------- */ > static void fec_get_mac(struct net_device *ndev) > @@ -1392,6 +1432,9 @@ fec_enet_open(struct net_device *ndev) > struct fec_enet_private *fep = netdev_priv(ndev); > int ret; > > + if (fep->use_napi) > + napi_enable(&fep->napi); > + > /* I should reset the ring buffers here, but I don't yet know > * a simple way to do that. > */ > @@ -1604,6 +1647,11 @@ static int fec_enet_init(struct net_device *ndev) > ndev->netdev_ops = &fec_netdev_ops; > ndev->ethtool_ops = &fec_enet_ethtool_ops; > > + if (fep->use_napi) { > + fec_enet_rx_int_is_enabled(ndev, false); > + netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, fep->napi_weight); > + } > + > /* Initialize the receive buffer descriptors. */ > bdp = fep->rx_bd_base; > for (i = 0; i < RX_RING_SIZE; i++) { > @@ -1698,6 +1746,7 @@ fec_probe(struct platform_device *pdev) > static int dev_id; > struct pinctrl *pinctrl; > struct regulator *reg_phy; > + struct device_node *np = pdev->dev.of_node; > > of_id = of_match_device(fec_dt_ids, &pdev->dev); > if (of_id) > @@ -1811,6 +1860,11 @@ fec_probe(struct platform_device *pdev) > } > } > > + fep->use_napi = !of_property_read_bool(np, "disable_napi"); > + > + if (of_property_read_u32(np, "napi_weight", &fep->napi_weight)) > + fep->napi_weight = FEC_NAPI_WEIGHT; /*using default value*/ This comment seems rather unnecessary. Also it should be /* space left and right /* devicetree bindings should use '-' instead of '_'. The binding Documentation should be in the patch adding the bindings. Also make sure the devicetree parsing is outside of the path for platform based probing. With this particular binding I'm unsure it should exist anyway since it's configuration rather than hardware description. The devicetree normally is for hardware description only. Sascha
2013/1/23 Eric Dumazet <eric.dumazet@gmail.com>: > On Wed, 2013-01-23 at 12:12 +0800, Frank Li wrote: >> Add napi support >> >> Before this patch >> >> iperf -s -i 1 >> ------------------------------------------------------------ >> Server listening on TCP port 5001 >> TCP window size: 85.3 KByte (default) >> ------------------------------------------------------------ >> [ 4] local 10.192.242.153 port 5001 connected with 10.192.242.138 port 50004 >> [ ID] Interval Transfer Bandwidth >> [ 4] 0.0- 1.0 sec 41.2 MBytes 345 Mbits/sec >> [ 4] 1.0- 2.0 sec 43.7 MBytes 367 Mbits/sec >> [ 4] 2.0- 3.0 sec 42.8 MBytes 359 Mbits/sec >> [ 4] 3.0- 4.0 sec 43.7 MBytes 367 Mbits/sec >> [ 4] 4.0- 5.0 sec 42.7 MBytes 359 Mbits/sec >> [ 4] 5.0- 6.0 sec 43.8 MBytes 367 Mbits/sec >> [ 4] 6.0- 7.0 sec 43.0 MBytes 361 Mbits/sec >> >> After this patch >> [ 4] 2.0- 3.0 sec 51.6 MBytes 433 Mbits/sec >> [ 4] 3.0- 4.0 sec 51.8 MBytes 435 Mbits/sec >> [ 4] 4.0- 5.0 sec 52.2 MBytes 438 Mbits/sec >> [ 4] 5.0- 6.0 sec 52.1 MBytes 437 Mbits/sec >> [ 4] 6.0- 7.0 sec 52.1 MBytes 437 Mbits/sec >> [ 4] 7.0- 8.0 sec 52.3 MBytes 439 Mbits/sec > > Strange, as you still call netif_rx() > > NAPI should call netif_receive_skb() instead > Thank you point out. After re-test, I found performance is almost no change if use netif_receive_skb. I am not sure if it is my NAPI implement problem. napi_gro_received is better than netif_receive_skb, but worse than netif_rx. From performance point view, netif_rx --- fastest napi_gro_received --- middle, near to netif_rx netif_receive_skb --- slowest, almost the same as original no-napi version. Do you have any idea about this phenomena? best regards Frank Li > >
> This comment seems rather unnecessary. Also it should be > > /* space left and right /* > > devicetree bindings should use '-' instead of '_'. The binding > Documentation should be in the patch adding the bindings. Also > make sure the devicetree parsing is outside of the path for platform > based probing. > > With this particular binding I'm unsure it should exist anyway since > it's configuration rather than hardware description. The devicetree > normally is for hardware description only. > What's your opinion? Using module parameter, device-tree to config it or the other method? best regards Frank Li
NAPI should not be configurable, all device drivers should be NAPI only.
On Wed, Jan 23, 2013 at 03:44:53PM +0800, Frank Li wrote: > > This comment seems rather unnecessary. Also it should be > > > > /* space left and right /* > > > > devicetree bindings should use '-' instead of '_'. The binding > > Documentation should be in the patch adding the bindings. Also > > make sure the devicetree parsing is outside of the path for platform > > based probing. > > > > With this particular binding I'm unsure it should exist anyway since > > it's configuration rather than hardware description. The devicetree > > normally is for hardware description only. > > > > What's your opinion? > Using module parameter, device-tree to config it or the other method? Why do you want to make it configurable anyway? So far no other driver seems to need this. My suggestion is to make it unconfigurable. For adding properties like this to the devicetree remember that they should be OS agnostic. napi is a implementation detail relevant only for Linux. And even Linux may decide to drop napi support in favour for something better in the future. Sascha
2013/1/23 Waskiewicz Jr, Peter P <peter.p.waskiewicz.jr@intel.com>: > On Wed, Jan 23, 2013 at 12:12:10PM +0800, Frank Li wrote: > > [...] > >> diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c >> index f52ba33..8fa420c 100644 >> --- a/drivers/net/ethernet/freescale/fec.c >> +++ b/drivers/net/ethernet/freescale/fec.c >> @@ -67,6 +67,8 @@ >> #endif >> >> #define DRIVER_NAME "fec" >> +#define FEC_NAPI_WEIGHT 64 >> +#define INT32_MAX 0x7FFFFFFF > > This seems awfully big as an upper-limit to your ISR processing. See below > for more comments. > >> >> /* Pause frame feild and FIFO threshold */ >> #define FEC_ENET_FCE (1 << 5) >> @@ -565,6 +567,20 @@ fec_timeout(struct net_device *ndev) >> } >> >> static void >> +fec_enet_rx_int_is_enabled(struct net_device *ndev, bool enabled) >> +{ >> + struct fec_enet_private *fep = netdev_priv(ndev); >> + uint int_events; >> + >> + int_events = readl(fep->hwp + FEC_IMASK); >> + if (enabled) >> + int_events |= FEC_ENET_RXF; >> + else >> + int_events &= ~FEC_ENET_RXF; >> + writel(int_events, fep->hwp + FEC_IMASK); >> +} >> + >> +static void >> fec_enet_tx(struct net_device *ndev) >> { >> struct fec_enet_private *fep; >> @@ -656,8 +672,8 @@ fec_enet_tx(struct net_device *ndev) >> * not been given to the system, we just set the empty indicator, >> * effectively tossing the packet. >> */ >> -static void >> -fec_enet_rx(struct net_device *ndev) >> +static int >> +fec_enet_rx(struct net_device *ndev, int budget) >> { >> struct fec_enet_private *fep = netdev_priv(ndev); >> const struct platform_device_id *id_entry = >> @@ -667,13 +683,12 @@ fec_enet_rx(struct net_device *ndev) >> struct sk_buff *skb; >> ushort pkt_len; >> __u8 *data; >> + int pkt_received = 0; >> >> #ifdef CONFIG_M532x >> flush_cache_all(); >> #endif >> >> - spin_lock(&fep->hw_lock); >> - >> /* First, grab all of the stats for the incoming packet. >> * These get messed up if we get called due to a busy condition. >> */ >> @@ -681,6 +696,10 @@ fec_enet_rx(struct net_device *ndev) >> >> while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { >> >> + if (pkt_received >= budget) >> + break; >> + pkt_received++; >> + > > This seems wrong given how it's called from the ISR when NAPI isn't being > used. If you have continuous packets coming in, you won't exit your ISR > and release your spin_lock until you hit INT32_MAX. I just the keep original execute path if napi disabled. Original code will progress all packages in queue. FEC is 1G network, must receive speed around 500Mbps. CPU speed is much faster than received package. and RX queue only 8 entry, I think it is impossible to run loop more than 10. why I set to INT32_MAX is keep the original execute path when napi disable. INT32_MAX is impossible value to exit loop for this case. > >> /* Since we have allocated space to hold a complete frame, >> * the last indicator should be set. >> */ >> @@ -796,7 +815,7 @@ rx_processing_done: >> } >> fep->cur_rx = bdp; >> >> - spin_unlock(&fep->hw_lock); >> + return pkt_received; >> } >> >> static irqreturn_t >> @@ -805,6 +824,7 @@ fec_enet_interrupt(int irq, void *dev_id) >> struct net_device *ndev = dev_id; >> struct fec_enet_private *fep = netdev_priv(ndev); >> uint int_events; >> + ulong flags; >> irqreturn_t ret = IRQ_NONE; >> >> do { >> @@ -813,7 +833,18 @@ fec_enet_interrupt(int irq, void *dev_id) >> >> if (int_events & FEC_ENET_RXF) { >> ret = IRQ_HANDLED; >> - fec_enet_rx(ndev); >> + spin_lock_irqsave(&fep->hw_lock, flags); >> + >> + if (fep->use_napi) { >> + /* Disable the RX interrupt */ >> + if (napi_schedule_prep(&fep->napi)) { >> + fec_enet_rx_int_is_enabled(ndev, false); >> + __napi_schedule(&fep->napi); >> + } >> + } else >> + fec_enet_rx(ndev, INT32_MAX); > > As mentioned above, you may want to check this a bit closer since this > can run for a very, very long time if you have constant packets flowing > when not using NAPI. > > Also, the code formatting here should also have braces around the else > clause since the if clause has them. Okay, I will update in v2 patch. > >> + >> + spin_unlock_irqrestore(&fep->hw_lock, flags); >> } >> >> /* Transmit OK, or non-fatal error. Update the buffer >> @@ -834,7 +865,16 @@ fec_enet_interrupt(int irq, void *dev_id) >> return ret; >> } >> >> - >> +static int fec_enet_rx_napi(struct napi_struct *napi, int budget) >> +{ >> + struct net_device *ndev = napi->dev; >> + int pkgs = fec_enet_rx(ndev, budget); >> + if (pkgs < budget) { >> + napi_complete(napi); >> + fec_enet_rx_int_is_enabled(ndev, true); >> + } >> + return pkgs; >> +} >> >> /* ------------------------------------------------------------------------- */ >> static void fec_get_mac(struct net_device *ndev) >> @@ -1392,6 +1432,9 @@ fec_enet_open(struct net_device *ndev) >> struct fec_enet_private *fep = netdev_priv(ndev); >> int ret; >> >> + if (fep->use_napi) >> + napi_enable(&fep->napi); >> + >> /* I should reset the ring buffers here, but I don't yet know >> * a simple way to do that. >> */ >> @@ -1604,6 +1647,11 @@ static int fec_enet_init(struct net_device *ndev) >> ndev->netdev_ops = &fec_netdev_ops; >> ndev->ethtool_ops = &fec_enet_ethtool_ops; >> >> + if (fep->use_napi) { >> + fec_enet_rx_int_is_enabled(ndev, false); >> + netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, fep->napi_weight); > > Make sure not to exceed 80 characters here. > Okay, I will update in v2 patch. >> + } >> + >> /* Initialize the receive buffer descriptors. */ >> bdp = fep->rx_bd_base; >> for (i = 0; i < RX_RING_SIZE; i++) { >> @@ -1698,6 +1746,7 @@ fec_probe(struct platform_device *pdev) >> static int dev_id; >> struct pinctrl *pinctrl; >> struct regulator *reg_phy; >> + struct device_node *np = pdev->dev.of_node; >> >> of_id = of_match_device(fec_dt_ids, &pdev->dev); >> if (of_id) >> @@ -1811,6 +1860,11 @@ fec_probe(struct platform_device *pdev) >> } >> } >> >> + fep->use_napi = !of_property_read_bool(np, "disable_napi"); >> + >> + if (of_property_read_u32(np, "napi_weight", &fep->napi_weight)) >> + fep->napi_weight = FEC_NAPI_WEIGHT; /*using default value*/ >> + >> fec_reset_phy(pdev); >> >> ret = fec_enet_init(ndev); >> diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h >> index 2ebedaf..31fcdd0 100644 >> --- a/drivers/net/ethernet/freescale/fec.h >> +++ b/drivers/net/ethernet/freescale/fec.h >> @@ -249,6 +249,10 @@ struct fec_enet_private { >> int bufdesc_ex; >> int pause_flag; >> >> + struct napi_struct napi; >> + int napi_weight; >> + bool use_napi; >> + >> struct ptp_clock *ptp_clock; >> struct ptp_clock_info ptp_caps; >> unsigned long last_overflow_check; >> -- >> 1.7.1 >> >> >> -- >> To unsubscribe from this list: send the line "unsubscribe netdev" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Why do you want to make it configurable anyway? So far no other driver > seems to need this. My suggestion is to make it unconfigurable. > > For adding properties like this to the devicetree remember that they > should be OS agnostic. napi is a implementation detail relevant only > for Linux. And even Linux may decide to drop napi support in favour for > something better in the future. > Okay, I will remove such config in v2 version. best regards Frank Li
On Wed, 2013-01-23 at 15:37 +0800, Frank Li wrote: > 2013/1/23 Eric Dumazet <eric.dumazet@gmail.com>: > > On Wed, 2013-01-23 at 12:12 +0800, Frank Li wrote: > >> Add napi support > >> > >> Before this patch > >> > >> iperf -s -i 1 > >> ------------------------------------------------------------ > >> Server listening on TCP port 5001 > >> TCP window size: 85.3 KByte (default) > >> ------------------------------------------------------------ > >> [ 4] local 10.192.242.153 port 5001 connected with 10.192.242.138 port 50004 > >> [ ID] Interval Transfer Bandwidth > >> [ 4] 0.0- 1.0 sec 41.2 MBytes 345 Mbits/sec > >> [ 4] 1.0- 2.0 sec 43.7 MBytes 367 Mbits/sec > >> [ 4] 2.0- 3.0 sec 42.8 MBytes 359 Mbits/sec > >> [ 4] 3.0- 4.0 sec 43.7 MBytes 367 Mbits/sec > >> [ 4] 4.0- 5.0 sec 42.7 MBytes 359 Mbits/sec > >> [ 4] 5.0- 6.0 sec 43.8 MBytes 367 Mbits/sec > >> [ 4] 6.0- 7.0 sec 43.0 MBytes 361 Mbits/sec > >> > >> After this patch > >> [ 4] 2.0- 3.0 sec 51.6 MBytes 433 Mbits/sec > >> [ 4] 3.0- 4.0 sec 51.8 MBytes 435 Mbits/sec > >> [ 4] 4.0- 5.0 sec 52.2 MBytes 438 Mbits/sec > >> [ 4] 5.0- 6.0 sec 52.1 MBytes 437 Mbits/sec > >> [ 4] 6.0- 7.0 sec 52.1 MBytes 437 Mbits/sec > >> [ 4] 7.0- 8.0 sec 52.3 MBytes 439 Mbits/sec > > > > Strange, as you still call netif_rx() > > > > NAPI should call netif_receive_skb() instead > > > > Thank you point out. > After re-test, I found performance is almost no change if use netif_receive_skb. > I am not sure if it is my NAPI implement problem. > > napi_gro_received is better than netif_receive_skb, but worse than netif_rx. > > From performance point view, > > netif_rx --- fastest > napi_gro_received --- middle, near to netif_rx > netif_receive_skb --- slowest, almost the same as original no-napi version. > > Do you have any idea about this phenomena? No idea, you'll have to find out using perf tool if available. Is your machine SMP, and the application running on another cpu than the softirq handler for your device ? A NAPI driver must call netif_receive_skb(), especially if the RX path does a full copy of the frame : Its hot in cpu cache and should be processed at once. Escaping to netif_rx() is only adding an extra softirq and risk of data being evicted from cpu caches. Here your performance increase only comes from hw_lock being not anymore locked in RX path.
2013/1/23 Eric Dumazet <eric.dumazet@gmail.com>: > On Wed, 2013-01-23 at 15:37 +0800, Frank Li wrote: >> 2013/1/23 Eric Dumazet <eric.dumazet@gmail.com>: >> > On Wed, 2013-01-23 at 12:12 +0800, Frank Li wrote: >> >> Add napi support >> >> >> >> Before this patch >> >> >> >> iperf -s -i 1 >> >> ------------------------------------------------------------ >> >> Server listening on TCP port 5001 >> >> TCP window size: 85.3 KByte (default) >> >> ------------------------------------------------------------ >> >> [ 4] local 10.192.242.153 port 5001 connected with 10.192.242.138 port 50004 >> >> [ ID] Interval Transfer Bandwidth >> >> [ 4] 0.0- 1.0 sec 41.2 MBytes 345 Mbits/sec >> >> [ 4] 1.0- 2.0 sec 43.7 MBytes 367 Mbits/sec >> >> [ 4] 2.0- 3.0 sec 42.8 MBytes 359 Mbits/sec >> >> [ 4] 3.0- 4.0 sec 43.7 MBytes 367 Mbits/sec >> >> [ 4] 4.0- 5.0 sec 42.7 MBytes 359 Mbits/sec >> >> [ 4] 5.0- 6.0 sec 43.8 MBytes 367 Mbits/sec >> >> [ 4] 6.0- 7.0 sec 43.0 MBytes 361 Mbits/sec >> >> >> >> After this patch >> >> [ 4] 2.0- 3.0 sec 51.6 MBytes 433 Mbits/sec >> >> [ 4] 3.0- 4.0 sec 51.8 MBytes 435 Mbits/sec >> >> [ 4] 4.0- 5.0 sec 52.2 MBytes 438 Mbits/sec >> >> [ 4] 5.0- 6.0 sec 52.1 MBytes 437 Mbits/sec >> >> [ 4] 6.0- 7.0 sec 52.1 MBytes 437 Mbits/sec >> >> [ 4] 7.0- 8.0 sec 52.3 MBytes 439 Mbits/sec >> > >> > Strange, as you still call netif_rx() >> > >> > NAPI should call netif_receive_skb() instead >> > >> >> Thank you point out. >> After re-test, I found performance is almost no change if use netif_receive_skb. >> I am not sure if it is my NAPI implement problem. >> >> napi_gro_received is better than netif_receive_skb, but worse than netif_rx. >> >> From performance point view, >> >> netif_rx --- fastest >> napi_gro_received --- middle, near to netif_rx >> netif_receive_skb --- slowest, almost the same as original no-napi version. >> >> Do you have any idea about this phenomena? > > No idea, you'll have to find out using perf tool if available. > > Is your machine SMP, and the application running on another cpu than the > softirq handler for your device ? Yes, we support SMP. Possibly run on another cpu. I will check. > > A NAPI driver must call netif_receive_skb(), especially if > the RX path does a full copy of the frame : Its hot in cpu cache and > should be processed at once. How about napi_gro_receive? is it correct function? I found many driver use it. > > Escaping to netif_rx() is only adding an extra softirq and risk of data > being evicted from cpu caches. > > Here your performance increase only comes from hw_lock being not anymore > locked in RX path. > > >
On Thu, Jan 24, 2013 at 10:26:00AM +0800, Frank Li wrote: > > A NAPI driver must call netif_receive_skb(), especially if > > the RX path does a full copy of the frame : Its hot in cpu cache and > > should be processed at once. > > How about napi_gro_receive? is it correct function? I found many driver use it. If you want to coalesce your Rx frames to reduce overhead, then yes. It's a good performance improvement overall. -PJ
diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c index f52ba33..8fa420c 100644 --- a/drivers/net/ethernet/freescale/fec.c +++ b/drivers/net/ethernet/freescale/fec.c @@ -67,6 +67,8 @@ #endif #define DRIVER_NAME "fec" +#define FEC_NAPI_WEIGHT 64 +#define INT32_MAX 0x7FFFFFFF /* Pause frame feild and FIFO threshold */ #define FEC_ENET_FCE (1 << 5) @@ -565,6 +567,20 @@ fec_timeout(struct net_device *ndev) } static void +fec_enet_rx_int_is_enabled(struct net_device *ndev, bool enabled) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + uint int_events; + + int_events = readl(fep->hwp + FEC_IMASK); + if (enabled) + int_events |= FEC_ENET_RXF; + else + int_events &= ~FEC_ENET_RXF; + writel(int_events, fep->hwp + FEC_IMASK); +} + +static void fec_enet_tx(struct net_device *ndev) { struct fec_enet_private *fep; @@ -656,8 +672,8 @@ fec_enet_tx(struct net_device *ndev) * not been given to the system, we just set the empty indicator, * effectively tossing the packet. */ -static void -fec_enet_rx(struct net_device *ndev) +static int +fec_enet_rx(struct net_device *ndev, int budget) { struct fec_enet_private *fep = netdev_priv(ndev); const struct platform_device_id *id_entry = @@ -667,13 +683,12 @@ fec_enet_rx(struct net_device *ndev) struct sk_buff *skb; ushort pkt_len; __u8 *data; + int pkt_received = 0; #ifdef CONFIG_M532x flush_cache_all(); #endif - spin_lock(&fep->hw_lock); - /* First, grab all of the stats for the incoming packet. * These get messed up if we get called due to a busy condition. */ @@ -681,6 +696,10 @@ fec_enet_rx(struct net_device *ndev) while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { + if (pkt_received >= budget) + break; + pkt_received++; + /* Since we have allocated space to hold a complete frame, * the last indicator should be set. */ @@ -796,7 +815,7 @@ rx_processing_done: } fep->cur_rx = bdp; - spin_unlock(&fep->hw_lock); + return pkt_received; } static irqreturn_t @@ -805,6 +824,7 @@ fec_enet_interrupt(int irq, void *dev_id) struct net_device *ndev = dev_id; struct fec_enet_private *fep = netdev_priv(ndev); uint int_events; + ulong flags; irqreturn_t ret = IRQ_NONE; do { @@ -813,7 +833,18 @@ fec_enet_interrupt(int irq, void *dev_id) if (int_events & FEC_ENET_RXF) { ret = IRQ_HANDLED; - fec_enet_rx(ndev); + spin_lock_irqsave(&fep->hw_lock, flags); + + if (fep->use_napi) { + /* Disable the RX interrupt */ + if (napi_schedule_prep(&fep->napi)) { + fec_enet_rx_int_is_enabled(ndev, false); + __napi_schedule(&fep->napi); + } + } else + fec_enet_rx(ndev, INT32_MAX); + + spin_unlock_irqrestore(&fep->hw_lock, flags); } /* Transmit OK, or non-fatal error. Update the buffer @@ -834,7 +865,16 @@ fec_enet_interrupt(int irq, void *dev_id) return ret; } - +static int fec_enet_rx_napi(struct napi_struct *napi, int budget) +{ + struct net_device *ndev = napi->dev; + int pkgs = fec_enet_rx(ndev, budget); + if (pkgs < budget) { + napi_complete(napi); + fec_enet_rx_int_is_enabled(ndev, true); + } + return pkgs; +} /* ------------------------------------------------------------------------- */ static void fec_get_mac(struct net_device *ndev) @@ -1392,6 +1432,9 @@ fec_enet_open(struct net_device *ndev) struct fec_enet_private *fep = netdev_priv(ndev); int ret; + if (fep->use_napi) + napi_enable(&fep->napi); + /* I should reset the ring buffers here, but I don't yet know * a simple way to do that. */ @@ -1604,6 +1647,11 @@ static int fec_enet_init(struct net_device *ndev) ndev->netdev_ops = &fec_netdev_ops; ndev->ethtool_ops = &fec_enet_ethtool_ops; + if (fep->use_napi) { + fec_enet_rx_int_is_enabled(ndev, false); + netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, fep->napi_weight); + } + /* Initialize the receive buffer descriptors. */ bdp = fep->rx_bd_base; for (i = 0; i < RX_RING_SIZE; i++) { @@ -1698,6 +1746,7 @@ fec_probe(struct platform_device *pdev) static int dev_id; struct pinctrl *pinctrl; struct regulator *reg_phy; + struct device_node *np = pdev->dev.of_node; of_id = of_match_device(fec_dt_ids, &pdev->dev); if (of_id) @@ -1811,6 +1860,11 @@ fec_probe(struct platform_device *pdev) } } + fep->use_napi = !of_property_read_bool(np, "disable_napi"); + + if (of_property_read_u32(np, "napi_weight", &fep->napi_weight)) + fep->napi_weight = FEC_NAPI_WEIGHT; /*using default value*/ + fec_reset_phy(pdev); ret = fec_enet_init(ndev); diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 2ebedaf..31fcdd0 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -249,6 +249,10 @@ struct fec_enet_private { int bufdesc_ex; int pause_flag; + struct napi_struct napi; + int napi_weight; + bool use_napi; + struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_caps; unsigned long last_overflow_check;