diff mbox series

IB/rxe: Make counters thread safe

Message ID 1542992927-39110-1-git-send-email-parav@mellanox.com (mailing list archive)
State Superseded
Delegated to: Jason Gunthorpe
Headers show
Series IB/rxe: Make counters thread safe | expand

Commit Message

Parav Pandit Nov. 23, 2018, 5:08 p.m. UTC
Current rxe device counters are not thread safe.
When multiple QPs are used, they can be racy.
Make them thread safe by making it per cpu.

Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats")
Signed-off-by: Parav Pandit <parav@mellanox.com>
---
 drivers/infiniband/sw/rxe/rxe.c             | 17 +++++++++++++++++
 drivers/infiniband/sw/rxe/rxe_hw_counters.c | 17 +++++++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h       | 14 ++++++++++----
 3 files changed, 42 insertions(+), 6 deletions(-)

Comments

Leon Romanovsky Nov. 23, 2018, 5:48 p.m. UTC | #1
On Fri, Nov 23, 2018 at 11:08:47AM -0600, Parav Pandit wrote:
> Current rxe device counters are not thread safe.
> When multiple QPs are used, they can be racy.
> Make them thread safe by making it per cpu.
>
> Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats")
> Signed-off-by: Parav Pandit <parav@mellanox.com>
> ---
>  drivers/infiniband/sw/rxe/rxe.c             | 17 +++++++++++++++++
>  drivers/infiniband/sw/rxe/rxe_hw_counters.c | 17 +++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_verbs.h       | 14 ++++++++++----
>  3 files changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c..722aca9 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -39,6 +39,17 @@
>  MODULE_DESCRIPTION("Soft RDMA transport");
>  MODULE_LICENSE("Dual BSD/GPL");
>
> +static int rxe_init_stats(struct rxe_dev *rxe)
> +{
> +	rxe->pcpu_stats = alloc_percpu(struct rxe_stats);
> +	return rxe->pcpu_stats ? 0 : -ENOMEM;
> +}
> +
> +static void rxe_cleanup_stats(struct rxe_dev *rxe)
> +{
> +	free_percpu(rxe->pcpu_stats);

Just my personal opinion,
I completely dislike this type of abstractions where standard kernel API
is hidden by some fancy one liner, which is called once only.

> +}
> +
>  /* free resources for all ports on a device */
>  static void rxe_cleanup_ports(struct rxe_dev *rxe)
>  {
> @@ -64,6 +75,7 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>  	rxe_pool_cleanup(&rxe->mc_elem_pool);
>
>  	rxe_cleanup_ports(rxe);
> +	rxe_cleanup_stats(rxe);
>
>  	crypto_free_shash(rxe->tfm);
>  }
> @@ -267,6 +279,10 @@ static int rxe_init(struct rxe_dev *rxe)
>  	/* init default device parameters */
>  	rxe_init_device_param(rxe);
>
> +	err = rxe_init_stats(rxe);
> +	if (err)
> +		return err;
> +
>  	err = rxe_init_ports(rxe);
>  	if (err)
>  		goto err1;
> @@ -288,6 +304,7 @@ static int rxe_init(struct rxe_dev *rxe)
>  err2:
>  	rxe_cleanup_ports(rxe);
>  err1:
> +	rxe_cleanup_stats(rxe);
>  	return err;
>  }
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> index 6aeb7a1..0fb567a 100644
> --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> @@ -48,6 +48,19 @@
>  	[RXE_CNT_SEND_ERR]            =  "send_err",
>  };
>
> +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int index)
> +{
> +	struct rxe_stats *stats;
> +	u64 sum = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> +		sum += stats->counters[index];
> +	}
> +	return sum;
> +}
> +
>  int rxe_ib_get_hw_stats(struct ib_device *ibdev,
>  			struct rdma_hw_stats *stats,
>  			u8 port, int index)
> @@ -58,8 +71,8 @@ int rxe_ib_get_hw_stats(struct ib_device *ibdev,
>  	if (!port || !stats)
>  		return -EINVAL;
>
> -	for (cnt = 0; cnt  < ARRAY_SIZE(rxe_counter_name); cnt++)
> -		stats->value[cnt] = dev->stats_counters[cnt];
> +	for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++)
> +		stats->value[cnt] = rxe_stats_read_by_index(dev, cnt);
>
>  	return ARRAY_SIZE(rxe_counter_name);
>  }
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 82e670d..7daff30 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -380,6 +380,10 @@ struct rxe_port {
>  	u32			qp_gsi_index;
>  };
>
> +struct rxe_stats {
> +	u64	counters[RXE_NUM_OF_COUNTERS];
> +};
> +
>  struct rxe_dev {
>  	struct ib_device	ib_dev;
>  	struct ib_device_attr	attr;
> @@ -409,16 +413,18 @@ struct rxe_dev {
>  	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
>  	int			mmap_offset;
>
> -	u64			stats_counters[RXE_NUM_OF_COUNTERS];
> -
> +	struct rxe_stats	__percpu *pcpu_stats;
>  	struct rxe_port		port;
>  	struct list_head	list;
>  	struct crypto_shash	*tfm;
>  };
>
> -static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters cnt)
> +static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
>  {
> -	rxe->stats_counters[cnt]++;
> +	struct rxe_stats *stats;
> +
> +	stats = this_cpu_ptr(rxe->pcpu_stats);
> +	stats->counters[index]++;
>  }
>
>  static inline struct rxe_dev *to_rdev(struct ib_device *dev)
> --
> 1.8.3.1
>
Jason Gunthorpe Nov. 23, 2018, 7:19 p.m. UTC | #2
On Fri, Nov 23, 2018 at 11:08:47AM -0600, Parav Pandit wrote:
> Current rxe device counters are not thread safe.
> When multiple QPs are used, they can be racy.
> Make them thread safe by making it per cpu.
> 
> Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats")
> Signed-off-by: Parav Pandit <parav@mellanox.com>
>  drivers/infiniband/sw/rxe/rxe.c             | 17 +++++++++++++++++
>  drivers/infiniband/sw/rxe/rxe_hw_counters.c | 17 +++++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_verbs.h       | 14 ++++++++++----
>  3 files changed, 42 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index 383e65c..722aca9 100644
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -39,6 +39,17 @@
>  MODULE_DESCRIPTION("Soft RDMA transport");
>  MODULE_LICENSE("Dual BSD/GPL");
>  
> +static int rxe_init_stats(struct rxe_dev *rxe)
> +{
> +	rxe->pcpu_stats = alloc_percpu(struct rxe_stats);
> +	return rxe->pcpu_stats ? 0 : -ENOMEM;
> +}
> +
> +static void rxe_cleanup_stats(struct rxe_dev *rxe)
> +{
> +	free_percpu(rxe->pcpu_stats);
> +}
> +
>  /* free resources for all ports on a device */
>  static void rxe_cleanup_ports(struct rxe_dev *rxe)
>  {
> @@ -64,6 +75,7 @@ static void rxe_cleanup(struct rxe_dev *rxe)
>  	rxe_pool_cleanup(&rxe->mc_elem_pool);
>  
>  	rxe_cleanup_ports(rxe);
> +	rxe_cleanup_stats(rxe);
>  
>  	crypto_free_shash(rxe->tfm);
>  }
> @@ -267,6 +279,10 @@ static int rxe_init(struct rxe_dev *rxe)
>  	/* init default device parameters */
>  	rxe_init_device_param(rxe);
>  
> +	err = rxe_init_stats(rxe);
> +	if (err)
> +		return err;
> +
>  	err = rxe_init_ports(rxe);
>  	if (err)
>  		goto err1;
> @@ -288,6 +304,7 @@ static int rxe_init(struct rxe_dev *rxe)
>  err2:
>  	rxe_cleanup_ports(rxe);
>  err1:
> +	rxe_cleanup_stats(rxe);
>  	return err;
>  }
>  
> diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> index 6aeb7a1..0fb567a 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> @@ -48,6 +48,19 @@
>  	[RXE_CNT_SEND_ERR]            =  "send_err",
>  };
>  
> +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int index)
> +{
> +	struct rxe_stats *stats;
> +	u64 sum = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> +		sum += stats->counters[index];
> +	}

This is still racy with a writer on another CPU.

Jason
Parav Pandit Nov. 24, 2018, 2:25 p.m. UTC | #3
Hi Leon,

> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Leon Romanovsky
> Sent: Friday, November 23, 2018 11:48 AM
> To: Parav Pandit <parav@mellanox.com>
> Cc: linux-rdma@vger.kernel.org; dledford@redhat.com; Moni Shoua
> <monis@mellanox.com>; jgg@ziepe.ca
> Subject: Re: [PATCH] IB/rxe: Make counters thread safe
> 
> On Fri, Nov 23, 2018 at 11:08:47AM -0600, Parav Pandit wrote:
> > Current rxe device counters are not thread safe.
> > When multiple QPs are used, they can be racy.
> > Make them thread safe by making it per cpu.
> >
> > Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats")
> > Signed-off-by: Parav Pandit <parav@mellanox.com>
> > ---
> >  drivers/infiniband/sw/rxe/rxe.c             | 17 +++++++++++++++++
> >  drivers/infiniband/sw/rxe/rxe_hw_counters.c | 17 +++++++++++++++--
> >  drivers/infiniband/sw/rxe/rxe_verbs.h       | 14 ++++++++++----
> >  3 files changed, 42 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/infiniband/sw/rxe/rxe.c
> > b/drivers/infiniband/sw/rxe/rxe.c index 383e65c..722aca9 100644
> > --- a/drivers/infiniband/sw/rxe/rxe.c
> > +++ b/drivers/infiniband/sw/rxe/rxe.c
> > @@ -39,6 +39,17 @@
> >  MODULE_DESCRIPTION("Soft RDMA transport");  MODULE_LICENSE("Dual
> > BSD/GPL");
> >
> > +static int rxe_init_stats(struct rxe_dev *rxe) {
> > +	rxe->pcpu_stats = alloc_percpu(struct rxe_stats);
> > +	return rxe->pcpu_stats ? 0 : -ENOMEM; }
> > +
> > +static void rxe_cleanup_stats(struct rxe_dev *rxe) {
> > +	free_percpu(rxe->pcpu_stats);
> 
> Just my personal opinion,
> I completely dislike this type of abstractions where standard kernel API is
> hidden by some fancy one liner, which is called once only.
> 
rxe_cleanup_stats() is called twice.
One in rxe_init() and second in rxe_cleanup().
I think you wanted to say this comment for rxe_init_stats() which is only called once.
I added init_stats() to balance it with cleanup_stats() function.
But I can inline both of them.

> > +}
> > +
> >  /* free resources for all ports on a device */  static void
> > rxe_cleanup_ports(struct rxe_dev *rxe)  { @@ -64,6 +75,7 @@ static
> > void rxe_cleanup(struct rxe_dev *rxe)
> >  	rxe_pool_cleanup(&rxe->mc_elem_pool);
> >
> >  	rxe_cleanup_ports(rxe);
> > +	rxe_cleanup_stats(rxe);
> >
> >  	crypto_free_shash(rxe->tfm);
> >  }
> > @@ -267,6 +279,10 @@ static int rxe_init(struct rxe_dev *rxe)
> >  	/* init default device parameters */
> >  	rxe_init_device_param(rxe);
> >
> > +	err = rxe_init_stats(rxe);
> > +	if (err)
> > +		return err;
> > +
> >  	err = rxe_init_ports(rxe);
> >  	if (err)
> >  		goto err1;
> > @@ -288,6 +304,7 @@ static int rxe_init(struct rxe_dev *rxe)
> >  err2:
> >  	rxe_cleanup_ports(rxe);
> >  err1:
> > +	rxe_cleanup_stats(rxe);
> >  	return err;
> >  }
> >
> > diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > index 6aeb7a1..0fb567a 100644
> > --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > @@ -48,6 +48,19 @@
> >  	[RXE_CNT_SEND_ERR]            =  "send_err",
> >  };
> >
> > +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int
> > +index) {
> > +	struct rxe_stats *stats;
> > +	u64 sum = 0;
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> > +		sum += stats->counters[index];
> > +	}
> > +	return sum;
> > +}
> > +
> >  int rxe_ib_get_hw_stats(struct ib_device *ibdev,
> >  			struct rdma_hw_stats *stats,
> >  			u8 port, int index)
> > @@ -58,8 +71,8 @@ int rxe_ib_get_hw_stats(struct ib_device *ibdev,
> >  	if (!port || !stats)
> >  		return -EINVAL;
> >
> > -	for (cnt = 0; cnt  < ARRAY_SIZE(rxe_counter_name); cnt++)
> > -		stats->value[cnt] = dev->stats_counters[cnt];
> > +	for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++)
> > +		stats->value[cnt] = rxe_stats_read_by_index(dev, cnt);
> >
> >  	return ARRAY_SIZE(rxe_counter_name);  } diff --git
> > a/drivers/infiniband/sw/rxe/rxe_verbs.h
> > b/drivers/infiniband/sw/rxe/rxe_verbs.h
> > index 82e670d..7daff30 100644
> > --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> > +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> > @@ -380,6 +380,10 @@ struct rxe_port {
> >  	u32			qp_gsi_index;
> >  };
> >
> > +struct rxe_stats {
> > +	u64	counters[RXE_NUM_OF_COUNTERS];
> > +};
> > +
> >  struct rxe_dev {
> >  	struct ib_device	ib_dev;
> >  	struct ib_device_attr	attr;
> > @@ -409,16 +413,18 @@ struct rxe_dev {
> >  	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
> >  	int			mmap_offset;
> >
> > -	u64			stats_counters[RXE_NUM_OF_COUNTERS];
> > -
> > +	struct rxe_stats	__percpu *pcpu_stats;
> >  	struct rxe_port		port;
> >  	struct list_head	list;
> >  	struct crypto_shash	*tfm;
> >  };
> >
> > -static inline void rxe_counter_inc(struct rxe_dev *rxe, enum
> > rxe_counters cnt)
> > +static inline void rxe_counter_inc(struct rxe_dev *rxe, enum
> > +rxe_counters index)
> >  {
> > -	rxe->stats_counters[cnt]++;
> > +	struct rxe_stats *stats;
> > +
> > +	stats = this_cpu_ptr(rxe->pcpu_stats);
> > +	stats->counters[index]++;
> >  }
> >
> >  static inline struct rxe_dev *to_rdev(struct ib_device *dev)
> > --
> > 1.8.3.1
> >
Parav Pandit Nov. 24, 2018, 2:27 p.m. UTC | #4
Hi Jason,

> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, November 23, 2018 1:20 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: linux-rdma@vger.kernel.org; dledford@redhat.com; Moni Shoua
> <monis@mellanox.com>
> Subject: Re: [PATCH] IB/rxe: Make counters thread safe
> 
> On Fri, Nov 23, 2018 at 11:08:47AM -0600, Parav Pandit wrote:
> > Current rxe device counters are not thread safe.
> > When multiple QPs are used, they can be racy.
> > Make them thread safe by making it per cpu.
> >
> > Fixes: 0b1e5b99a48b ("IB/rxe: Add port protocol stats")
> > Signed-off-by: Parav Pandit <parav@mellanox.com>
> >  drivers/infiniband/sw/rxe/rxe.c             | 17 +++++++++++++++++
> >  drivers/infiniband/sw/rxe/rxe_hw_counters.c | 17 +++++++++++++++--
> >  drivers/infiniband/sw/rxe/rxe_verbs.h       | 14 ++++++++++----
> >  3 files changed, 42 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/infiniband/sw/rxe/rxe.c
> > b/drivers/infiniband/sw/rxe/rxe.c index 383e65c..722aca9 100644
> > +++ b/drivers/infiniband/sw/rxe/rxe.c
> > @@ -39,6 +39,17 @@
> >  MODULE_DESCRIPTION("Soft RDMA transport");  MODULE_LICENSE("Dual
> > BSD/GPL");
> >
> > +static int rxe_init_stats(struct rxe_dev *rxe) {
> > +	rxe->pcpu_stats = alloc_percpu(struct rxe_stats);
> > +	return rxe->pcpu_stats ? 0 : -ENOMEM; }
> > +
> > +static void rxe_cleanup_stats(struct rxe_dev *rxe) {
> > +	free_percpu(rxe->pcpu_stats);
> > +}
> > +
> >  /* free resources for all ports on a device */  static void
> > rxe_cleanup_ports(struct rxe_dev *rxe)  { @@ -64,6 +75,7 @@ static
> > void rxe_cleanup(struct rxe_dev *rxe)
> >  	rxe_pool_cleanup(&rxe->mc_elem_pool);
> >
> >  	rxe_cleanup_ports(rxe);
> > +	rxe_cleanup_stats(rxe);
> >
> >  	crypto_free_shash(rxe->tfm);
> >  }
> > @@ -267,6 +279,10 @@ static int rxe_init(struct rxe_dev *rxe)
> >  	/* init default device parameters */
> >  	rxe_init_device_param(rxe);
> >
> > +	err = rxe_init_stats(rxe);
> > +	if (err)
> > +		return err;
> > +
> >  	err = rxe_init_ports(rxe);
> >  	if (err)
> >  		goto err1;
> > @@ -288,6 +304,7 @@ static int rxe_init(struct rxe_dev *rxe)
> >  err2:
> >  	rxe_cleanup_ports(rxe);
> >  err1:
> > +	rxe_cleanup_stats(rxe);
> >  	return err;
> >  }
> >
> > diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > index 6aeb7a1..0fb567a 100644
> > +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
> > @@ -48,6 +48,19 @@
> >  	[RXE_CNT_SEND_ERR]            =  "send_err",
> >  };
> >
> > +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int
> > +index) {
> > +	struct rxe_stats *stats;
> > +	u64 sum = 0;
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> > +		sum += stats->counters[index];
> > +	}
> 
> This is still racy with a writer on another CPU.
> 
I didn't follow you. If other cpu is updating counter for given index, sum will be updated to user in next read cycle whenever user does it.
Counters are always a snapshot when read.
Idea here is to have right sum when multiple cpu writers are writing, which was not the case before this patch.

> Jason
Jason Gunthorpe Nov. 26, 2018, 3:19 a.m. UTC | #5
On Sat, Nov 24, 2018 at 02:27:53PM +0000, Parav Pandit wrote:

> > > +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int
> > > +index) {
> > > +	struct rxe_stats *stats;
> > > +	u64 sum = 0;
> > > +	int cpu;
> > > +
> > > +	for_each_possible_cpu(cpu) {
> > > +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> > > +		sum += stats->counters[index];
> > > +	}
> > 
> > This is still racy with a writer on another CPU.
> 
> I didn't follow you. If other cpu is updating counter for given
> index, sum will be updated to user in next read cycle whenever user
> does it.  Counters are always a snapshot when read.  Idea here is to
> have right sum when multiple cpu writers are writing, which was not
> the case before this patch.

Where is the snapshot?

Reading a u64 is not possible atomically on 32 bit platforms, you get
garbage if there is a race with a writer.

Jason
Parav Pandit Nov. 26, 2018, 4:50 a.m. UTC | #6
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Sunday, November 25, 2018 9:19 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: linux-rdma@vger.kernel.org; dledford@redhat.com; Moni Shoua
> <monis@mellanox.com>
> Subject: Re: [PATCH] IB/rxe: Make counters thread safe
> 
> On Sat, Nov 24, 2018 at 02:27:53PM +0000, Parav Pandit wrote:
> 
> > > > +static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int
> > > > +index) {
> > > > +	struct rxe_stats *stats;
> > > > +	u64 sum = 0;
> > > > +	int cpu;
> > > > +
> > > > +	for_each_possible_cpu(cpu) {
> > > > +		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
> > > > +		sum += stats->counters[index];
> > > > +	}
> > >
> > > This is still racy with a writer on another CPU.
> >
> > I didn't follow you. If other cpu is updating counter for given index,
> > sum will be updated to user in next read cycle whenever user does it.
> > Counters are always a snapshot when read.  Idea here is to have right
> > sum when multiple cpu writers are writing, which was not the case
> > before this patch.
> 
> Where is the snapshot?
> 
> Reading a u64 is not possible atomically on 32 bit platforms, you get garbage
> if there is a race with a writer.
> 
Yes. Didn't consider 32-bit.
Additionally, I am not using this_cpu_inc() variant so it is not preemption safe given that we update these counters from thread context too.
So atomic64_t is better choice that addresses both the issues.

> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 383e65c..722aca9 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -39,6 +39,17 @@ 
 MODULE_DESCRIPTION("Soft RDMA transport");
 MODULE_LICENSE("Dual BSD/GPL");
 
+static int rxe_init_stats(struct rxe_dev *rxe)
+{
+	rxe->pcpu_stats = alloc_percpu(struct rxe_stats);
+	return rxe->pcpu_stats ? 0 : -ENOMEM;
+}
+
+static void rxe_cleanup_stats(struct rxe_dev *rxe)
+{
+	free_percpu(rxe->pcpu_stats);
+}
+
 /* free resources for all ports on a device */
 static void rxe_cleanup_ports(struct rxe_dev *rxe)
 {
@@ -64,6 +75,7 @@  static void rxe_cleanup(struct rxe_dev *rxe)
 	rxe_pool_cleanup(&rxe->mc_elem_pool);
 
 	rxe_cleanup_ports(rxe);
+	rxe_cleanup_stats(rxe);
 
 	crypto_free_shash(rxe->tfm);
 }
@@ -267,6 +279,10 @@  static int rxe_init(struct rxe_dev *rxe)
 	/* init default device parameters */
 	rxe_init_device_param(rxe);
 
+	err = rxe_init_stats(rxe);
+	if (err)
+		return err;
+
 	err = rxe_init_ports(rxe);
 	if (err)
 		goto err1;
@@ -288,6 +304,7 @@  static int rxe_init(struct rxe_dev *rxe)
 err2:
 	rxe_cleanup_ports(rxe);
 err1:
+	rxe_cleanup_stats(rxe);
 	return err;
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
index 6aeb7a1..0fb567a 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
@@ -48,6 +48,19 @@ 
 	[RXE_CNT_SEND_ERR]            =  "send_err",
 };
 
+static u64 rxe_stats_read_by_index(const struct rxe_dev *dev, int index)
+{
+	struct rxe_stats *stats;
+	u64 sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(dev->pcpu_stats, cpu);
+		sum += stats->counters[index];
+	}
+	return sum;
+}
+
 int rxe_ib_get_hw_stats(struct ib_device *ibdev,
 			struct rdma_hw_stats *stats,
 			u8 port, int index)
@@ -58,8 +71,8 @@  int rxe_ib_get_hw_stats(struct ib_device *ibdev,
 	if (!port || !stats)
 		return -EINVAL;
 
-	for (cnt = 0; cnt  < ARRAY_SIZE(rxe_counter_name); cnt++)
-		stats->value[cnt] = dev->stats_counters[cnt];
+	for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++)
+		stats->value[cnt] = rxe_stats_read_by_index(dev, cnt);
 
 	return ARRAY_SIZE(rxe_counter_name);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 82e670d..7daff30 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -380,6 +380,10 @@  struct rxe_port {
 	u32			qp_gsi_index;
 };
 
+struct rxe_stats {
+	u64	counters[RXE_NUM_OF_COUNTERS];
+};
+
 struct rxe_dev {
 	struct ib_device	ib_dev;
 	struct ib_device_attr	attr;
@@ -409,16 +413,18 @@  struct rxe_dev {
 	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
 	int			mmap_offset;
 
-	u64			stats_counters[RXE_NUM_OF_COUNTERS];
-
+	struct rxe_stats	__percpu *pcpu_stats;
 	struct rxe_port		port;
 	struct list_head	list;
 	struct crypto_shash	*tfm;
 };
 
-static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters cnt)
+static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
 {
-	rxe->stats_counters[cnt]++;
+	struct rxe_stats *stats;
+
+	stats = this_cpu_ptr(rxe->pcpu_stats);
+	stats->counters[index]++;
 }
 
 static inline struct rxe_dev *to_rdev(struct ib_device *dev)