diff mbox series

PCI: optimize proc sequential file read

Message ID 20241018054728.116519-1-kanie@linux.alibaba.com (mailing list archive)
State Changes Requested
Headers show
Series PCI: optimize proc sequential file read | expand

Commit Message

Guixin Liu Oct. 18, 2024, 5:47 a.m. UTC
PCI driver will traverse pci device list in pci_seq_start in every
sequential file reading, use xarry to store all pci devices to
accelerate finding the start.

Use "time cat /proc/bus/pci/devices" to test on a machine with 13k
pci devices,  get an increase of about 90%.

Without this patch:
  real 0m0.917s
  user 0m0.000s
  sys  0m0.913s
With this patch:
  real 0m0.093s
  user 0m0.000s
  sys  0m0.093s

Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
---
 drivers/pci/pci.h    |  3 +++
 drivers/pci/probe.c  |  1 +
 drivers/pci/proc.c   | 64 +++++++++++++++++++++++++++++++++++++++-----
 drivers/pci/remove.c |  1 +
 include/linux/pci.h  |  2 ++
 5 files changed, 64 insertions(+), 7 deletions(-)

Comments

Bjorn Helgaas Oct. 18, 2024, 10:22 p.m. UTC | #1
On Fri, Oct 18, 2024 at 01:47:28PM +0800, Guixin Liu wrote:
> PCI driver will traverse pci device list in pci_seq_start in every
> sequential file reading, use xarry to store all pci devices to
> accelerate finding the start.
>
> Use "time cat /proc/bus/pci/devices" to test on a machine with 13k
> pci devices,  get an increase of about 90%.

s/pci/PCI/ (several times)
s/pci_seq_start/pci_seq_start()/
s/xarry/xarray/
s/,  /, / (remove extra space)

> Without this patch:
>   real 0m0.917s
>   user 0m0.000s
>   sys  0m0.913s
> With this patch:
>   real 0m0.093s
>   user 0m0.000s
>   sys  0m0.093s

Nice speedup, for sure!

> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
> ---
>  drivers/pci/pci.h    |  3 +++
>  drivers/pci/probe.c  |  1 +
>  drivers/pci/proc.c   | 64 +++++++++++++++++++++++++++++++++++++++-----
>  drivers/pci/remove.c |  1 +
>  include/linux/pci.h  |  2 ++
>  5 files changed, 64 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 14d00ce45bfa..1a7da91eeb80 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -962,4 +962,7 @@ void pcim_release_region(struct pci_dev *pdev, int bar);
>  	(PCI_CONF1_ADDRESS(bus, dev, func, reg) | \
>  	 PCI_CONF1_EXT_REG(reg))
>  
> +void pci_seq_tree_add_dev(struct pci_dev *dev);
> +void pci_seq_tree_remove_dev(struct pci_dev *dev);
> +
>  #endif /* DRIVERS_PCI_H */
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 4f68414c3086..1fd9e9022f70 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2592,6 +2592,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
>  	WARN_ON(ret < 0);
>  
>  	pci_npem_create(dev);
> +	pci_seq_tree_add_dev(dev);
>  }
>  
>  struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
> diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
> index f967709082d6..30ca071ccad5 100644
> --- a/drivers/pci/proc.c
> +++ b/drivers/pci/proc.c
> @@ -19,6 +19,9 @@
>  
>  static int proc_initialized;	/* = 0 */
>  
> +DEFINE_XARRAY_FLAGS(pci_seq_tree, 0);
> +static unsigned long pci_max_idx;
> +
>  static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
>  {
>  	struct pci_dev *dev = pde_data(file_inode(file));
> @@ -334,25 +337,72 @@ static const struct proc_ops proc_bus_pci_ops = {
>  #endif /* HAVE_PCI_MMAP */
>  };
>  
> +void pci_seq_tree_add_dev(struct pci_dev *dev)
> +{
> +	int ret;
> +
> +	if (dev) {

I don't think we should test "dev" for NULL here.  If it's NULL, I
think we have bigger problems and we should oops.

> +		xa_lock(&pci_seq_tree);
> +		pci_dev_get(dev);
> +		ret = __xa_insert(&pci_seq_tree, pci_max_idx, dev, GFP_KERNEL);
> +		if (!ret) {
> +			dev->proc_seq_idx = pci_max_idx;
> +			pci_max_idx++;
> +		} else {
> +			pci_dev_put(dev);
> +			WARN_ON(ret);
> +		}
> +		xa_unlock(&pci_seq_tree);
> +	}
> +}
> +
> +void pci_seq_tree_remove_dev(struct pci_dev *dev)
> +{
> +	unsigned long idx = dev->proc_seq_idx;
> +	struct pci_dev *latest_dev = NULL;
> +	struct pci_dev *ret;
> +
> +	if (!dev)
> +		return;

Same comment about testing "dev" for NULL.

> +	xa_lock(&pci_seq_tree);
> +	__xa_erase(&pci_seq_tree, idx);
> +	pci_dev_put(dev);
> +	/*
> +	 * Move the latest pci_dev to this idx to keep the continuity.
> +	 */
> +	if (idx != pci_max_idx - 1) {
> +		latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
> +		ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
> +				GFP_KERNEL);
> +		if (!ret)
> +			latest_dev->proc_seq_idx = idx;
> +		WARN_ON(ret);
> +	}
> +	pci_max_idx--;
> +	xa_unlock(&pci_seq_tree);
> +}
> +
>  /* iterator */
>  static void *pci_seq_start(struct seq_file *m, loff_t *pos)
>  {
> -	struct pci_dev *dev = NULL;
> +	struct pci_dev *dev;
>  	loff_t n = *pos;
>  
> -	for_each_pci_dev(dev) {
> -		if (!n--)
> -			break;
> -	}
> +	dev = xa_load(&pci_seq_tree, n);
> +	if (dev)
> +		pci_dev_get(dev);
>  	return dev;

I'm a little hesitant to add another place that keeps track of every
PCI device.  It's a fair bit of code here, and it's redundant
information, which means more work to keep them all synchronized.

This proc interface feels inherently racy.  We keep track of the
current item (n) in a struct seq_file, but I don't think there's
anything to prevent a pci_dev hot-add or -remove between calls to
pci_seq_start().

Is the proc interface the only place to get this information?  If
there's a way to get it from sysfs, maybe that is better and we don't
need to spend effort optimizing the less-desirable path?

>  }
>  
>  static void *pci_seq_next(struct seq_file *m, void *v, loff_t *pos)
>  {
> -	struct pci_dev *dev = v;
> +	struct pci_dev *dev;
>  
>  	(*pos)++;
> -	dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
> +	dev = xa_load(&pci_seq_tree, *pos);
> +	if (dev)
> +		pci_dev_get(dev);

Where is the pci_dev_put() that corresponds with this new
pci_dev_get()?

>  	return dev;
>  }
>  
> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> index e4ce1145aa3e..257ea46233a3 100644
> --- a/drivers/pci/remove.c
> +++ b/drivers/pci/remove.c
> @@ -53,6 +53,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
>  	pci_npem_remove(dev);
>  
>  	device_del(&dev->dev);
> +	pci_seq_tree_remove_dev(dev);
>  
>  	down_write(&pci_bus_sem);
>  	list_del(&dev->bus_list);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 573b4c4c2be6..aeb3d4cce06a 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -534,6 +534,8 @@ struct pci_dev {
>  
>  	/* These methods index pci_reset_fn_methods[] */
>  	u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
> +
> +	unsigned long long	proc_seq_idx;
>  };
>  
>  static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
> -- 
> 2.43.0
>
kernel test robot Oct. 19, 2024, 6:39 a.m. UTC | #2
Hi Guixin,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus linus/master v6.12-rc3 next-20241018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Guixin-Liu/PCI-optimize-proc-sequential-file-read/20241018-135026
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20241018054728.116519-1-kanie%40linux.alibaba.com
patch subject: [PATCH] PCI: optimize proc sequential file read
config: i386-buildonly-randconfig-003-20241019 (https://download.01.org/0day-ci/archive/20241019/202410191439.yQ27wvB6-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241019/202410191439.yQ27wvB6-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410191439.yQ27wvB6-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: drivers/pci/probe.o: in function `pci_device_add':
>> drivers/pci/probe.c:2595: undefined reference to `pci_seq_tree_add_dev'
   ld: drivers/pci/remove.o: in function `pci_destroy_dev':
>> drivers/pci/remove.c:56: undefined reference to `pci_seq_tree_remove_dev'


vim +2595 drivers/pci/probe.c

  2546	
  2547	void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
  2548	{
  2549		int ret;
  2550	
  2551		pci_configure_device(dev);
  2552	
  2553		device_initialize(&dev->dev);
  2554		dev->dev.release = pci_release_dev;
  2555	
  2556		set_dev_node(&dev->dev, pcibus_to_node(bus));
  2557		dev->dev.dma_mask = &dev->dma_mask;
  2558		dev->dev.dma_parms = &dev->dma_parms;
  2559		dev->dev.coherent_dma_mask = 0xffffffffull;
  2560	
  2561		dma_set_max_seg_size(&dev->dev, 65536);
  2562		dma_set_seg_boundary(&dev->dev, 0xffffffff);
  2563	
  2564		pcie_failed_link_retrain(dev);
  2565	
  2566		/* Fix up broken headers */
  2567		pci_fixup_device(pci_fixup_header, dev);
  2568	
  2569		pci_reassigndev_resource_alignment(dev);
  2570	
  2571		dev->state_saved = false;
  2572	
  2573		pci_init_capabilities(dev);
  2574	
  2575		/*
  2576		 * Add the device to our list of discovered devices
  2577		 * and the bus list for fixup functions, etc.
  2578		 */
  2579		down_write(&pci_bus_sem);
  2580		list_add_tail(&dev->bus_list, &bus->devices);
  2581		up_write(&pci_bus_sem);
  2582	
  2583		ret = pcibios_device_add(dev);
  2584		WARN_ON(ret < 0);
  2585	
  2586		/* Set up MSI IRQ domain */
  2587		pci_set_msi_domain(dev);
  2588	
  2589		/* Notifier could use PCI capabilities */
  2590		dev->match_driver = false;
  2591		ret = device_add(&dev->dev);
  2592		WARN_ON(ret < 0);
  2593	
  2594		pci_npem_create(dev);
> 2595		pci_seq_tree_add_dev(dev);
  2596	}
  2597
kernel test robot Oct. 19, 2024, 6:41 p.m. UTC | #3
Hi Guixin,

kernel test robot noticed the following build errors:

[auto build test ERROR on pci/next]
[also build test ERROR on pci/for-linus linus/master v6.12-rc3 next-20241018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Guixin-Liu/PCI-optimize-proc-sequential-file-read/20241018-135026
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20241018054728.116519-1-kanie%40linux.alibaba.com
patch subject: [PATCH] PCI: optimize proc sequential file read
config: loongarch-randconfig-r062-20241019 (https://download.01.org/0day-ci/archive/20241020/202410200211.RFdwjJSq-lkp@intel.com/config)
compiler: loongarch64-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241020/202410200211.RFdwjJSq-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410200211.RFdwjJSq-lkp@intel.com/

All errors (new ones prefixed by >>):

   loongarch64-linux-ld: drivers/pci/probe.o: in function `.L5491':
>> probe.c:(.text+0x2ff8): undefined reference to `pci_seq_tree_add_dev'
   loongarch64-linux-ld: drivers/pci/remove.o: in function `.LBB262':
>> drivers/pci/remove.c:56:(.text+0x398): undefined reference to `pci_seq_tree_remove_dev'
kernel test robot Oct. 20, 2024, 7:04 a.m. UTC | #4
Hi Guixin,

kernel test robot noticed the following build warnings:

[auto build test WARNING on pci/next]
[also build test WARNING on pci/for-linus linus/master v6.12-rc3 next-20241018]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Guixin-Liu/PCI-optimize-proc-sequential-file-read/20241018-135026
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20241018054728.116519-1-kanie%40linux.alibaba.com
patch subject: [PATCH] PCI: optimize proc sequential file read
config: x86_64-randconfig-122-20241019 (https://download.01.org/0day-ci/archive/20241020/202410201436.9i93qAYF-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241020/202410201436.9i93qAYF-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410201436.9i93qAYF-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> drivers/pci/proc.c:22:1: sparse: sparse: symbol 'pci_seq_tree' was not declared. Should it be static?

vim +/pci_seq_tree +22 drivers/pci/proc.c

    21	
  > 22	DEFINE_XARRAY_FLAGS(pci_seq_tree, 0);
    23	static unsigned long pci_max_idx;
    24
Guixin Liu Oct. 21, 2024, 2:04 a.m. UTC | #5
在 2024/10/19 06:22, Bjorn Helgaas 写道:
> On Fri, Oct 18, 2024 at 01:47:28PM +0800, Guixin Liu wrote:
>> PCI driver will traverse pci device list in pci_seq_start in every
>> sequential file reading, use xarry to store all pci devices to
>> accelerate finding the start.
>>
>> Use "time cat /proc/bus/pci/devices" to test on a machine with 13k
>> pci devices,  get an increase of about 90%.
> s/pci/PCI/ (several times)
> s/pci_seq_start/pci_seq_start()/
> s/xarry/xarray/
> s/,  /, / (remove extra space)
>
OK.
>> Without this patch:
>>    real 0m0.917s
>>    user 0m0.000s
>>    sys  0m0.913s
>> With this patch:
>>    real 0m0.093s
>>    user 0m0.000s
>>    sys  0m0.093s
> Nice speedup, for sure!
>
>> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>
>> ---
>>   drivers/pci/pci.h    |  3 +++
>>   drivers/pci/probe.c  |  1 +
>>   drivers/pci/proc.c   | 64 +++++++++++++++++++++++++++++++++++++++-----
>>   drivers/pci/remove.c |  1 +
>>   include/linux/pci.h  |  2 ++
>>   5 files changed, 64 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index 14d00ce45bfa..1a7da91eeb80 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -962,4 +962,7 @@ void pcim_release_region(struct pci_dev *pdev, int bar);
>>   	(PCI_CONF1_ADDRESS(bus, dev, func, reg) | \
>>   	 PCI_CONF1_EXT_REG(reg))
>>   
>> +void pci_seq_tree_add_dev(struct pci_dev *dev);
>> +void pci_seq_tree_remove_dev(struct pci_dev *dev);
>> +
>>   #endif /* DRIVERS_PCI_H */
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 4f68414c3086..1fd9e9022f70 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -2592,6 +2592,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
>>   	WARN_ON(ret < 0);
>>   
>>   	pci_npem_create(dev);
>> +	pci_seq_tree_add_dev(dev);
>>   }
>>   
>>   struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
>> diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
>> index f967709082d6..30ca071ccad5 100644
>> --- a/drivers/pci/proc.c
>> +++ b/drivers/pci/proc.c
>> @@ -19,6 +19,9 @@
>>   
>>   static int proc_initialized;	/* = 0 */
>>   
>> +DEFINE_XARRAY_FLAGS(pci_seq_tree, 0);
>> +static unsigned long pci_max_idx;
>> +
>>   static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
>>   {
>>   	struct pci_dev *dev = pde_data(file_inode(file));
>> @@ -334,25 +337,72 @@ static const struct proc_ops proc_bus_pci_ops = {
>>   #endif /* HAVE_PCI_MMAP */
>>   };
>>   
>> +void pci_seq_tree_add_dev(struct pci_dev *dev)
>> +{
>> +	int ret;
>> +
>> +	if (dev) {
> I don't think we should test "dev" for NULL here.  If it's NULL, I
> think we have bigger problems and we should oops.
Sure.
>> +		xa_lock(&pci_seq_tree);
>> +		pci_dev_get(dev);
>> +		ret = __xa_insert(&pci_seq_tree, pci_max_idx, dev, GFP_KERNEL);
>> +		if (!ret) {
>> +			dev->proc_seq_idx = pci_max_idx;
>> +			pci_max_idx++;
>> +		} else {
>> +			pci_dev_put(dev);
>> +			WARN_ON(ret);
>> +		}
>> +		xa_unlock(&pci_seq_tree);
>> +	}
>> +}
>> +
>> +void pci_seq_tree_remove_dev(struct pci_dev *dev)
>> +{
>> +	unsigned long idx = dev->proc_seq_idx;
>> +	struct pci_dev *latest_dev = NULL;
>> +	struct pci_dev *ret;
>> +
>> +	if (!dev)
>> +		return;
> Same comment about testing "dev" for NULL.
>
Ok.
>> +	xa_lock(&pci_seq_tree);
>> +	__xa_erase(&pci_seq_tree, idx);
>> +	pci_dev_put(dev);
>> +	/*
>> +	 * Move the latest pci_dev to this idx to keep the continuity.
>> +	 */
>> +	if (idx != pci_max_idx - 1) {
>> +		latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
>> +		ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
>> +				GFP_KERNEL);
>> +		if (!ret)
>> +			latest_dev->proc_seq_idx = idx;
>> +		WARN_ON(ret);
>> +	}
>> +	pci_max_idx--;
>> +	xa_unlock(&pci_seq_tree);
>> +}
>> +
>>   /* iterator */
>>   static void *pci_seq_start(struct seq_file *m, loff_t *pos)
>>   {
>> -	struct pci_dev *dev = NULL;
>> +	struct pci_dev *dev;
>>   	loff_t n = *pos;
>>   
>> -	for_each_pci_dev(dev) {
>> -		if (!n--)
>> -			break;
>> -	}
>> +	dev = xa_load(&pci_seq_tree, n);
>> +	if (dev)
>> +		pci_dev_get(dev);
>>   	return dev;
> I'm a little hesitant to add another place that keeps track of every
> PCI device.  It's a fair bit of code here, and it's redundant
> information, which means more work to keep them all synchronized.
>
> This proc interface feels inherently racy.  We keep track of the
> current item (n) in a struct seq_file, but I don't think there's
> anything to prevent a pci_dev hot-add or -remove between calls to
> pci_seq_start().
Yes, maybe lost some information this time.
>
> Is the proc interface the only place to get this information?  If
> there's a way to get it from sysfs, maybe that is better and we don't
> need to spend effort optimizing the less-desirable path?

This is the situation I encountered: in scenarios of rapid container

scaling, when a container is started, it executes lscpu to traverse

the /proc/bus/pci/devices file, or the container process directly

traverses this file. When many containers are being started at once,

it causes numerous containers to wait due to the locks on the klist

used for traversing pci_dev, which greatly reduces the efficiency of

container scaling and also causes other CPUs to become unresponsive.


User-space programs, including Docker, are clients that we cannot easily 
modify.

Therefore, I attempted to accelerate pci_seq_start() within the kernel.

This indeed resulted in the need for more code to maintain, as we must

ensure both fast access and ordering. Initially, I considered directly

modifying the klist in the driver base module, but such changes would

impact other drivers as well.

Do you have any other good suggestions?


Best Regards,

Guixin liu

>>   }
>>   
>>   static void *pci_seq_next(struct seq_file *m, void *v, loff_t *pos)
>>   {
>> -	struct pci_dev *dev = v;
>> +	struct pci_dev *dev;
>>   
>>   	(*pos)++;
>> -	dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
>> +	dev = xa_load(&pci_seq_tree, *pos);
>> +	if (dev)
>> +		pci_dev_get(dev);
> Where is the pci_dev_put() that corresponds with this new
> pci_dev_get()?
In pci_seq_stop(), will call the pci_dev_put().
>
>>   	return dev;
>>   }
>>   
>> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
>> index e4ce1145aa3e..257ea46233a3 100644
>> --- a/drivers/pci/remove.c
>> +++ b/drivers/pci/remove.c
>> @@ -53,6 +53,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
>>   	pci_npem_remove(dev);
>>   
>>   	device_del(&dev->dev);
>> +	pci_seq_tree_remove_dev(dev);
>>   
>>   	down_write(&pci_bus_sem);
>>   	list_del(&dev->bus_list);
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 573b4c4c2be6..aeb3d4cce06a 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -534,6 +534,8 @@ struct pci_dev {
>>   
>>   	/* These methods index pci_reset_fn_methods[] */
>>   	u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
>> +
>> +	unsigned long long	proc_seq_idx;
>>   };
>>   
>>   static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
>> -- 
>> 2.43.0
>>
Dan Carpenter Oct. 21, 2024, 7:17 a.m. UTC | #6
Hi Guixin,

kernel test robot noticed the following build warnings:

https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Guixin-Liu/PCI-optimize-proc-sequential-file-read/20241018-135026
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
patch link:    https://lore.kernel.org/r/20241018054728.116519-1-kanie%40linux.alibaba.com
patch subject: [PATCH] PCI: optimize proc sequential file read
config: i386-randconfig-141-20241019 (https://download.01.org/0day-ci/archive/20241019/202410190659.9MCI8EyL-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202410190659.9MCI8EyL-lkp@intel.com/

smatch warnings:
drivers/pci/proc.c:365 pci_seq_tree_remove_dev() warn: variable dereferenced before check 'dev' (see line 361)

vim +/dev +365 drivers/pci/proc.c

4ca256d9a0e58a Guixin Liu 2024-10-18  359  void pci_seq_tree_remove_dev(struct pci_dev *dev)
4ca256d9a0e58a Guixin Liu 2024-10-18  360  {
4ca256d9a0e58a Guixin Liu 2024-10-18 @361  	unsigned long idx = dev->proc_seq_idx;
                                                                    ^^^^^^^^^^^^^^^^^
Dereferenced

4ca256d9a0e58a Guixin Liu 2024-10-18  362  	struct pci_dev *latest_dev = NULL;
4ca256d9a0e58a Guixin Liu 2024-10-18  363  	struct pci_dev *ret;
4ca256d9a0e58a Guixin Liu 2024-10-18  364  
4ca256d9a0e58a Guixin Liu 2024-10-18 @365  	if (!dev)
                                                    ^^^^
Checked too late

4ca256d9a0e58a Guixin Liu 2024-10-18  366  		return;
4ca256d9a0e58a Guixin Liu 2024-10-18  367  
4ca256d9a0e58a Guixin Liu 2024-10-18  368  	xa_lock(&pci_seq_tree);
4ca256d9a0e58a Guixin Liu 2024-10-18  369  	__xa_erase(&pci_seq_tree, idx);
4ca256d9a0e58a Guixin Liu 2024-10-18  370  	pci_dev_put(dev);
4ca256d9a0e58a Guixin Liu 2024-10-18  371  	/*
4ca256d9a0e58a Guixin Liu 2024-10-18  372  	 * Move the latest pci_dev to this idx to keep the continuity.
4ca256d9a0e58a Guixin Liu 2024-10-18  373  	 */
4ca256d9a0e58a Guixin Liu 2024-10-18  374  	if (idx != pci_max_idx - 1) {
4ca256d9a0e58a Guixin Liu 2024-10-18  375  		latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
4ca256d9a0e58a Guixin Liu 2024-10-18  376  		ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
4ca256d9a0e58a Guixin Liu 2024-10-18  377  				GFP_KERNEL);
4ca256d9a0e58a Guixin Liu 2024-10-18  378  		if (!ret)
4ca256d9a0e58a Guixin Liu 2024-10-18  379  			latest_dev->proc_seq_idx = idx;
4ca256d9a0e58a Guixin Liu 2024-10-18  380  		WARN_ON(ret);
4ca256d9a0e58a Guixin Liu 2024-10-18  381  	}
4ca256d9a0e58a Guixin Liu 2024-10-18  382  	pci_max_idx--;
4ca256d9a0e58a Guixin Liu 2024-10-18  383  	xa_unlock(&pci_seq_tree);
4ca256d9a0e58a Guixin Liu 2024-10-18  384  }
Greg KH Oct. 21, 2024, 11:04 a.m. UTC | #7
On Mon, Oct 21, 2024 at 10:04:03AM +0800, Guixin Liu wrote:
> > This proc interface feels inherently racy.  We keep track of the
> > current item (n) in a struct seq_file, but I don't think there's
> > anything to prevent a pci_dev hot-add or -remove between calls to
> > pci_seq_start().
> Yes, maybe lost some information this time.
> > 
> > Is the proc interface the only place to get this information?  If
> > there's a way to get it from sysfs, maybe that is better and we don't
> > need to spend effort optimizing the less-desirable path?
> 
> This is the situation I encountered: in scenarios of rapid container
> 
> scaling, when a container is started, it executes lscpu to traverse
> 
> the /proc/bus/pci/devices file, or the container process directly
> 
> traverses this file. When many containers are being started at once,
> 
> it causes numerous containers to wait due to the locks on the klist
> 
> used for traversing pci_dev, which greatly reduces the efficiency of
> 
> container scaling and also causes other CPUs to become unresponsive.
> 
> 
> User-space programs, including Docker, are clients that we cannot easily
> modify.
> 
> Therefore, I attempted to accelerate pci_seq_start() within the kernel.
> 
> This indeed resulted in the need for more code to maintain, as we must
> 
> ensure both fast access and ordering. Initially, I considered directly
> 
> modifying the klist in the driver base module, but such changes would
> 
> impact other drivers as well.

I am not opposed to any driver core changes where we iterate over lists
of objects on a bus, but usually that's not a real "hot path" that
matters.  Also, you need to keep the lists in a specific order, which I
do not think an xarray will do very well (or at least not the last time
I looked at it, I could be wrong.)

I understand your need to want to make 'lspci' faster, but by default,
'lspci' does not access the proc files, I thought it went through sysfs.
Why not just fix the userspace tool instead if that's the real issue?

Just because you can modify the kernel, doesn't mean you should, often
the better solution is to fix userspace from doing something dumb, and
if it is doing something dumb, then let it and have it deal with the
consequences :)

thanks,

greg k-h
Guixin Liu Oct. 22, 2024, 1:54 a.m. UTC | #8
在 2024/10/21 15:17, Dan Carpenter 写道:
> Hi Guixin,
>
> kernel test robot noticed the following build warnings:
>
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/Guixin-Liu/PCI-optimize-proc-sequential-file-read/20241018-135026
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git next
> patch link:    https://lore.kernel.org/r/20241018054728.116519-1-kanie%40linux.alibaba.com
> patch subject: [PATCH] PCI: optimize proc sequential file read
> config: i386-randconfig-141-20241019 (https://download.01.org/0day-ci/archive/20241019/202410190659.9MCI8EyL-lkp@intel.com/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> | Closes: https://lore.kernel.org/r/202410190659.9MCI8EyL-lkp@intel.com/
>
> smatch warnings:
> drivers/pci/proc.c:365 pci_seq_tree_remove_dev() warn: variable dereferenced before check 'dev' (see line 361)
>
> vim +/dev +365 drivers/pci/proc.c
>
> 4ca256d9a0e58a Guixin Liu 2024-10-18  359  void pci_seq_tree_remove_dev(struct pci_dev *dev)
> 4ca256d9a0e58a Guixin Liu 2024-10-18  360  {
> 4ca256d9a0e58a Guixin Liu 2024-10-18 @361  	unsigned long idx = dev->proc_seq_idx;
>                                                                      ^^^^^^^^^^^^^^^^^
> Dereferenced

Thanks, changed in v2.

Best Regards,

Guixin Liu

> 4ca256d9a0e58a Guixin Liu 2024-10-18  362  	struct pci_dev *latest_dev = NULL;
> 4ca256d9a0e58a Guixin Liu 2024-10-18  363  	struct pci_dev *ret;
> 4ca256d9a0e58a Guixin Liu 2024-10-18  364
> 4ca256d9a0e58a Guixin Liu 2024-10-18 @365  	if (!dev)
>                                                      ^^^^
> Checked too late
>
> 4ca256d9a0e58a Guixin Liu 2024-10-18  366  		return;
> 4ca256d9a0e58a Guixin Liu 2024-10-18  367
> 4ca256d9a0e58a Guixin Liu 2024-10-18  368  	xa_lock(&pci_seq_tree);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  369  	__xa_erase(&pci_seq_tree, idx);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  370  	pci_dev_put(dev);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  371  	/*
> 4ca256d9a0e58a Guixin Liu 2024-10-18  372  	 * Move the latest pci_dev to this idx to keep the continuity.
> 4ca256d9a0e58a Guixin Liu 2024-10-18  373  	 */
> 4ca256d9a0e58a Guixin Liu 2024-10-18  374  	if (idx != pci_max_idx - 1) {
> 4ca256d9a0e58a Guixin Liu 2024-10-18  375  		latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  376  		ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
> 4ca256d9a0e58a Guixin Liu 2024-10-18  377  				GFP_KERNEL);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  378  		if (!ret)
> 4ca256d9a0e58a Guixin Liu 2024-10-18  379  			latest_dev->proc_seq_idx = idx;
> 4ca256d9a0e58a Guixin Liu 2024-10-18  380  		WARN_ON(ret);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  381  	}
> 4ca256d9a0e58a Guixin Liu 2024-10-18  382  	pci_max_idx--;
> 4ca256d9a0e58a Guixin Liu 2024-10-18  383  	xa_unlock(&pci_seq_tree);
> 4ca256d9a0e58a Guixin Liu 2024-10-18  384  }
>
Guixin Liu Oct. 22, 2024, 2:21 a.m. UTC | #9
在 2024/10/21 19:04, Greg KH 写道:
> On Mon, Oct 21, 2024 at 10:04:03AM +0800, Guixin Liu wrote:
>>> This proc interface feels inherently racy.  We keep track of the
>>> current item (n) in a struct seq_file, but I don't think there's
>>> anything to prevent a pci_dev hot-add or -remove between calls to
>>> pci_seq_start().
>> Yes, maybe lost some information this time.
>>> Is the proc interface the only place to get this information?  If
>>> there's a way to get it from sysfs, maybe that is better and we don't
>>> need to spend effort optimizing the less-desirable path?
>> This is the situation I encountered: in scenarios of rapid container
>>
>> scaling, when a container is started, it executes lscpu to traverse
>>
>> the /proc/bus/pci/devices file, or the container process directly
>>
>> traverses this file. When many containers are being started at once,
>>
>> it causes numerous containers to wait due to the locks on the klist
>>
>> used for traversing pci_dev, which greatly reduces the efficiency of
>>
>> container scaling and also causes other CPUs to become unresponsive.
>>
>>
>> User-space programs, including Docker, are clients that we cannot easily
>> modify.
>>
>> Therefore, I attempted to accelerate pci_seq_start() within the kernel.
>>
>> This indeed resulted in the need for more code to maintain, as we must
>>
>> ensure both fast access and ordering. Initially, I considered directly
>>
>> modifying the klist in the driver base module, but such changes would
>>
>> impact other drivers as well.
> I am not opposed to any driver core changes where we iterate over lists
> of objects on a bus, but usually that's not a real "hot path" that
> matters.  Also, you need to keep the lists in a specific order, which I
> do not think an xarray will do very well (or at least not the last time
> I looked at it, I could be wrong.)
>
> I understand your need to want to make 'lspci' faster, but by default,
> 'lspci' does not access the proc files, I thought it went through sysfs.
> Why not just fix the userspace tool instead if that's the real issue?
>
> Just because you can modify the kernel, doesn't mean you should, often
> the better solution is to fix userspace from doing something dumb, and
> if it is doing something dumb, then let it and have it deal with the
> consequences :)
>
> thanks,
>
> greg k-h

Actually, lscpu may access PCI proc files and could potentially iterate 
three times.

You can see this in the lscpu source code within the function

lscpu_read_virtualization()->has_pci_device(). We've previously 
optimized it to

combine those three iterations into one, as reflected in this PR:

https://github.com/util-linux/util-linux/pull/3177.


Additionally, many of our clients’ tools and programs also access PCI proc

files, but we are not allowed to modify that part; they do not provide us

with the source code. Clients tend to perceive that the kernel performance

we provide is subpar(That's why we have stopped giving 'lscpu' direct 
access to sysfs).

We have already implemented such optimizations in our operating system and

achieved significant performance benefits. If this patch can be accepted 
into

the community, it would be great for more people to benefit from it.

Best Regards,

Guixin Liu
Bjorn Helgaas Oct. 22, 2024, 3:44 p.m. UTC | #10
[+cc Greg]

On Fri, Oct 18, 2024 at 01:47:28PM +0800, Guixin Liu wrote:
> PCI driver will traverse pci device list in pci_seq_start in every
> sequential file reading, use xarry to store all pci devices to
> accelerate finding the start.

>  /* iterator */
>  static void *pci_seq_start(struct seq_file *m, loff_t *pos)
>  {
> -	struct pci_dev *dev = NULL;
> +	struct pci_dev *dev;
>  	loff_t n = *pos;
>  
> -	for_each_pci_dev(dev) {
> -		if (!n--)
> -			break;
> -	}

Maybe another approach would be to use pci_get_device() directly
instead of for_each_pci_dev().

pci_get_device() takes a "from" starting point, so instead of keeping
track of the index "n", you could keep track of the current pci_dev.
Guixin Liu Oct. 24, 2024, 3:42 a.m. UTC | #11
在 2024/10/22 23:44, Bjorn Helgaas 写道:
> [+cc Greg]
>
> On Fri, Oct 18, 2024 at 01:47:28PM +0800, Guixin Liu wrote:
>> PCI driver will traverse pci device list in pci_seq_start in every
>> sequential file reading, use xarry to store all pci devices to
>> accelerate finding the start.
>>   /* iterator */
>>   static void *pci_seq_start(struct seq_file *m, loff_t *pos)
>>   {
>> -	struct pci_dev *dev = NULL;
>> +	struct pci_dev *dev;
>>   	loff_t n = *pos;
>>   
>> -	for_each_pci_dev(dev) {
>> -		if (!n--)
>> -			break;
>> -	}
> Maybe another approach would be to use pci_get_device() directly
> instead of for_each_pci_dev().
>
> pci_get_device() takes a "from" starting point, so instead of keeping
> track of the index "n", you could keep track of the current pci_dev.

You mean let struct seq_file to keep the pci_dev? Well, we also need keep

the pci_dev which held by seq_file in klist_devices, because finding 
next rely on

it, this make things complicated.

In addition, in pci_seq_start(), the pos may not increased one by one, so

we dont know how many times we should skip to find the pci_dev which number

is *pos.

Best Regards,

Guixin Liu
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..1a7da91eeb80 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -962,4 +962,7 @@  void pcim_release_region(struct pci_dev *pdev, int bar);
 	(PCI_CONF1_ADDRESS(bus, dev, func, reg) | \
 	 PCI_CONF1_EXT_REG(reg))
 
+void pci_seq_tree_add_dev(struct pci_dev *dev);
+void pci_seq_tree_remove_dev(struct pci_dev *dev);
+
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..1fd9e9022f70 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2592,6 +2592,7 @@  void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	WARN_ON(ret < 0);
 
 	pci_npem_create(dev);
+	pci_seq_tree_add_dev(dev);
 }
 
 struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index f967709082d6..30ca071ccad5 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -19,6 +19,9 @@ 
 
 static int proc_initialized;	/* = 0 */
 
+DEFINE_XARRAY_FLAGS(pci_seq_tree, 0);
+static unsigned long pci_max_idx;
+
 static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence)
 {
 	struct pci_dev *dev = pde_data(file_inode(file));
@@ -334,25 +337,72 @@  static const struct proc_ops proc_bus_pci_ops = {
 #endif /* HAVE_PCI_MMAP */
 };
 
+void pci_seq_tree_add_dev(struct pci_dev *dev)
+{
+	int ret;
+
+	if (dev) {
+		xa_lock(&pci_seq_tree);
+		pci_dev_get(dev);
+		ret = __xa_insert(&pci_seq_tree, pci_max_idx, dev, GFP_KERNEL);
+		if (!ret) {
+			dev->proc_seq_idx = pci_max_idx;
+			pci_max_idx++;
+		} else {
+			pci_dev_put(dev);
+			WARN_ON(ret);
+		}
+		xa_unlock(&pci_seq_tree);
+	}
+}
+
+void pci_seq_tree_remove_dev(struct pci_dev *dev)
+{
+	unsigned long idx = dev->proc_seq_idx;
+	struct pci_dev *latest_dev = NULL;
+	struct pci_dev *ret;
+
+	if (!dev)
+		return;
+
+	xa_lock(&pci_seq_tree);
+	__xa_erase(&pci_seq_tree, idx);
+	pci_dev_put(dev);
+	/*
+	 * Move the latest pci_dev to this idx to keep the continuity.
+	 */
+	if (idx != pci_max_idx - 1) {
+		latest_dev = __xa_erase(&pci_seq_tree, pci_max_idx - 1);
+		ret = __xa_cmpxchg(&pci_seq_tree, idx, NULL, latest_dev,
+				GFP_KERNEL);
+		if (!ret)
+			latest_dev->proc_seq_idx = idx;
+		WARN_ON(ret);
+	}
+	pci_max_idx--;
+	xa_unlock(&pci_seq_tree);
+}
+
 /* iterator */
 static void *pci_seq_start(struct seq_file *m, loff_t *pos)
 {
-	struct pci_dev *dev = NULL;
+	struct pci_dev *dev;
 	loff_t n = *pos;
 
-	for_each_pci_dev(dev) {
-		if (!n--)
-			break;
-	}
+	dev = xa_load(&pci_seq_tree, n);
+	if (dev)
+		pci_dev_get(dev);
 	return dev;
 }
 
 static void *pci_seq_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct pci_dev *dev = v;
+	struct pci_dev *dev;
 
 	(*pos)++;
-	dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+	dev = xa_load(&pci_seq_tree, *pos);
+	if (dev)
+		pci_dev_get(dev);
 	return dev;
 }
 
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index e4ce1145aa3e..257ea46233a3 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -53,6 +53,7 @@  static void pci_destroy_dev(struct pci_dev *dev)
 	pci_npem_remove(dev);
 
 	device_del(&dev->dev);
+	pci_seq_tree_remove_dev(dev);
 
 	down_write(&pci_bus_sem);
 	list_del(&dev->bus_list);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..aeb3d4cce06a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -534,6 +534,8 @@  struct pci_dev {
 
 	/* These methods index pci_reset_fn_methods[] */
 	u8 reset_methods[PCI_NUM_RESET_METHODS]; /* In priority order */
+
+	unsigned long long	proc_seq_idx;
 };
 
 static inline struct pci_dev *pci_physfn(struct pci_dev *dev)