diff mbox series

[v8,2/2] x86/sgx: Add an attribute for the amount of SGX memory in a NUMA node

Message ID 20211018135744.45527-2-jarkko@kernel.org (mailing list archive)
State New, archived
Headers show
Series [v8,1/2] x86/sgx: Rename fallback labels in sgx_init() | expand

Commit Message

Jarkko Sakkinen Oct. 18, 2021, 1:57 p.m. UTC
The amount of SGX memory on the system is determined by the BIOS and it
varies wildly between systems.  It can be from dozens of MBs on desktops
or VMs, up to many GBs on servers.  Just like for regular memory, it is
sometimes useful to know the amount of usable SGX memory in the system.

Add an attribute for the amount of SGX memory in bytes to each NUMA
node. The path is /sys/devices/system/node/node[0-9]*/sgx/size.
Calculate these values by summing up EPC section sizes for each node
during the driver initialization.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
v8:
* Fix a bug in sgx_numa_init(): node->dev should be only set after
  sysfs_create_group().  Otherwise, sysfs_remove_group() will issue a
  warning in sgx_numa_exit(), when sysfs_create_group() is unsuccessful,
  because the group does not exist.

v7:
* Shorten memory_size to size. The prefix makes the name only longer
  but does not clarify things more than "size" would.
* Use device_attribute instead of kobj_attribute.
* Use named attribute group instead of creating raw kobject just for
  the "sgx" subdirectory.

v6:
* Initialize node->size to zero in sgx_setup_epc_section(), when the
  node is first accessed.

v5:
* A new patch based on the discussion on
  https://lore.kernel.org/linux-sgx/3a7cab4115b4f902f3509ad8652e616b91703e1d.camel@kernel.org/T/#t
---
 Documentation/ABI/stable/sysfs-devices-node |  7 ++
 arch/x86/kernel/cpu/sgx/main.c              | 85 +++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h               |  2 +
 3 files changed, 94 insertions(+)

Comments

Greg KH Oct. 18, 2021, 2:35 p.m. UTC | #1
On Mon, Oct 18, 2021 at 04:57:44PM +0300, Jarkko Sakkinen wrote:
> The amount of SGX memory on the system is determined by the BIOS and it
> varies wildly between systems.  It can be from dozens of MB's on desktops
> or VM's, up to many GB's on servers.  Just like for regular memory, it is
> sometimes useful to know the amount of usable SGX memory in the system.
> 
> Add an attribute for the amount of SGX memory in bytes to each NUMA
> node. The path is /sys/devices/system/node/node[0-9]*/sgx/size.
> Calculate these values by summing up EPC section sizes for each node
> during the driver initalization.
> 
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> ---
> v8:
> * Fix a bug in sgx_numa_init(): node->dev should be only set after
>   sysfe_create_group().  Otherwise, sysfs_remove_group() will issue a
>   warning in sgx_numa_exit(), when sgx_create_group() is unsuccessful,
>   because the group does not exist.
> 
> v7:
> * Shorten memory_size to size. The prefix makes the name only longer
>   but does not clarify things more than "size" would.
> * Use device_attribute instead of kobj_attribute.
> * Use named attribute group instead of creating raw kobject just for
>   the "sgx" subdirectory.
> 
> v6:
> * Initialize node->size to zero in sgx_setup_epc_section(), when the
>   node is first accessed.
> 
> v5
> * A new patch based on the discussion on
>   https://lore.kernel.org/linux-sgx/3a7cab4115b4f902f3509ad8652e616b91703e1d.camel@kernel.org/T/#t
> ---
>  Documentation/ABI/stable/sysfs-devices-node |  7 ++
>  arch/x86/kernel/cpu/sgx/main.c              | 85 +++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/sgx.h               |  2 +
>  3 files changed, 94 insertions(+)
> 
> diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
> index 484fc04bcc25..12dc2149e8e0 100644
> --- a/Documentation/ABI/stable/sysfs-devices-node
> +++ b/Documentation/ABI/stable/sysfs-devices-node
> @@ -176,3 +176,10 @@ Contact:	Keith Busch <keith.busch@intel.com>
>  Description:
>  		The cache write policy: 0 for write-back, 1 for write-through,
>  		other or unknown.
> +
> +What:		/sys/devices/system/node/nodeX/sgx/size
> +Date:		October 2021
> +Contact:	Jarkko Sakkinen <jarkko@kernel.org>
> +Description:
> +		Total available physical SGX memory, also known as Enclave Page
> +		Cache (EPC), in bytes.
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index a6e313f1a82d..dc1d46c51323 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -714,9 +714,11 @@ static bool __init sgx_page_cache_init(void)
>  			spin_lock_init(&sgx_numa_nodes[nid].lock);
>  			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
>  			node_set(nid, sgx_numa_mask);
> +			sgx_numa_nodes[nid].size = 0;
>  		}
>  
>  		sgx_epc_sections[i].node =  &sgx_numa_nodes[nid];
> +		sgx_numa_nodes[nid].size += size;
>  
>  		sgx_nr_epc_sections++;
>  	}
> @@ -790,6 +792,81 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
>  }
>  EXPORT_SYMBOL_GPL(sgx_set_attribute);
>  
> +#ifdef CONFIG_NUMA
> +static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> +	unsigned long size = 0;
> +	int nid;
> +
> +	for (nid = 0; nid < num_possible_nodes(); nid++) {
> +		if (dev == sgx_numa_nodes[nid].dev) {
> +			size = sgx_numa_nodes[nid].size;
> +			break;
> +		}
> +	}
> +
> +	return sysfs_emit(buf, "%lu\n", size);
> +}
> +DEVICE_ATTR_RO(size);
> +
> +static struct attribute *sgx_node_attrs[] = {
> +	&dev_attr_size.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group sgx_node_attr_group = {
> +	.name = "sgx",
> +	.attrs = sgx_node_attrs,
> +};
> +
> +static void sgx_numa_exit(void)
> +{
> +	struct device *dev;
> +	int nid;
> +
> +	for (nid = 0; nid < num_possible_nodes(); nid++) {
> +		dev = &node_devices[nid]->dev;
> +		if (dev)
> +			sysfs_remove_group(&dev->kobj, &sgx_node_attr_group);
> +	}
> +}
> +
> +static bool sgx_numa_init(void)
> +{
> +	struct sgx_numa_node *node;
> +	struct device *dev;
> +	int nid;
> +	int ret;
> +
> +	for (nid = 0; nid < num_possible_nodes(); nid++) {
> +		if (!sgx_numa_nodes[nid].size)
> +			continue;
> +
> +		node = &sgx_numa_nodes[nid];
> +		dev = &node_devices[nid]->dev;
> +
> +		ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);

A huge hint, if a driver has to call a sysfs_* call, something is wrong.

Something is wrong here.

Why are you messing around with a kobject?  This is a device, that you
control, you can just set the default attribute group for it and then
the driver core will add and remove the sysfs group at the proper time,
in the proper way.  Right now you are racing userspace and losing.

Use the default group list, that is what it is there for.

thanks,

greg k-h
Jarkko Sakkinen Oct. 23, 2021, 1:02 a.m. UTC | #2
On Mon, 2021-10-18 at 16:35 +0200, Greg Kroah-Hartman wrote:
> > +               ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
> 
> A huge hint, if a driver has to call a sysfs_* call, something is wrong.
> 
> Something is wrong here.
> 
> Why are you messing around with a kobject?  This is a device, that you
> control, you can just set the default attribute group for it and then
> the driver core will add and remove the sysfs group at the proper time,
> in the proper way.  Right now you are racing userspace and loosing.
> 
> Use the default group list, that is what it is there for.

I used sysfs_create_group() because node_devices is not owned by SGX
code. It is managed in drivers/base/node.c, and also initialized before
SGX.

/Jarkko
Greg KH Oct. 23, 2021, 6:33 a.m. UTC | #3
On Sat, Oct 23, 2021 at 04:02:48AM +0300, Jarkko Sakkinen wrote:
> On Mon, 2021-10-18 at 16:35 +0200, Greg Kroah-Hartman wrote:
> > > +               ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
> > 
> > A huge hint, if a driver has to call a sysfs_* call, something is wrong.
> > 
> > Something is wrong here.
> > 
> > Why are you messing around with a kobject?  This is a device, that you
> > control, you can just set the default attribute group for it and then
> > the driver core will add and remove the sysfs group at the proper time,
> > in the proper way.  Right now you are racing userspace and loosing.
> > 
> > Use the default group list, that is what it is there for.
> 
> I used sysfs_create_group() because node_devices is not owned by SGX
> code. It is managed in drivers/base/node.c, and also initialized before
> SGX.

Then that is broken, please do not use that device as your code does not
"own" it.  Or fix the logic to be initialized earlier.

thanks,

greg k-h
Jarkko Sakkinen Oct. 24, 2021, 2:24 p.m. UTC | #4
On Sat, 2021-10-23 at 08:33 +0200, Greg Kroah-Hartman wrote:
> On Sat, Oct 23, 2021 at 04:02:48AM +0300, Jarkko Sakkinen wrote:
> > On Mon, 2021-10-18 at 16:35 +0200, Greg Kroah-Hartman wrote:
> > > > +               ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
> > > 
> > > A huge hint, if a driver has to call a sysfs_* call, something is wrong.
> > > 
> > > Something is wrong here.
> > > 
> > > Why are you messing around with a kobject?  This is a device, that you
> > > control, you can just set the default attribute group for it and then
> > > the driver core will add and remove the sysfs group at the proper time,
> > > in the proper way.  Right now you are racing userspace and loosing.
> > > 
> > > Use the default group list, that is what it is there for.
> > 
> > I used sysfs_create_group() because node_devices is not owned by SGX
> > code. It is managed in drivers/base/node.c, and also initialized before
> > SGX.
> 
> Then that is broken, please do not use that device as your code does not
> "own" it.  Or fix the logic to be initialized earlier.

To get a synchronous initialization, I'd need to add the attributes as
part of this declaration:

static struct attribute *node_dev_attrs[] = {
	&dev_attr_cpumap.attr,
	&dev_attr_cpulist.attr,
	&dev_attr_meminfo.attr,
	&dev_attr_numastat.attr,
	&dev_attr_distance.attr,
	&dev_attr_vmstat.attr,
	NULL
};
ATTRIBUTE_GROUPS(node_dev);

That guarantees that the attribute exists at the time when the
node is created, e.g. in that sense this will fix the race with
uevent code.

However, up until sgx_init() has been completed, the attribute
will emit '0'.

If I change sgx_init() from device_initcall() to
core_initcall() (i.e. one before postcore_initcall()) [*], can I
expect these to work:

* node_isset()
* node_set()
* num_possible_nodes()
* numa_node_id()
* next_node_in()

?

Dave, perhaps you know this?

[*] register_node_type() and kobject_event_init() are postcore init
    calls, so this would be non-racy.

> thanks,
> 
> greg k-h

/Jarkko
Greg KH Oct. 25, 2021, 5:15 a.m. UTC | #5
On Sun, Oct 24, 2021 at 05:24:43PM +0300, Jarkko Sakkinen wrote:
> On Sat, 2021-10-23 at 08:33 +0200, Greg Kroah-Hartman wrote:
> > On Sat, Oct 23, 2021 at 04:02:48AM +0300, Jarkko Sakkinen wrote:
> > > On Mon, 2021-10-18 at 16:35 +0200, Greg Kroah-Hartman wrote:
> > > > > +               ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
> > > > 
> > > > A huge hint, if a driver has to call a sysfs_* call, something is wrong.
> > > > 
> > > > Something is wrong here.
> > > > 
> > > > Why are you messing around with a kobject?  This is a device, that you
> > > > control, you can just set the default attribute group for it and then
> > > > the driver core will add and remove the sysfs group at the proper time,
> > > > in the proper way.  Right now you are racing userspace and loosing.
> > > > 
> > > > Use the default group list, that is what it is there for.
> > > 
> > > I used sysfs_create_group() because node_devices is not owned by SGX
> > > code. It is managed in drivers/base/node.c, and also initialized before
> > > SGX.
> > 
> > Then that is broken, please do not use that device as your code does not
> > "own" it.  Or fix the logic to be initialized earlier.
> 
> To get a synchronous initialization, I'd need to add the attributes as
> part of this declaration:
> 
> static struct attribute *node_dev_attrs[] = {
> 	&dev_attr_cpumap.attr,
> 	&dev_attr_cpulist.attr,
> 	&dev_attr_meminfo.attr,
> 	&dev_attr_numastat.attr,
> 	&dev_attr_distance.attr,
> 	&dev_attr_vmstat.attr,
> 	NULL
> };
> ATTRIBUTE_GROUPS(node_dev);
> 
> That guarantees that the attribute exists at the time when the
> node is created, e.g. in that sense this will fix the race with
> uevent code.
> 
> However, up until sgx_init() has been completed, the attribute
> will emit '0'.

Is that a problem?  Who would be wanting to use sgx until that happens?
You have this issue today anyway, right?

> If I change sgx_init() from device_initcall() to
> core_initcall() (i.e. one before postcore_initcall(), can I
> expect these to work:
> 
> * node_isset()
> * node_set()
> * num_possibles_nodes()
> * numa_node_id()
> * next_node_in()
> 
> ?

You should be able to test this out yourself :)

thanks,

greg k-h
Jarkko Sakkinen Oct. 25, 2021, 11:46 p.m. UTC | #6
On Mon, 2021-10-25 at 07:15 +0200, Greg Kroah-Hartman wrote:
> On Sun, Oct 24, 2021 at 05:24:43PM +0300, Jarkko Sakkinen wrote:
> > On Sat, 2021-10-23 at 08:33 +0200, Greg Kroah-Hartman wrote:
> > > On Sat, Oct 23, 2021 at 04:02:48AM +0300, Jarkko Sakkinen wrote:
> > > > On Mon, 2021-10-18 at 16:35 +0200, Greg Kroah-Hartman wrote:
> > > > > > +               ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
> > > > > 
> > > > > A huge hint, if a driver has to call a sysfs_* call, something is wrong.
> > > > > 
> > > > > Something is wrong here.
> > > > > 
> > > > > Why are you messing around with a kobject?  This is a device, that you
> > > > > control, you can just set the default attribute group for it and then
> > > > > the driver core will add and remove the sysfs group at the proper time,
> > > > > in the proper way.  Right now you are racing userspace and loosing.
> > > > > 
> > > > > Use the default group list, that is what it is there for.
> > > > 
> > > > I used sysfs_create_group() because node_devices is not owned by SGX
> > > > code. It is managed in drivers/base/node.c, and also initialized before
> > > > SGX.
> > > 
> > > Then that is broken, please do not use that device as your code does not
> > > "own" it.  Or fix the logic to be initialized earlier.
> > 
> > To get a synchronous initialization, I'd need to add the attributes as
> > part of this declaration:
> > 
> > static struct attribute *node_dev_attrs[] = {
> >         &dev_attr_cpumap.attr,
> >         &dev_attr_cpulist.attr,
> >         &dev_attr_meminfo.attr,
> >         &dev_attr_numastat.attr,
> >         &dev_attr_distance.attr,
> >         &dev_attr_vmstat.attr,
> >         NULL
> > };
> > ATTRIBUTE_GROUPS(node_dev);
> > 
> > That guarantees that the attribute exists at the time when the
> > node is created, e.g. in that sense this will fix the race with
> > uevent code.
> > 
> > However, up until sgx_init() has been completed, the attribute
> > will emit '0'.
> 
> Is that a problem?  Who would be wanting to use sgx until that happens?
> You have this issue today anyway, right?

Yeah, I guess I can just document this (as part of sysfs abi documentation).

Thank you for the feedback.

/Jarkko
diff mbox series

Patch

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 484fc04bcc25..12dc2149e8e0 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -176,3 +176,10 @@  Contact:	Keith Busch <keith.busch@intel.com>
 Description:
 		The cache write policy: 0 for write-back, 1 for write-through,
 		other or unknown.
+
+What:		/sys/devices/system/node/nodeX/sgx/size
+Date:		October 2021
+Contact:	Jarkko Sakkinen <jarkko@kernel.org>
+Description:
+		Total available physical SGX memory, also known as Enclave Page
+		Cache (EPC), in bytes.
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index a6e313f1a82d..dc1d46c51323 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -714,9 +714,11 @@  static bool __init sgx_page_cache_init(void)
 			spin_lock_init(&sgx_numa_nodes[nid].lock);
 			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
 			node_set(nid, sgx_numa_mask);
+			sgx_numa_nodes[nid].size = 0;
 		}
 
 		sgx_epc_sections[i].node =  &sgx_numa_nodes[nid];
+		sgx_numa_nodes[nid].size += size;
 
 		sgx_nr_epc_sections++;
 	}
@@ -790,6 +792,81 @@  int sgx_set_attribute(unsigned long *allowed_attributes,
 }
 EXPORT_SYMBOL_GPL(sgx_set_attribute);
 
+#ifdef CONFIG_NUMA
+static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	unsigned long size = 0;
+	int nid;
+
+	for (nid = 0; nid < num_possible_nodes(); nid++) {
+		if (dev == sgx_numa_nodes[nid].dev) {
+			size = sgx_numa_nodes[nid].size;
+			break;
+		}
+	}
+
+	return sysfs_emit(buf, "%lu\n", size);
+}
+DEVICE_ATTR_RO(size);
+
+static struct attribute *sgx_node_attrs[] = {
+	&dev_attr_size.attr,
+	NULL,
+};
+
+static const struct attribute_group sgx_node_attr_group = {
+	.name = "sgx",
+	.attrs = sgx_node_attrs,
+};
+
+static void sgx_numa_exit(void)
+{
+	struct device *dev;
+	int nid;
+
+	for (nid = 0; nid < num_possible_nodes(); nid++) {
+		dev = &node_devices[nid]->dev;
+		if (dev)
+			sysfs_remove_group(&dev->kobj, &sgx_node_attr_group);
+	}
+}
+
+static bool sgx_numa_init(void)
+{
+	struct sgx_numa_node *node;
+	struct device *dev;
+	int nid;
+	int ret;
+
+	for (nid = 0; nid < num_possible_nodes(); nid++) {
+		if (!sgx_numa_nodes[nid].size)
+			continue;
+
+		node = &sgx_numa_nodes[nid];
+		dev = &node_devices[nid]->dev;
+
+		ret = sysfs_create_group(&dev->kobj, &sgx_node_attr_group);
+		if (ret) {
+			sgx_numa_exit();
+			return false;
+		}
+
+		node->dev = dev;
+	}
+
+	return true;
+}
+#else
+static inline void sgx_numa_exit(void)
+{
+}
+
+static inline bool sgx_numa_init(void)
+{
+	return true;
+}
+#endif /* CONFIG_NUMA */
+
 static int __init sgx_init(void)
 {
 	int ret;
@@ -806,6 +883,11 @@  static int __init sgx_init(void)
 		goto err_reclaimer;
 	}
 
+	if (!sgx_numa_init()) {
+		ret = -ENOMEM;
+		goto err_numa_nodes;
+	}
+
 	ret = misc_register(&sgx_dev_provision);
 	if (ret)
 		goto err_provision;
@@ -829,6 +911,9 @@  static int __init sgx_init(void)
 	misc_deregister(&sgx_dev_provision);
 
 err_provision:
+	sgx_numa_exit();
+
+err_numa_nodes:
 	kthread_stop(ksgxd_tsk);
 
 err_reclaimer:
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..1de8c627a286 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -39,6 +39,8 @@  struct sgx_epc_page {
  */
 struct sgx_numa_node {
 	struct list_head free_page_list;
+	struct device *dev;
+	unsigned long size;
 	spinlock_t lock;
 };