diff mbox series

drivers/perf: riscv_pmu_sbi: add perf_user_access sysctl

Message ID 20220826151556.1708879-1-heiko@sntech.de (mailing list archive)
State New, archived
Headers show
Series drivers/perf: riscv_pmu_sbi: add perf_user_access sysctl | expand

Commit Message

Heiko Stuebner Aug. 26, 2022, 3:15 p.m. UTC
Add a sysctl similar to the one on arm64 to enable/disable
access to counter CSRs from u-mode on RISC-V.

The default is of course set to disabled keeping the current
state of access - to only the TIME CSR.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 Documentation/admin-guide/sysctl/kernel.rst |  6 +--
 drivers/perf/riscv_pmu_sbi.c                | 43 ++++++++++++++++++++-
 2 files changed, 44 insertions(+), 5 deletions(-)

Comments

Conor Dooley Aug. 26, 2022, 5:28 p.m. UTC | #1
Hey Heiko,
Couple comments, mostly nitpicky sorts of things..

On 26/08/2022 16:15, Heiko Stuebner wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
> Add a sysctl similar to the one on arm64 to enable/disable
> access to counter CSRs from u-mode on RISC-V.
> 
> The default is of course set to disabled keeping the current
> state of access - to only the TIME CSR.
> 
> Signed-off-by: Heiko Stuebner <heiko@sntech.de>
> ---
>  Documentation/admin-guide/sysctl/kernel.rst |  6 +--
>  drivers/perf/riscv_pmu_sbi.c                | 43 ++++++++++++++++++++-
>  2 files changed, 44 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index ee6572b1edad..efd4bc385e7a 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -894,15 +894,15 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
>  The default value is 8.
> 
> 
> -perf_user_access (arm64 only)
> -=================================
> +perf_user_access (arm64 and riscv only)
> +=======================================
> 
>  Controls user space access for reading perf event counters. When set to 1,
>  user space can read performance monitor counter registers directly.
> 
>  The default value is 0 (access disabled).
> 
> -See Documentation/arm64/perf.rst for more information.
> +See Documentation/arm64/perf.rst for more information on arm64

Code aside, this reads somewhat confusingly. How come riscv doesn't have
further information to offer?

Even if we don't have anything from a riscv point of view, it would read
better if this statement mentioned what it is about arm64 that requires
more information. Something like "for more information on arm64 only
fetures x, y & z".

> 
> 
>  pid_max
> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> index 6f6681bbfd36..7aab8d673357 100644
> --- a/drivers/perf/riscv_pmu_sbi.c
> +++ b/drivers/perf/riscv_pmu_sbi.c
> @@ -41,6 +41,8 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
>         NULL,
>  };
> 
> +static int sysctl_perf_user_access __read_mostly;
> +
>  /*
>   * RISC-V doesn't have hetergenous harts yet. This need to be part of

Not in your changes, but s/hetergenous/heterogeneous

>   * per_cpu in case of harts with different pmu counters
> @@ -640,13 +642,22 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>         return IRQ_HANDLED;
>  }
> 
> +/*
> + * Depending on the perf_user_access setting, enable the access
> + * from usermode either for all counters or for TIME csr only.
> + */
> +static void riscv_pmu_update_user_access(void *info)
> +{
> +       csr_write(CSR_SCOUNTEREN, sysctl_perf_user_access ? GENMASK(31, 0) :
> +                                                           0x2);

This file does not comply to 80 character limits to begin with, so I
think this would be better on an 82 character line than broken up.

> +}
> +
>  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>  {
>         struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
>         struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
> 
> -       /* Enable the access for TIME csr only from the user mode now */
> -       csr_write(CSR_SCOUNTEREN, 0x2);
> +       riscv_pmu_update_user_access(NULL);
> 
>         /* Stop all the counters so that they can be enabled from perf */
>         pmu_sbi_stop_all(pmu);
> @@ -785,6 +796,32 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
>         cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
>  }
> 
> +static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
> +                       int write, void *buffer, size_t *lenp, loff_t *ppos)

I assume checkpatch will whinge about the alignment here.

> +{
> +       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +
> +       if (ret || !write)
> +               return ret;
> +
> +       on_each_cpu(riscv_pmu_update_user_access, NULL, 1);
> +
> +       return 0;
> +}
> +
> +static struct ctl_table sbi_pmu_sysctl_table[] = {
> +       {
> +               .procname       = "perf_user_access",
> +               .data           = &sysctl_perf_user_access,
> +               .maxlen         = sizeof(unsigned int),
> +               .mode           = 0644,

Mixed use of whitespace here, some are spaces and some tabs..

Thanks,
Conor.

> +               .proc_handler   = riscv_pmu_proc_user_access_handler,
> +               .extra1         = SYSCTL_ZERO,
> +               .extra2         = SYSCTL_ONE,
> +       },
> +       { }
> +};
> +
>  static int pmu_sbi_device_probe(struct platform_device *pdev)
>  {
>         struct riscv_pmu *pmu = NULL;
> @@ -834,6 +871,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>         if (ret)
>                 goto out_unregister;
> 
> +       register_sysctl("kernel", sbi_pmu_sysctl_table);
> +
>         return 0;
> 
>  out_unregister:
> --
> 2.35.1
> 
> 
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
Atish Patra Sept. 29, 2022, 8:11 a.m. UTC | #2
On Fri, Aug 26, 2022 at 3:10 PM Atish Patra <atishp@atishpatra.org> wrote:
>
>
>
> On Fri, Aug 26, 2022 at 8:16 AM Heiko Stuebner <heiko@sntech.de> wrote:
>>
>> Add a sysctl similar to the one on arm64 to enable/disable
>> access to counter CSRs from u-mode on RISC-V.
>>
>> The default is of course set to disabled keeping the current
>> state of access - to only the TIME CSR.
>>
>> Signed-off-by: Heiko Stuebner <heiko@sntech.de>
>> ---
>>  Documentation/admin-guide/sysctl/kernel.rst |  6 +--
>>  drivers/perf/riscv_pmu_sbi.c                | 43 ++++++++++++++++++++-
>>  2 files changed, 44 insertions(+), 5 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
>> index ee6572b1edad..efd4bc385e7a 100644
>> --- a/Documentation/admin-guide/sysctl/kernel.rst
>> +++ b/Documentation/admin-guide/sysctl/kernel.rst
>> @@ -894,15 +894,15 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
>>  The default value is 8.
>>
>>
>> -perf_user_access (arm64 only)
>> -=================================
>> +perf_user_access (arm64 and riscv only)
>> +=======================================
>>
>>  Controls user space access for reading perf event counters. When set to 1,
>>  user space can read performance monitor counter registers directly.
>>
>>  The default value is 0 (access disabled).
>>
>> -See Documentation/arm64/perf.rst for more information.
>> +See Documentation/arm64/perf.rst for more information on arm64
>>
>>
>>  pid_max
>> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
>> index 6f6681bbfd36..7aab8d673357 100644
>> --- a/drivers/perf/riscv_pmu_sbi.c
>> +++ b/drivers/perf/riscv_pmu_sbi.c
>> @@ -41,6 +41,8 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
>>         NULL,
>>  };
>>
>> +static int sysctl_perf_user_access __read_mostly;
>> +
>>  /*
>>   * RISC-V doesn't have hetergenous harts yet. This need to be part of
>>   * per_cpu in case of harts with different pmu counters
>> @@ -640,13 +642,22 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>>         return IRQ_HANDLED;
>>  }
>>
>> +/*
>> + * Depending on the perf_user_access setting, enable the access
>> + * from usermode either for all counters or for TIME csr only.
>> + */
>> +static void riscv_pmu_update_user_access(void *info)
>> +{
>> +       csr_write(CSR_SCOUNTEREN, sysctl_perf_user_access ? GENMASK(31, 0) :
>> +                                                           0x2);
>> +}
>> +
>>  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>>  {
>>         struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
>>         struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
>>
>> -       /* Enable the access for TIME csr only from the user mode now */
>> -       csr_write(CSR_SCOUNTEREN, 0x2);
>> +       riscv_pmu_update_user_access(NULL);
>>
>>         /* Stop all the counters so that they can be enabled from perf */
>>         pmu_sbi_stop_all(pmu);
>> @@ -785,6 +796,32 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
>>         cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
>>  }
>>
>> +static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
>> +                       int write, void *buffer, size_t *lenp, loff_t *ppos)
>> +{
>> +       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>> +
>> +       if (ret || !write)
>> +               return ret;
>> +
>> +       on_each_cpu(riscv_pmu_update_user_access, NULL, 1);
>> +
>
>
> This enables all the counter access from the user space. Not the ones that were being monitored through perf system calls.
> Looking at ARM64 code, they clear out all other counters during the start to avoid the leakage.
>
> This is a bit expensive to achieve with the current SBI interface as there is no simpler interface to clear out the mhpmcounter value without starting it.
> We have to start the other counters with 0 value and stop it immediately which is very ugly.
>
> There is a supervisor counter delegation proposal that is in discussion which can write the mhpmevent/mhpmcounter (via a different CSR) directly from S-mode.
> Once that is in place, we can enable such functionality.
>
> I am not sure how commonly this feature is used. Personally, I would prefer to not enable this until supervisor counter delegation is implemented.
> But I would like to hear what others think.
>
> If it is a very common feature that RISC-V should support immediately, we have to resort to the ugly route of enabling/disabling the unused counters.
>

I just realized my above response was never delivered to the mailing
list. In a way that's a good thing as I am trying to retract my above
opinion :)
There are some discussions around enabling cycle/instret for the user
space unconditionally [1] and a patch from palmer[2]
To allow userspace to selectively enable the cycle counter, we need
this patch. Thus, it should be merged asap.

I am still not sure how big the problem is for exposing the previously
used counter values. We can have separate discussions on that and a
follow up patch if required.

[1] https://lore.kernel.org/all/YxEhC%2FmDW1lFt36J@aurel32.net/
[2] https://lore.kernel.org/linux-riscv/YzS0RNzH2CprLSyc@spud/T/#ma8e0034fa9172a66820d6d5fbc3314619ff657a5


>> +       return 0;
>> +}
>> +
>> +static struct ctl_table sbi_pmu_sysctl_table[] = {
>> +       {
>> +               .procname       = "perf_user_access",
>> +               .data           = &sysctl_perf_user_access,
>> +               .maxlen         = sizeof(unsigned int),
>> +               .mode           = 0644,
>> +               .proc_handler   = riscv_pmu_proc_user_access_handler,
>> +               .extra1         = SYSCTL_ZERO,
>> +               .extra2         = SYSCTL_ONE,
>> +       },
>> +       { }
>> +};
>> +
>>  static int pmu_sbi_device_probe(struct platform_device *pdev)
>>  {
>>         struct riscv_pmu *pmu = NULL;
>> @@ -834,6 +871,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>>         if (ret)
>>                 goto out_unregister;
>>
>> +       register_sysctl("kernel", sbi_pmu_sysctl_table);
>> +
>>         return 0;
>>
>>  out_unregister:
>> --
>> 2.35.1
>>
>
>
> --
> Regards,
> Atish



--
Regards,
Atish
Palmer Dabbelt Oct. 7, 2022, 2:26 a.m. UTC | #3
On Fri, 26 Aug 2022 08:15:56 PDT (-0700), heiko@sntech.de wrote:
> Add a sysctl similar to the one on arm64 to enable/disable
> access to counter CSRs from u-mode on RISC-V.
>
> The default is of course set to disabled keeping the current
> state of access - to only the TIME CSR.

Sorry for being slow on this one, but IMO this is the wrong way to go: 
this was pretty clearly described by the PDFs as a non-optional 
instruction when we committed to uABI stability, and there's userspace 
binaries that use these instructions.  I know the ISA folks changed 
their minds about these being in the base, but that doesn't mean we can 
break userspace.

If you're worried about a side channel from the high resolution timers 
that makes sense, but we can sort that out without breaking userspace: 
we just trap the counter accesses and handle them in the kernel with 
less precision.  We'll need to do that in the long run anyway, as 
there's no way to make sure these are implement.  That all applies to 
the time counter as well.

I'd also argue this should be a prctl(), as that'll allow users to flip 
it on/off for specific processes.  We can make it sticky to deal with 
the side channels, but at least starting with a per-process flag will 
let us avoid breaking compatibility for everyone.

> Signed-off-by: Heiko Stuebner <heiko@sntech.de>
> ---
>  Documentation/admin-guide/sysctl/kernel.rst |  6 +--
>  drivers/perf/riscv_pmu_sbi.c                | 43 ++++++++++++++++++++-
>  2 files changed, 44 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index ee6572b1edad..efd4bc385e7a 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -894,15 +894,15 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
>  The default value is 8.
>
>
> -perf_user_access (arm64 only)
> -=================================
> +perf_user_access (arm64 and riscv only)
> +=======================================
>
>  Controls user space access for reading perf event counters. When set to 1,
>  user space can read performance monitor counter registers directly.
>
>  The default value is 0 (access disabled).
>
> -See Documentation/arm64/perf.rst for more information.
> +See Documentation/arm64/perf.rst for more information on arm64
>
>
>  pid_max
> diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
> index 6f6681bbfd36..7aab8d673357 100644
> --- a/drivers/perf/riscv_pmu_sbi.c
> +++ b/drivers/perf/riscv_pmu_sbi.c
> @@ -41,6 +41,8 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = {
>  	NULL,
>  };
>
> +static int sysctl_perf_user_access __read_mostly;
> +
>  /*
>   * RISC-V doesn't have hetergenous harts yet. This need to be part of
>   * per_cpu in case of harts with different pmu counters
> @@ -640,13 +642,22 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
>  	return IRQ_HANDLED;
>  }
>
> +/*
> + * Depending on the perf_user_access setting, enable the access
> + * from usermode either for all counters or for TIME csr only.
> + */
> +static void riscv_pmu_update_user_access(void *info)
> +{
> +	csr_write(CSR_SCOUNTEREN, sysctl_perf_user_access ? GENMASK(31, 0) :
> +							    0x2);
> +}
> +
>  static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
>  {
>  	struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
>  	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
>
> -	/* Enable the access for TIME csr only from the user mode now */
> -	csr_write(CSR_SCOUNTEREN, 0x2);
> +	riscv_pmu_update_user_access(NULL);
>
>  	/* Stop all the counters so that they can be enabled from perf */
>  	pmu_sbi_stop_all(pmu);
> @@ -785,6 +796,32 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu)
>  	cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
>  }
>
> +static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
> +			int write, void *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +
> +	if (ret || !write)
> +		return ret;
> +
> +	on_each_cpu(riscv_pmu_update_user_access, NULL, 1);
> +
> +	return 0;
> +}
> +
> +static struct ctl_table sbi_pmu_sysctl_table[] = {
> +	{
> +		.procname       = "perf_user_access",
> +		.data		= &sysctl_perf_user_access,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode           = 0644,
> +		.proc_handler	= riscv_pmu_proc_user_access_handler,
> +		.extra1		= SYSCTL_ZERO,
> +		.extra2		= SYSCTL_ONE,
> +	},
> +	{ }
> +};
> +
>  static int pmu_sbi_device_probe(struct platform_device *pdev)
>  {
>  	struct riscv_pmu *pmu = NULL;
> @@ -834,6 +871,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
>  	if (ret)
>  		goto out_unregister;
>
> +	register_sysctl("kernel", sbi_pmu_sysctl_table);
> +
>  	return 0;
>
>  out_unregister:
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index ee6572b1edad..efd4bc385e7a 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -894,15 +894,15 @@  enabled, otherwise writing to this file will return ``-EBUSY``.
 The default value is 8.
 
 
-perf_user_access (arm64 only)
-=================================
+perf_user_access (arm64 and riscv only)
+=======================================
 
 Controls user space access for reading perf event counters. When set to 1,
 user space can read performance monitor counter registers directly.
 
 The default value is 0 (access disabled).
 
-See Documentation/arm64/perf.rst for more information.
+See Documentation/arm64/perf.rst for more information on arm64
 
 
 pid_max
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 6f6681bbfd36..7aab8d673357 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -41,6 +41,8 @@  static const struct attribute_group *riscv_pmu_attr_groups[] = {
 	NULL,
 };
 
+static int sysctl_perf_user_access __read_mostly;
+
 /*
  * RISC-V doesn't have hetergenous harts yet. This need to be part of
  * per_cpu in case of harts with different pmu counters
@@ -640,13 +642,22 @@  static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
+/*
+ * Depending on the perf_user_access setting, enable the access
+ * from usermode either for all counters or for TIME csr only.
+ */
+static void riscv_pmu_update_user_access(void *info)
+{
+	csr_write(CSR_SCOUNTEREN, sysctl_perf_user_access ? GENMASK(31, 0) :
+							    0x2);
+}
+
 static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 {
 	struct riscv_pmu *pmu = hlist_entry_safe(node, struct riscv_pmu, node);
 	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
 
-	/* Enable the access for TIME csr only from the user mode now */
-	csr_write(CSR_SCOUNTEREN, 0x2);
+	riscv_pmu_update_user_access(NULL);
 
 	/* Stop all the counters so that they can be enabled from perf */
 	pmu_sbi_stop_all(pmu);
@@ -785,6 +796,32 @@  static void riscv_pmu_destroy(struct riscv_pmu *pmu)
 	cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
 }
 
+static int riscv_pmu_proc_user_access_handler(struct ctl_table *table,
+			int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	on_each_cpu(riscv_pmu_update_user_access, NULL, 1);
+
+	return 0;
+}
+
+static struct ctl_table sbi_pmu_sysctl_table[] = {
+	{
+		.procname       = "perf_user_access",
+		.data		= &sysctl_perf_user_access,
+		.maxlen		= sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= riscv_pmu_proc_user_access_handler,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
 static int pmu_sbi_device_probe(struct platform_device *pdev)
 {
 	struct riscv_pmu *pmu = NULL;
@@ -834,6 +871,8 @@  static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (ret)
 		goto out_unregister;
 
+	register_sysctl("kernel", sbi_pmu_sysctl_table);
+
 	return 0;
 
 out_unregister: